gpu-benchmarking
You are gpu-benchmarking - a specialized skill for automated GPU performance benchmarking and regression detection. This skill provides expert capabilities for measuring, analyzing, and tracking GPU kernel performance over time.
Overview
This skill enables AI-powered GPU benchmarking operations including:
- Designing micro-benchmarks for kernel operations
- Measuring kernel execution time with CUDA events
- Calculating achieved vs theoretical performance
- Generating performance comparison reports
- Detecting performance regressions in CI/CD
- Profiling power and thermal characteristics
- Benchmarking memory bandwidth and latency
- Creating reproducible benchmark configurations
Prerequisites
- NVIDIA CUDA Toolkit 11.0+
- GPU with performance counters support
- nvidia-smi for power/thermal monitoring
- Optional: Nsight Systems/Compute for detailed profiling
- CI/CD system for regression tracking
Capabilities
1. CUDA Event Timing
Precise kernel execution time measurement:
// Benchmark timing wrapper: one untimed warm-up launch, then NUM_ITERATIONS
// back-to-back launches bracketed by a single pair of CUDA events.
cudaEvent_t evBegin, evEnd;
cudaEventCreate(&evBegin);
cudaEventCreate(&evEnd);

// Untimed first launch; the device-wide sync keeps it out of the timed window.
myKernel<<<grid, block>>>(args);
cudaDeviceSynchronize();

// Timed window: everything enqueued between the two event records.
cudaEventRecord(evBegin);
int launch = 0;
while (launch < NUM_ITERATIONS) {
    myKernel<<<grid, block>>>(args);
    ++launch;
}
cudaEventRecord(evEnd);
cudaEventSynchronize(evEnd);  // block the host until the stop event has completed

float totalMs = 0;
cudaEventElapsedTime(&totalMs, evBegin, evEnd);

// Per-launch average, then bytes-per-second expressed in GB/s.
const float avg_ms = totalMs / NUM_ITERATIONS;
const auto gigabytes = data_size_bytes / 1e9;
const float seconds = avg_ms / 1000;
printf("Average kernel time: %.3f ms\n", avg_ms);
printf("Throughput: %.2f GB/s\n", gigabytes / seconds);

cudaEventDestroy(evBegin);
cudaEventDestroy(evEnd);
2. Comprehensive Benchmark Framework
#include <cuda_runtime.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <type_traits>
#include <vector>
// Summary statistics for one benchmarked kernel.
// Every field defaults to zero so a default-constructed result (for example
// one returned after a failed run) never exposes indeterminate values.
// Note: default member initializers keep this an aggregate in C++14 and later.
struct BenchmarkResult {
    float min_ms = 0.0f;           // fastest timed sample, in milliseconds
    float max_ms = 0.0f;           // slowest timed sample, in milliseconds
    float mean_ms = 0.0f;          // arithmetic mean of the samples
    float median_ms = 0.0f;        // median of the samples
    float stddev_ms = 0.0f;        // standard deviation of the samples
    float throughput_gbps = 0.0f;  // data_bytes per second, in GB/s (derived from the median)
    float achieved_flops = 0.0f;   // stored in TFLOPS despite the name (flop_count / 1e12 per second)
    int iterations = 0;            // number of timed samples
};
// Launches a raw, argument-less __global__ function pointer with the
// caller-supplied launch configuration.
template <typename KernelFunc>
static void benchmark_launch(KernelFunc& kernel, dim3 grid, dim3 block, std::true_type) {
    kernel<<<grid, block>>>();
}

// Invokes a host callable (lambda/functor) that performs its own launch.
template <typename KernelFunc>
static void benchmark_launch(KernelFunc& kernel, dim3, dim3, std::false_type) {
    kernel();
}

// Times `kernel` with CUDA events and returns summary statistics.
//
// kernel      Either a pointer to a __global__ function taking no arguments
//             (launched here as kernel<<<grid, block>>>()), or a host callable
//             that launches the kernel itself; in the latter case grid/block
//             are not used for the launch.
// data_bytes  Bytes moved per launch, used for throughput_gbps.
// flop_count  Floating-point operations per launch, used for achieved_flops
//             (reported in TFLOPS).
// warmup      Untimed launches executed before sampling.
// iterations  Number of individually timed launches.
//
// On any CUDA error, or when iterations <= 0, a zeroed result with
// iterations == 0 is returned and the error is printed to stderr, so a failed
// launch can never masquerade as a near-zero (i.e. "very fast") timing.
template <typename KernelFunc>
BenchmarkResult benchmark_kernel(
    KernelFunc kernel,
    dim3 grid, dim3 block,
    size_t data_bytes,
    size_t flop_count,
    int warmup = 10,
    int iterations = 100
) {
    typedef std::integral_constant<
        bool,
        std::is_pointer<KernelFunc>::value &&
            std::is_function<typename std::remove_pointer<KernelFunc>::type>::value>
        is_raw_kernel;

    BenchmarkResult result = BenchmarkResult();  // value-initialized: all fields zero
    if (iterations <= 0) {
        return result;
    }

    cudaEvent_t start, stop;
    cudaError_t status = cudaEventCreate(&start);
    if (status != cudaSuccess) {
        fprintf(stderr, "benchmark_kernel: CUDA error: %s\n", cudaGetErrorString(status));
        return result;
    }
    status = cudaEventCreate(&stop);
    if (status != cudaSuccess) {
        fprintf(stderr, "benchmark_kernel: CUDA error: %s\n", cudaGetErrorString(status));
        cudaEventDestroy(start);
        return result;
    }

    // Warm-up
    for (int i = 0; i < warmup; i++) {
        benchmark_launch(kernel, grid, block, is_raw_kernel());
    }
    status = cudaGetLastError();          // launch-configuration errors
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize(); // asynchronous execution errors
    }

    // Collect timing samples
    std::vector<float> times;
    times.reserve(iterations);
    for (int i = 0; status == cudaSuccess && i < iterations; i++) {
        cudaEventRecord(start);
        benchmark_launch(kernel, grid, block, is_raw_kernel());
        cudaEventRecord(stop);
        status = cudaEventSynchronize(stop);
        // Errors are queried only after the stop event so that no extra host
        // work sits between the launch and the stop record.
        if (status == cudaSuccess) {
            status = cudaGetLastError();
        }
        float elapsed_ms = 0.0f;
        if (status == cudaSuccess) {
            status = cudaEventElapsedTime(&elapsed_ms, start, stop);
        }
        if (status == cudaSuccess) {
            times.push_back(elapsed_ms);
        }
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (status != cudaSuccess) {
        fprintf(stderr, "benchmark_kernel: CUDA error: %s\n", cudaGetErrorString(status));
        return result;
    }

    // Calculate statistics
    std::sort(times.begin(), times.end());
    const size_t count = times.size();
    result.iterations = static_cast<int>(count);
    result.min_ms = times.front();
    result.max_ms = times.back();

    // Even sample counts average the two middle values.
    const size_t mid = count / 2;
    result.median_ms = (count % 2 == 0) ? 0.5f * (times[mid - 1] + times[mid])
                                        : times[mid];

    // Two-pass mean/variance in double: the one-pass E[x^2] - mean^2 form can
    // go slightly negative in float and turn the square root into NaN.
    double sum = 0.0;
    for (float t : times) {
        sum += t;
    }
    const double mean = sum / count;
    double sq_dev = 0.0;
    for (float t : times) {
        const double d = t - mean;
        sq_dev += d * d;
    }
    result.mean_ms = static_cast<float>(mean);
    result.stddev_ms = static_cast<float>(std::sqrt(sq_dev / count));

    // Rates are derived from the median; left at zero if the median is zero.
    if (result.median_ms > 0.0f) {
        result.throughput_gbps = (data_bytes / 1e9) / (result.median_ms / 1000);
        result.achieved_flops = (flop_count / 1e12) / (result.median_ms / 1000); // TFLOPS
    }

    return result;
}
3. Roofline Model Analysis
Calculate theoretical vs achieved performance:
// Roofline-model summary for one kernel on one device.
// Every field defaults to zero/false so that a field a producer does not
// write is never read as an indeterminate value.
// Note: default member initializers keep this an aggregate in C++14 and later.
struct RooflineMetrics {
    // Hardware limits
    float peak_memory_bandwidth_gbps = 0.0f;  // GB/s
    float peak_flops_tflops = 0.0f;           // TFLOPS

    // Kernel characteristics
    float arithmetic_intensity = 0.0f;        // FLOPs per byte accessed
    float achieved_flops_tflops = 0.0f;       // TFLOPS
    float achieved_bandwidth_gbps = 0.0f;     // GB/s

    // Efficiency
    float compute_efficiency = 0.0f;          // % of peak FLOPS
    float bandwidth_efficiency = 0.0f;        // % of peak bandwidth
    bool is_compute_bound = false;            // arithmetic intensity above the ridge point
};
// FP32 lanes per SM for a compute capability (same mapping as the CUDA
// samples' _ConvertSMVer2Cores). Compute capabilities not listed explicitly
// fall back to 128 -- NOTE(review): confirm for architectures newer than 9.x.
static int fp32_cores_per_sm(int major, int minor) {
    switch (major) {
        case 3: return 192;
        case 5: return 128;
        case 6: return (minor == 0) ? 64 : 128;
        case 7: return 64;
        case 8: return (minor == 0) ? 64 : 128;
        default: return 128;
    }
}

// Places a measured kernel on the roofline model of the given device.
//
// result          Measured performance; achieved_flops (TFLOPS) and
//                 throughput_gbps (GB/s) are copied into the metrics.
// flop_count      Floating-point operations performed per launch.
// bytes_accessed  Bytes read plus written per launch.
// props           Device properties supplying clocks, bus width and SM count.
//                 NOTE(review): clockRate / memoryClockRate are deprecated in
//                 recent CUDA toolkits; switch to cudaDeviceGetAttribute if the
//                 target toolkit no longer provides them.
//
// Returns metrics with BOTH efficiency percentages filled in;
// is_compute_bound tells which of the two is the binding limit.
RooflineMetrics calculate_roofline(
    BenchmarkResult& result,
    size_t flop_count,
    size_t bytes_accessed,
    cudaDeviceProp& props
) {
    RooflineMetrics metrics;

    // Peak bandwidth in GB/s: bus width (bits -> bytes) x memory clock
    // (kHz -> GHz) x 2 transfers per clock (DDR).
    metrics.peak_memory_bandwidth_gbps =
        (props.memoryBusWidth / 8.0) * (props.memoryClockRate / 1e6) * 2; // DDR

    // Peak FP32 rate in TFLOPS: SMs x FP32 lanes per SM x core clock
    // (kHz -> THz) x 2 FLOPs per FMA. Evaluated in double because the integer
    // product overflows a 32-bit int. maxThreadsPerMultiProcessor is the
    // resident-thread limit, not the number of arithmetic lanes.
    metrics.peak_flops_tflops =
        static_cast<double>(props.multiProcessorCount) *
        fp32_cores_per_sm(props.major, props.minor) *
        (props.clockRate / 1e9) * 2; // FMA = 2 FLOPS

    // Arithmetic intensity in FLOPs per byte. With no memory traffic it is
    // unbounded for non-zero work and zero for an empty kernel.
    if (bytes_accessed > 0) {
        metrics.arithmetic_intensity = (float)flop_count / bytes_accessed;
    } else {
        metrics.arithmetic_intensity = (flop_count > 0) ? HUGE_VALF : 0.0f;
    }

    // Achieved performance
    metrics.achieved_flops_tflops = result.achieved_flops;
    metrics.achieved_bandwidth_gbps = result.throughput_gbps;

    // Ridge point in FLOPs per byte. TFLOPS / (GB/s) is a factor of 1000 short
    // of FLOP/byte (1e12 / 1e9), hence the explicit scaling.
    float ridge_point = 0.0f;
    if (metrics.peak_memory_bandwidth_gbps > 0.0f) {
        ridge_point = static_cast<float>(
            (metrics.peak_flops_tflops * 1000.0) / metrics.peak_memory_bandwidth_gbps);
    }
    metrics.is_compute_bound = metrics.arithmetic_intensity > ridge_point;

    // Both efficiencies are always written so neither field is left unset.
    metrics.compute_efficiency = (metrics.peak_flops_tflops > 0.0f)
        ? (metrics.achieved_flops_tflops / metrics.peak_flops_tflops) * 100
        : 0.0f;
    metrics.bandwidth_efficiency = (metrics.peak_memory_bandwidth_gbps > 0.0f)
        ? (metrics.achieved_bandwidth_gbps / metrics.peak_memory_bandwidth_gbps) * 100
        : 0.0f;

    return metrics;
}
4. Memory Bandwidth Benchmark
// Global memory bandwidth test: copies n floats from src to dst.
// Only the x dimension of the launch configuration is used. Each thread
// starts at its flat global index and advances by the total number of
// launched threads, so any 1D grid size covers all n elements.
__global__ void bandwidthTestCopy(float* dst, const float* src, size_t n) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise computed
    // in 32 bits and can wrap for large grids.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t i = idx; i < n; i += stride) {
        dst[i] = src[i];
    }
}
// Read-only bandwidth test: every thread sums its share of src's n floats.
// Only the x dimension of the launch configuration is used. The thread with
// global index 0 stores its partial sum to dst[0]; no other element of dst is
// written, and the other threads' partial sums are discarded.
__global__ void bandwidthTestRead(float* dst, const float* src, size_t n) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise computed
    // in 32 bits and can wrap for large grids.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    float sum = 0.0f;
    for (size_t i = idx; i < n; i += stride) {
        sum += src[i];
    }
    // Prevent optimization: the store keeps the loads observable.
    // NOTE(review): only thread 0's result is consumed; verify in the generated
    // code that the loads of the remaining threads are not eliminated.
    if (idx == 0) dst[0] = sum;
}
// Measures and prints global-memory copy and read bandwidth.
//
// size_mb  Size of each of the two device buffers, in MiB.
//
// Allocates a source and a destination buffer, zero-fills the source so the
// kernels never read uninitialized memory, runs both bandwidth kernels through
// benchmark_kernel and prints the resulting GB/s figures. If allocation or
// initialization fails, the error is printed to stderr, anything already
// allocated is released, and nothing is measured.
void benchmark_memory_bandwidth(size_t size_mb) {
    const size_t size = size_mb * 1024 * 1024;
    const size_t n = size / sizeof(float);

    float* d_src = nullptr;
    float* d_dst = nullptr;
    cudaError_t status = cudaMalloc(&d_src, size);
    if (status == cudaSuccess) {
        status = cudaMalloc(&d_dst, size);
    }
    if (status == cudaSuccess) {
        status = cudaMemset(d_src, 0, size);
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "benchmark_memory_bandwidth: CUDA error: %s\n",
                cudaGetErrorString(status));
        // cudaFree on a null pointer is a no-op, so both calls are safe here.
        cudaFree(d_src);
        cudaFree(d_dst);
        return;
    }

    int blocks = 256;
    int threads = 256;

    // Copy bandwidth (read + write)
    auto copy_result = benchmark_kernel(
        [=]() { bandwidthTestCopy<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size * 2, // Read + Write
        0
    );
    printf("Copy Bandwidth: %.2f GB/s\n", copy_result.throughput_gbps);

    // Read bandwidth
    auto read_result = benchmark_kernel(
        [=]() { bandwidthTestRead<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size, // Read only
        0
    );
    printf("Read Bandwidth: %.2f GB/s\n", read_result.throughput_gbps);

    cudaFree(d_src);
    cudaFree(d_dst);
}
5. Latency Benchmark
// Memory latency measurement using pointer chasing.
// Starting at index 0, follows `iterations` dependent loads through `ptr`
// (each loaded value is the next index to load) and stores the final index
// to *result. Every thread that is launched runs the whole chain.
__global__ void pointerChase(int* ptr, int* result, int iterations) {
    int cursor = 0;
    for (int remaining = iterations; remaining > 0; --remaining) {
        cursor = ptr[cursor];
    }
    // Storing the final index keeps the chain of loads from being optimized away.
    *result = cursor;
}
// Estimates the latency of a dependent global-memory load, in nanoseconds.
//
// Builds a single random cycle over N ints on the host, copies it to the
// device and times a one-thread pointerChase launch with CUDA events. The
// result is total elapsed time divided by the number of chained loads.
//
// Returns the per-load latency in ns, or -1.0f if any CUDA call fails (the
// error is printed to stderr).
float measure_memory_latency() {
    const int N = 1024 * 1024; // 4MB

    // Create random chase pattern: visiting `indices` in order and linking
    // each entry to the next yields one cycle through all N slots. Element 0
    // is left in place because the kernel starts chasing at index 0.
    std::vector<int> indices(N);
    std::iota(indices.begin(), indices.end(), 0);
    std::mt19937 rng(12345u);  // fixed seed keeps the pattern reproducible
    std::shuffle(indices.begin() + 1, indices.end(), rng);

    std::vector<int> h_ptr(N);
    for (int i = 0; i < N - 1; i++) {
        h_ptr[indices[i]] = indices[i + 1];
    }
    h_ptr[indices[N - 1]] = indices[0];

    int* d_ptr = nullptr;
    int* d_result = nullptr;
    cudaEvent_t start = nullptr;
    cudaEvent_t stop = nullptr;

    cudaError_t status = cudaMalloc(&d_ptr, N * sizeof(int));
    if (status == cudaSuccess) status = cudaMalloc(&d_result, sizeof(int));
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_ptr, h_ptr.data(), N * sizeof(int), cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) status = cudaEventCreate(&start);
    if (status == cudaSuccess) status = cudaEventCreate(&stop);

    // Warm-up with a single load so one-time launch costs are paid before the
    // timed window while touching as little of the chain as possible.
    if (status == cudaSuccess) {
        pointerChase<<<1, 1>>>(d_ptr, d_result, 1);
        status = cudaGetLastError();
        if (status == cudaSuccess) status = cudaDeviceSynchronize();
    }

    // Measure latency
    const int ITERATIONS = 10000;
    float ms = 0.0f;
    if (status == cudaSuccess) {
        cudaEventRecord(start);
        pointerChase<<<1, 1>>>(d_ptr, d_result, ITERATIONS);
        cudaEventRecord(stop);
        status = cudaEventSynchronize(stop);
        if (status == cudaSuccess) status = cudaGetLastError();
        if (status == cudaSuccess) status = cudaEventElapsedTime(&ms, start, stop);
    }

    // Release everything that was created, on success and failure alike.
    if (start) cudaEventDestroy(start);
    if (stop) cudaEventDestroy(stop);
    cudaFree(d_ptr);
    cudaFree(d_result);

    if (status != cudaSuccess) {
        fprintf(stderr, "measure_memory_latency: CUDA error: %s\n",
                cudaGetErrorString(status));
        return -1.0f;
    }

    float latency_ns = (ms * 1e6) / ITERATIONS;
    return latency_ns;
}
6. Power and Thermal Monitoring
#!/bin/bash
# power_monitor.sh - Monitor GPU power during benchmark
#
# Usage: power_monitor.sh "<benchmark command>"
#
# Samples power, temperature and utilization with nvidia-smi every 100 ms
# while the benchmark command runs, logs the samples to power_log.csv, and
# prints average/peak power and average temperature afterwards.

BENCHMARK_CMD=$1
LOG_FILE="power_log.csv"

echo "timestamp,power_w,temp_c,gpu_util,mem_util" > "$LOG_FILE"

# Start power monitoring in background.
# -lms takes milliseconds (-l takes whole seconds); nounits strips the
# " W" / " %" suffixes so the fields are plain numbers for awk.
nvidia-smi --query-gpu=timestamp,power.draw,temperature.gpu,utilization.gpu,utilization.memory \
    --format=csv,noheader,nounits -lms 100 >> "$LOG_FILE" &
MONITOR_PID=$!

# Run benchmark
eval "$BENCHMARK_CMD"

# Stop monitoring and wait for the sampler to exit so the log is complete
kill "$MONITOR_PID" 2>/dev/null
wait "$MONITOR_PID" 2>/dev/null

# Generate report
echo "=== Power Analysis ==="
awk -F',' '
NR>1 {
    # "+ 0" forces numeric interpretation of the field
    p = $2 + 0
    t = $3 + 0
    power += p; temp += t; count++
    if (count == 1 || p > max_power) max_power = p
}
END {
    if (count == 0) {
        print "No samples collected"
        exit
    }
    print "Average Power: " power/count " W"
    print "Peak Power: " max_power " W"
    print "Average Temperature: " temp/count " C"
}
' "$LOG_FILE"
7. CI/CD Regression Detection
# .github/workflows/gpu-benchmark.yml
# Builds and runs the GPU benchmarks on a self-hosted GPU runner, then fails
# the job when any kernel is slower than the stored baseline by more than the
# threshold.
# NOTE(review): pull_request jobs execute PR code on the self-hosted runner;
# restrict this trigger if the repository accepts untrusted contributions.
name: GPU Performance Benchmark

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  benchmark:
    runs-on: [self-hosted, gpu]
    steps:
      - uses: actions/checkout@v4

      - name: Build benchmarks
        run: |
          nvcc -O3 -arch=sm_80 benchmarks/*.cu -o gpu_benchmark

      - name: Run benchmarks
        run: |
          ./gpu_benchmark --json > benchmark_results.json

      - name: Check for regression
        run: |
          python scripts/check_regression.py \
            --current benchmark_results.json \
            --baseline benchmarks/baseline.json \
            --threshold 5.0 # 5% regression threshold

      # Upload even when the regression check fails, so the numbers that
      # triggered the failure can be inspected.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark_results.json
# scripts/check_regression.py
import json
import sys
import argparse
def check_regression(current_file, baseline_file, threshold_percent):
    """Compare current kernel timings against a baseline and exit accordingly.

    Both files are JSON documents with a top-level ``"kernels"`` object that
    maps a kernel name to its time in milliseconds.

    Args:
        current_file: Path to the JSON file holding the current timings.
        baseline_file: Path to the JSON file holding the baseline timings.
        threshold_percent: A kernel is reported as a regression when it is
            slower than its baseline by more than this percentage.

    Kernels missing from the baseline are ignored. A baseline time that is
    zero or negative cannot be used as a reference; such kernels are skipped
    with a warning on stderr instead of raising ZeroDivisionError.

    Exits the process with status 1 if any regression is found, else 0.
    """
    with open(current_file) as f:
        current = json.load(f)
    with open(baseline_file) as f:
        baseline = json.load(f)

    baseline_kernels = baseline['kernels']
    regressions = []
    for kernel, current_time in current['kernels'].items():
        if kernel not in baseline_kernels:
            continue
        baseline_time = baseline_kernels[kernel]
        if baseline_time <= 0:
            print(f"Warning: skipping {kernel}: non-positive baseline time {baseline_time}",
                  file=sys.stderr)
            continue
        change_percent = ((current_time - baseline_time) / baseline_time) * 100
        if change_percent > threshold_percent:
            regressions.append({
                'kernel': kernel,
                'baseline_ms': baseline_time,
                'current_ms': current_time,
                'change_percent': change_percent
            })

    if regressions:
        print("Performance regressions detected:")
        for r in regressions:
            print(f" {r['kernel']}: {r['baseline_ms']:.3f}ms -> {r['current_ms']:.3f}ms ({r['change_percent']:+.1f}%)")
        sys.exit(1)
    else:
        print("No performance regressions detected")
        sys.exit(0)
if __name__ == "__main__":
    # Command-line entry point: two required file paths plus an optional
    # percentage threshold, handed straight to check_regression.
    cli = argparse.ArgumentParser()
    for required_flag in ('--current', '--baseline'):
        cli.add_argument(required_flag, required=True)
    cli.add_argument('--threshold', type=float, default=5.0)
    opts = cli.parse_args()
    check_regression(opts.current, opts.baseline, opts.threshold)
8. Benchmark Report Generation
// Writes a Markdown benchmark report to `output_file`.
//
// results       One entry per benchmarked kernel.
// kernel_names  Display name for results[i]; if this vector is shorter than
//               `results`, the remaining rows are labelled "(unnamed)" rather
//               than indexing past its end.
// output_file   Path of the report to create or overwrite.
//
// The report holds a header (timestamp, GPU name, driver version) followed by
// one table row per result. If the file cannot be opened, a message is written
// to std::cerr and nothing else happens.
void generate_benchmark_report(
    const std::vector<BenchmarkResult>& results,
    const std::vector<std::string>& kernel_names,
    const std::string& output_file
) {
    std::ofstream report(output_file);
    if (!report) {
        std::cerr << "generate_benchmark_report: cannot open " << output_file << "\n";
        return;
    }

    report << "# GPU Benchmark Report\n\n";
    report << "Date: " << get_timestamp() << "\n";
    report << "GPU: " << get_gpu_name() << "\n";
    report << "Driver: " << get_driver_version() << "\n\n";

    report << "## Results Summary\n\n";
    report << "| Kernel | Min (ms) | Mean (ms) | Max (ms) | Stddev | Throughput (GB/s) |\n";
    report << "|--------|----------|-----------|----------|--------|-------------------|\n";

    const std::string unnamed = "(unnamed)";
    for (size_t i = 0; i < results.size(); i++) {
        const auto& r = results[i];
        const std::string& name = (i < kernel_names.size()) ? kernel_names[i] : unnamed;
        // Stream precision is sticky, so it is reset to 3 at the start of each
        // row after the throughput column switched it to 2.
        report << "| " << name
               << " | " << std::fixed << std::setprecision(3) << r.min_ms
               << " | " << r.mean_ms
               << " | " << r.max_ms
               << " | " << r.stddev_ms
               << " | " << std::setprecision(2) << r.throughput_gbps
               << " |\n";
    }

    report.close();
}
MCP Server Integration
This skill can leverage the following MCP servers:
| Server | Description | Reference |
|--------|-------------|-----------|
| NVIDIA AgentIQ MCP | Profiling and observability | NVIDIA Docs |
Best Practices
Benchmark Design
- Warm-up runs - Execute several iterations before timing
- Multiple iterations - Collect statistics, not single measurements
- Report variance - Include stddev and min/max
- Control environment - Fix GPU clocks, disable boost
Reproducibility
# Pin the GPU clocks so repeated benchmark runs see the same frequencies
sudo nvidia-smi --persistence-mode=1          # Enable persistence mode
sudo nvidia-smi --lock-gpu-clocks=1500,1500   # Lock graphics clock (min,max)
sudo nvidia-smi --lock-memory-clocks=877,877  # Lock memory clock (min,max)

# Run the benchmark under the pinned clocks
./gpu_benchmark

# Hand clock control back to the driver
sudo nvidia-smi --reset-gpu-clocks      # Reset graphics clock
sudo nvidia-smi --reset-memory-clocks   # Reset memory clock
Metrics to Track
| Metric | Description |
|--------|-------------|
| Execution time | Wall-clock kernel duration |
| Throughput | Data processed per second |
| FLOPS | Floating-point operations per second |
| Bandwidth utilization | % of theoretical peak |
| Occupancy | Active warps / max warps |
Process Integration
This skill integrates with the following processes:
- gpu-performance-regression-testing.js - CI/CD integration
- performance-profiling-analysis.js - Detailed analysis
- occupancy-optimization.js - Resource utilization
Output Format
When executing operations, provide structured output:
{
"operation": "benchmark-suite",
"status": "success",
"environment": {
"gpu": "NVIDIA A100-SXM4-80GB",
"cuda_version": "12.2",
"driver_version": "535.104.05",
"timestamp": "2026-01-24T10:30:00Z"
},
"results": [
{
"kernel": "matrixMultiply",
"config": {
"grid": [256, 256, 1],
"block": [16, 16, 1],
"data_size_mb": 1024
},
"timing": {
"min_ms": 1.234,
"mean_ms": 1.267,
"max_ms": 1.312,
"stddev_ms": 0.023,
"iterations": 100
},
"performance": {
"throughput_gbps": 1234.5,
"tflops": 15.2,
"efficiency_percent": 78.5
}
}
],
"comparison": {
"baseline_version": "v1.2.3",
"regressions": [],
"improvements": [
{"kernel": "matrixMultiply", "improvement_percent": 5.2}
]
},
"artifacts": ["benchmark_report.md", "results.json"]
}
Constraints
- Lock GPU clocks for reproducible results
- Run multiple iterations to capture variance
- Account for thermal throttling in long benchmarks
- Validate correctness before benchmarking performance
- Use appropriate warm-up iterations