warp-primitives Skill | Agent Skills

warp-primitives

You are warp-primitives - a specialized skill for warp-level programming and SIMD optimization on GPUs. This skill provides expert capabilities for low-level GPU performance optimization.

Overview

This skill enables AI-powered warp-level programming including:

Use warp shuffle instructions (_shfl*)
Implement warp voting functions (__ballot, __any, __all)
Design warp-synchronous algorithms
Optimize warp divergence patterns
Use cooperative groups for flexible sync
Implement warp-level reductions
Analyze and minimize warp stalls
Support CUDA 11+ warp intrinsics

Prerequisites

CUDA Toolkit 11.0+
GPU with compute capability 3.0+
Understanding of SIMT execution model

Capabilities

1. Warp Shuffle Instructions

Data exchange within a warp:

// __shfl_sync: Broadcast from any lane
__device__ float warpBroadcast(float val, int srcLane) {
    return __shfl_sync(0xffffffff, val, srcLane);
}

// __shfl_up_sync: Shift up (for inclusive scan)
__device__ float shflUp(float val, int delta) {
    return __shfl_up_sync(0xffffffff, val, delta);
}

// __shfl_down_sync: Shift down (for reduction)
__device__ float shflDown(float val, int delta) {
    return __shfl_down_sync(0xffffffff, val, delta);
}

// __shfl_xor_sync: Butterfly pattern (for reduction)
__device__ float shflXor(float val, int laneMask) {
    return __shfl_xor_sync(0xffffffff, val, laneMask);
}

// Warp-level reduction using shuffle
__device__ float warpReduceSum(float val) {
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}

// Warp-level reduction using XOR (butterfly)
__device__ float warpReduceSumXor(float val) {
    for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
        val += __shfl_xor_sync(0xffffffff, val, mask);
    }
    return val;  // All lanes have result
}

// Warp-level inclusive scan
__device__ float warpInclusiveScan(float val) {
    for (int offset = 1; offset < warpSize; offset <<= 1) {
        float n = __shfl_up_sync(0xffffffff, val, offset);
        if (threadIdx.x % warpSize >= offset) {
            val += n;
        }
    }
    return val;
}

2. Warp Voting Functions

Collective warp operations:

// __ballot_sync: Create bitmask of predicate
__device__ unsigned int warpBallot(bool predicate) {
    return __ballot_sync(0xffffffff, predicate);
}

// __any_sync: Any thread has true predicate
__device__ bool warpAny(bool predicate) {
    return __any_sync(0xffffffff, predicate);
}

// __all_sync: All threads have true predicate
__device__ bool warpAll(bool predicate) {
    return __all_sync(0xffffffff, predicate);
}

// Count set bits in warp
__device__ int warpPopcount(bool predicate) {
    return __popc(__ballot_sync(0xffffffff, predicate));
}

// Find position within active threads
__device__ int warpExclusiveCount(bool predicate) {
    unsigned int mask = __ballot_sync(0xffffffff, predicate);
    unsigned int laneMask = (1u << (threadIdx.x % warpSize)) - 1;
    return __popc(mask & laneMask);
}

// Example: Stream compaction within warp
__device__ int warpCompact(int* output, int value, bool keep) {
    unsigned int mask = __ballot_sync(0xffffffff, keep);
    int total = __popc(mask);

    if (keep) {
        int pos = __popc(mask & ((1u << (threadIdx.x % warpSize)) - 1));
        output[pos] = value;
    }

    return total;
}

3. Cooperative Groups

Flexible synchronization:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Warp-level cooperative group
__device__ void warpOperation(float* data) {
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(cg::this_thread_block());

    int lane = warp.thread_rank();
    float val = data[lane];

    // Warp-level reduction
    for (int offset = warp.size() / 2; offset > 0; offset >>= 1) {
        val += warp.shfl_down(val, offset);
    }

    if (lane == 0) data[0] = val;
}

// Flexible tile sizes
template<int TILE_SIZE>
__device__ void tiledOperation(float* data) {
    cg::thread_block_tile<TILE_SIZE> tile =
        cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());

    float val = data[tile.thread_rank()];

    // Tile-level reduction
    for (int offset = tile.size() / 2; offset > 0; offset >>= 1) {
        val += tile.shfl_down(val, offset);
    }

    if (tile.thread_rank() == 0) {
        data[tile.meta_group_rank()] = val;
    }
}

// Grid-level synchronization (requires cooperative launch)
__global__ void gridSyncKernel(float* data, int n) {
    cg::grid_group grid = cg::this_grid();

    // Phase 1
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) data[idx] *= 2.0f;

    grid.sync();  // Synchronize entire grid

    // Phase 2 - all blocks see phase 1 results
    if (idx < n) data[idx] += 1.0f;
}

4. Warp Divergence Optimization

Minimize divergence impact:

// Bad: Divergent branches
__global__ void divergentKernel(float* data, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        if (data[idx] > 0) {  // Divergent!
            data[idx] = expf(data[idx]);  // Some threads execute
        } else {
            data[idx] = 0.0f;  // Other threads execute
        }
    }
}

// Better: Predicated execution
__global__ void predicatedKernel(float* data, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        bool positive = data[idx] > 0;
        // Both paths computed, result selected
        float result = positive ? expf(data[idx]) : 0.0f;
        data[idx] = result;
    }
}

// Best: Reorganize data to reduce divergence
// Process positive and negative values separately
__global__ void reorganizedKernel(float* positive, float* negative,
                                   int nPos, int nNeg) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // All threads in warp take same path
    if (idx < nPos) {
        positive[idx] = expf(positive[idx]);
    }
}

// Warp-level early exit
__global__ void warpEarlyExit(float* data, int* flags, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Check if entire warp can skip
    bool needsWork = (idx < n) && flags[idx];
    if (!__any_sync(0xffffffff, needsWork)) {
        return;  // Entire warp exits
    }

    // Only warps with work continue
    if (needsWork) {
        data[idx] = expensiveComputation(data[idx]);
    }
}

5. Warp-Synchronous Programming

Implicit warp synchronization:

// Pre-Volta: Implicit warp sync (deprecated pattern)
// Post-Volta: Use explicit __syncwarp()

__device__ float warpSafeReduce(float val) {
    // Always use explicit sync mask
    val += __shfl_down_sync(0xffffffff, val, 16);
    val += __shfl_down_sync(0xffffffff, val, 8);
    val += __shfl_down_sync(0xffffffff, val, 4);
    val += __shfl_down_sync(0xffffffff, val, 2);
    val += __shfl_down_sync(0xffffffff, val, 1);
    return val;
}

// Active mask handling
__device__ float activeWarpReduce(float val) {
    unsigned int active = __activemask();
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(active, val, offset);
    }
    return val;
}

// Match sync for convergent warps
__device__ void convergentOperation() {
    // Ensure threads converge before warp operation
    unsigned int mask = __match_any_sync(__activemask(), threadIdx.x / 8);
    // mask contains threads with same value
}

6. Warp-Level Matrix Operations

Matrix fragments with warp cooperation:

// Warp-level matrix multiply (simplified WMMA concept)
__device__ void warpMatMul4x4(float* A, float* B, float* C) {
    int lane = threadIdx.x % 32;

    // Each lane owns one element of result
    int row = lane / 4;
    int col = lane % 4;

    float sum = 0.0f;
    for (int k = 0; k < 4; k++) {
        // Broadcast A[row][k] and B[k][col]
        float a = __shfl_sync(0xffffffff, A[row * 4 + k], row * 4 + k);
        float b = __shfl_sync(0xffffffff, B[k * 4 + col], k * 4 + col);
        sum += a * b;
    }
    C[lane] = sum;
}

7. Warp Stall Analysis

Identify and fix stall causes:

// Common stall causes and solutions

// 1. Memory dependency stalls
__global__ void memoryStall(float* data) {
    int idx = threadIdx.x;
    float val = data[idx];  // Long latency load
    // Stall here waiting for data
    data[idx] = val * 2.0f;
}

// Solution: Increase occupancy or hide latency
__global__ void hiddenLatency(float* data, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Load multiple values
    float v1 = data[idx];
    float v2 = data[idx + n];

    // Compute on v1 while v2 loads
    v1 = v1 * 2.0f + 1.0f;

    // Now v2 should be ready
    v2 = v2 * 2.0f + 1.0f;

    data[idx] = v1;
    data[idx + n] = v2;
}

// 2. Synchronization stalls
__global__ void syncStall(float* shared_data) {
    __shared__ float smem[256];
    smem[threadIdx.x] = shared_data[threadIdx.x];
    __syncthreads();  // All threads wait here
}

// Solution: Minimize sync points, use warp-level sync

Process Integration

This skill integrates with the following processes:

warp-efficiency-optimization.js - Warp efficiency workflow
reduction-scan-implementation.js - Reduction/scan patterns
parallel-algorithm-design.js - Algorithm optimization

Output Format

{
  "operation": "generate-warp-reduction",
  "configuration": {
    "data_type": "float",
    "reduction_op": "sum",
    "use_xor_pattern": true
  },
  "generated_code": "warp_reduction.cu",
  "analysis": {
    "shuffle_instructions": 5,
    "sync_masks": "0xffffffff",
    "cooperative_groups_used": false
  },
  "performance": {
    "instructions_per_element": 6,
    "warp_efficiency": 1.0,
    "divergence": "none"
  }
}

Dependencies

CUDA Toolkit 11.0+
cooperative_groups header

Constraints

Warp shuffle requires all participating threads
Sync masks must correctly represent active threads
Cooperative groups require compile-time tile sizes
Grid sync requires cooperative kernel launch

Agent Skills: warp-primitives

Install this agent skill to your local

Skill Files