CUDA Programming
Programming model
Executing a CUDA program involves three main steps:
- Host-to-device data transfer
- Load GPU program and execute, caching data on-chip for performance.
- Device-to-host data transfer
Function prefix:
__global__ // called by host (<<<grid, block>>>), run on device. (kernels, must return void)
__device__ // called by device (__global__ or __device__), run on device.
__host__ // called by host, run on host.
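A minimal sketch tying the three qualifiers together (the names square, squareKernel, and d_out are made up for illustration):
#include <cstdio>
// __device__: callable only from device code (here, from the kernel below)
__device__ int square(int x) { return x * x; }
// __global__: a kernel, launched from the host with <<<grid, block>>>, returns void
__global__ void squareKernel(int *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = square(i);
}
// Plain functions are __host__ by default: they run on the CPU
int main() {
  const int n = 8;
  int h_out[n];
  int *d_out;
  cudaMalloc(&d_out, n * sizeof(int));
  squareKernel<<<1, n>>>(d_out, n);  // host launches the device kernel
  cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) printf("%d ", h_out[i]);
  printf("\n");
  cudaFree(d_out);
  return 0;
}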
CUDA kernel and thread hierarchy
- Kernel: a function that gets executed on the GPU.
- Thread: concurrent code and its associated state, executed on the CUDA device.
- Block: a group of threads that are executed together and form the unit of resource assignment.
- Grid: a group of thread blocks that must all complete before the next kernel call of the program can take effect.
To compute the unique index of a thread (shown here for the 1D case), use the following built-in variables:
threadIdx.x // local id within a block (ranges from 0 to num_threads - 1)
blockDim.x // = num_threads
blockIdx.x // ranges from 0 to num_blocks - 1
gridDim.x // = num_blocks
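A small runnable sketch of the index math (the kernel name showIndex and the 2-block, 4-thread launch are arbitrary):
#include <cstdio>
// Each thread prints its local index and its unique global index
__global__ void showIndex() {
  int global = blockIdx.x * blockDim.x + threadIdx.x;  // unique 1D id across the grid
  printf("block %d/%d, thread %d/%d -> global id %d\n",
         blockIdx.x, gridDim.x, threadIdx.x, blockDim.x, global);
}
int main() {
  showIndex<<<2, 4>>>();    // grid of 2 blocks, 4 threads per block -> 8 threads total
  cudaDeviceSynchronize();  // wait so the device-side printf output is flushed
  return 0;
}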
CUDA streaming multiprocessor (SM)
- Each CUDA block is executed by one streaming multiprocessor (SM) and cannot be migrated to other SMs in the GPU (except during preemption, debugging, or CUDA dynamic parallelism).
- One SM can run several concurrent CUDA blocks.
- Each kernel is executed on one device, and CUDA supports running multiple kernels on a device at the same time.
- The SM uses SIMT (Single-Instruction, Multiple-Thread) execution; the basic execution unit is the warp.
- Each warp contains 32 threads, and all of them execute the same instruction.
- Once a block has been assigned to an SM, it is further divided into warps for execution.
- The block size should be a multiple of 32 for the best performance, as sketched below.
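A sketch of the multiple-of-32 guideline: round a requested block size up to the nearest warp multiple so no warp is launched partially filled (roundToWarp is a made-up helper, not a CUDA API):
#include <cstdio>
const int WARP_SIZE = 32;
// Round a requested thread count up to a multiple of the warp size
int roundToWarp(int threads) {
  return ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
}
int main() {
  // A 100-thread block would still occupy 4 full warps (128 lanes) on the SM,
  // with 28 lanes idle in the last warp, so ask for 128 threads directly.
  printf("%d -> %d threads per block\n", 100, roundToWarp(100));
  return 0;
}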
Memory model
- Each thread has its own local memory.
- Each block has its own shared memory, visible to all threads in the block.
- Each thread can access global memory, as well as the read-only constant and texture memory (sketched below).
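A minimal kernel sketch touching each memory space (the names memSpaces, tile, and scale are illustrative):
#include <cstdio>
// Constant memory: read-only on the device, set from the host with cudaMemcpyToSymbol
__constant__ float scale;
__global__ void memSpaces(const float *in, float *out, int n) {
  // Shared memory: one copy per block, visible to every thread in the block
  __shared__ float tile[256];
  // Local per-thread variable (registers / local memory)
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    tile[threadIdx.x] = in[i];           // global memory -> shared memory
    // Each thread reads back only its own element, so no __syncthreads() is needed here
    out[i] = tile[threadIdx.x] * scale;  // shared + constant -> global memory
  }
}
int main() {
  const int n = 256;
  float h_in[n], h_out[n];
  for (int i = 0; i < n; i++) h_in[i] = float(i);
  float h_scale = 2.0f;
  cudaMemcpyToSymbol(scale, &h_scale, sizeof(float));
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  memSpaces<<<1, n>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("out[3] = %.1f\n", h_out[3]);   // expect 6.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}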
Code tutorial
Vector addition
Baseline
#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>
// CUDA kernel for vector addition
// __global__ means this is called from the CPU, and runs on the GPU
__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,
int *__restrict c, int N) {
// Calculate global thread ID
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
// Boundary check
if (tid < N) c[tid] = a[tid] + b[tid];
}
// Check vector add result
void verify_result(std::vector<int> &a, std::vector<int> &b,
std::vector<int> &c) {
for (int i = 0; i < a.size(); i++) {
assert(c[i] == a[i] + b[i]);
}
}
int main() {
// constexpr allows certain computations to be performed
// at compile time rather than at runtime
// 0000 0000 0000 0000 0000 0000 0000 0001(original binary representation of 1)
// << 16
// ---------------------------------------
// 0000 0000 0000 0001 0000 0000 0000 0000 (result after left shift by 16 positions)
// Array size: 2^16 = 65536
constexpr int N = 1 << 16;
constexpr size_t bytes = sizeof(int) * N;
// printf("Hello bytes %zu\n", bytes);
// Vectors for holding the host-side (CPU-side) data
std::vector<int> a;
// This reserves memory space for N elements in the vector a
// without actually initializing those elements.
a.reserve(N);
std::vector<int> b;
b.reserve(N);
// c must be sized (not just reserved): cudaMemcpy later writes N ints through c.data()
std::vector<int> c(N);
// Initialize random numbers in each array
for (int i = 0; i < N; i++) {
a.push_back(rand() % 100);
b.push_back(rand() % 100);
}
// Allocate memory on the device
// These allocated memory blocks can be used for storing data
// that will be processed by GPU kernels.
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
// Copy data from the host to the device (CPU -> GPU)
cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice);
// Threads per block (CTA) (1024)
int NUM_THREADS = 1 << 10;
// Blocks (CTAs) per Grid
// We need to launch at LEAST as many threads as we have elements
// This equation pads an extra CTA to the grid if N cannot evenly be divided
// by NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024)
int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
// Launch the kernel on the GPU
// Kernel launches are asynchronous: the CPU program continues execution right
// after the call, without waiting for the kernel to finish
vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, N);
// Copy sum vector from device to host
// cudaMemcpy is a synchronous operation, and waits for the prior kernel
// launch to complete (both go to the default stream in this case).
// Therefore, this cudaMemcpy acts as both a memcpy and synchronization
// barrier.
cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost);
// Check result for errors
verify_result(a, b, c);
// Free memory on device
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
std::cout << "COMPLETED SUCCESSFULLY\n";
return 0;
}
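To build and run the baseline (assuming the file is saved as vector_add.cu and the CUDA toolkit's nvcc is on the PATH):
nvcc vector_add.cu -o vector_add
./vector_add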
Baseline + Unified memory
Use unified memory (cudaMallocManaged()) to avoid allocating memory on the host and the device separately. Do not forget cudaDeviceSynchronize(), since we no longer get the implicit synchronization provided by cudaMemcpy().
#include <stdio.h>
#include <cassert>
#include <iostream>
using std::cout;
// CUDA kernel for vector addition
// No change when using CUDA unified memory
__global__ void vectorAdd(int *a, int *b, int *c, int N) {
// Calculate global thread ID
int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
// Boundary check
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
int main() {
// Array size of 2^16 (65536 elements)
const int N = 1 << 16;
size_t bytes = N * sizeof(int);
// Declare unified memory pointers
int *a, *b, *c;
// Allocate memory for these pointers
cudaMallocManaged(&a, bytes);
cudaMallocManaged(&b, bytes);
cudaMallocManaged(&c, bytes);
// Initialize vectors
for (int i = 0; i < N; i++) {
a[i] = rand() % 100;
b[i] = rand() % 100;
}
// Threads per CTA (1024 threads per CTA)
int BLOCK_SIZE = 1 << 10;
// CTAs per Grid
int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
// Call CUDA kernel
vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(a, b, c, N);
// Wait for all previous operations before using values
// We need this because we don't get the implicit synchronization of
// cudaMemcpy like in the original example
cudaDeviceSynchronize();
// Verify the result on the CPU
for (int i = 0; i < N; i++) {
assert(c[i] == a[i] + b[i]);
}
// Free unified memory (same as memory allocated with cudaMalloc)
cudaFree(a);
cudaFree(b);
cudaFree(c);
cout << "COMPLETED SUCCESSFULLY!!!!!!!\n";
return 0;
}
Matrix multiplication
Baseline
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>
using std::cout;
using std::generate;
using std::vector;
__global__ void matrixMul(const int *a, const int *b, int *c, int N) {
// Compute each thread's global row and column index
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Iterate over row, and down column
c[row * N + col] = 0;
for (int k = 0; k < N; k++) {
// Accumulate results for a single element
c[row * N + col] += a[row * N + k] * b[k * N + col];
}
}
// Check result on the CPU
void verify_result(vector<int> &a, vector<int> &b, vector<int> &c, int N) {
// For every row...
for (int i = 0; i < N; i++) {
// For every column...
for (int j = 0; j < N; j++) {
// For every element in the row-column pair
int tmp = 0;
for (int k = 0; k < N; k++) {
// Accumulate the partial results
tmp += a[i * N + k] * b[k * N + j];
}
// Check against the CPU result
assert(tmp == c[i * N + j]);
}
}
}
int main() {
// Matrix size of 1024 x 1024;
int N = 1 << 10;
// Size (in bytes) of matrix
size_t bytes = N * N * sizeof(int);
// Host vectors
vector<int> h_a(N * N);
vector<int> h_b(N * N);
vector<int> h_c(N * N);
// Initialize matrices
generate(h_a.begin(), h_a.end(), []() { return rand() % 100; });
generate(h_b.begin(), h_b.end(), []() { return rand() % 100; });
// Allocate device memory
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
// Copy data to the device
cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice);
// Threads per CTA dimension
int THREADS = 32;
// Blocks per grid dimension (assumes THREADS divides N evenly)
int BLOCKS = N / THREADS;
// Use dim3 structs for block and grid dimensions
dim3 threads(THREADS, THREADS);
dim3 blocks(BLOCKS, BLOCKS);
// Launch kernel
matrixMul<<<blocks, threads>>>(d_a, d_b, d_c, N);
// Copy back to the host
cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost);
// Check result
verify_result(h_a, h_b, h_c, N);
cout << "COMPLETED SUCCESSFULLY\n";
// Free memory on device
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
Memory coalescing: when the 32 threads of a warp access consecutive global memory addresses, the hardware combines (coalesces) those accesses into a small number of memory transactions; strided or scattered access patterns break coalescing and waste bandwidth.
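A kernel-only sketch contrasting the two access patterns (kernel names are illustrative; a launcher like the vector-add main above would drive them):
// Coalesced: consecutive threads in a warp read consecutive addresses,
// so the warp's 32 loads map to a handful of memory transactions.
__global__ void copyCoalesced(const float *in, float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i];
}
// Strided: consecutive threads read addresses 'stride' elements apart,
// so each warp touches many separate memory segments.
__global__ void copyStrided(const float *in, float *out, int n, int stride) {
  int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
  if (i < n) out[i] = in[i];
}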
Tiled matrix multiplication
Tiled matrix multiplication essentially performs many mini matrix multiplications:
- Tiling is a technique that partitions the data into equal-sized subsets (tiles) that fit into shared memory
- Each data subset is handled by one thread block
// This program computes matrix multiplication using shared memory tiling
// By: Nick from CoffeeBeforeArch
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>
using std::cout;
using std::generate;
using std::vector;
// Pull out matrix and shared memory tile size
const int N = 1 << 10; // 1024
const int SHMEM_SIZE = 1 << 10; // 1024 = 32 x 32, one element per thread in a block
__global__ void matrixMul(const int *a, const int *b, int *c) {
// Compute each thread's global row and column index
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Statically allocated shared memory
// Shared among all threads within a block and can be
// used for communication and data sharing among those threads
__shared__ int s_a[SHMEM_SIZE];
__shared__ int s_b[SHMEM_SIZE];
// Accumulate in temporary variable
int tmp = 0;
// Each tile is blockDim.x columns wide (x direction) and blockDim.y rows tall (y direction)
// Sweep tile across matrix
for (int i = 0; i < N; i += blockDim.x) {
// Load in elements for this tile
// The size of s_a equals the number of threads in a block
// s_a[threadIdx.y * blockDim.x + threadIdx.x]: Local position within the block
// a[row * N + i + threadIdx.x]: Global position
s_a[threadIdx.y * blockDim.x + threadIdx.x] = a[row * N + i + threadIdx.x];
s_b[threadIdx.y * blockDim.x + threadIdx.x] = b[i * N + threadIdx.y * N + col];
// Wait for both tiles to be loaded in before doing computation
__syncthreads();
// Do matrix multiplication on the small matrix
for (int j = 0; j < blockDim.x; j++) {
tmp +=
s_a[threadIdx.y * blockDim.x + j] * s_b[j * blockDim.x + threadIdx.x];
}
// Wait for all threads to finish using current tiles before loading in new
// ones
__syncthreads();
}
// Write back results
// A block of threads writes back to a block of memory
c[row * N + col] = tmp;
}
// Check result on the CPU
void verify_result(vector<int> &a, vector<int> &b, vector<int> &c) {
// For every row...
for (int i = 0; i < N; i++) {
// For every column...
for (int j = 0; j < N; j++) {
// For every element in the row-column pair
int tmp = 0;
for (int k = 0; k < N; k++) {
// Accumulate the partial results
tmp += a[i * N + k] * b[k * N + j];
}
// Check against the CPU result
assert(tmp == c[i * N + j]);
}
}
}
int main() {
// Size (in bytes) of matrix
size_t bytes = N * N * sizeof(int);
// Host vectors
vector<int> h_a(N * N);
vector<int> h_b(N * N);
vector<int> h_c(N * N);
// Initialize matrices
generate(h_a.begin(), h_a.end(), []() { return rand() % 100; });
generate(h_b.begin(), h_b.end(), []() { return rand() % 100; });
// Allocate device memory
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
// Copy data to the device
cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice);
// Threads per CTA dimension
// A CTA (cooperative thread array) is CUDA's term for a thread block:
// a group of threads that can cooperate with each other.
int THREADS = 32;
// Blocks per grid dimension (assumes THREADS divides N evenly)
int BLOCKS = N / THREADS;
// Use dim3 structs for block and grid dimensions
// The third dimension is implicitly set to 1
dim3 threads(THREADS, THREADS);
dim3 blocks(BLOCKS, BLOCKS);
// Launch kernel
matrixMul<<<blocks, threads>>>(d_a, d_b, d_c);
// Copy back to the host
cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost);
// Check result
verify_result(h_a, h_b, h_c);
cout << "COMPLETED SUCCESSFULLY\n";
// Free memory on device
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}