CUDA is a parallel computing platform and programming model developed by NVIDIA. It enables developers to use NVIDIA GPUs for general-purpose computing. This tutorial will guide you through the basics of writing CUDA kernels, assuming no prior experience with GPU programming.
A CUDA kernel is a special C/C++ function that is launched from the host (CPU) and executed on the GPU. It is declared by placing the `__global__` qualifier before the function's return type. A kernel is executed in parallel by many threads; threads are grouped into blocks, and the blocks of one launch together form a grid.
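Here's a simple, complete example: a CUDA kernel that adds two vectors, together with the host code that allocates memory, launches the kernel, and verifies the result: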
#include <iostream>
#include <cuda_runtime.h>
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Launch layout: one-dimensional. Each thread handles the single element at
// its flat global index (blockIdx.x * blockDim.x + threadIdx.x), so the launch
// must supply at least N threads in total for every element to be written.
// Threads whose index is >= N do nothing, which lets the grid be rounded up
// to a whole number of blocks.
//
// Parameters:
//   A, B - input arrays, each readable by the device for at least N floats
//   C    - output array, writable by the device for at least N floats
//   N    - number of elements to process; N <= 0 makes the kernel a no-op
__global__ void vectorAddKernel(const float *A, const float *B, float *C, int N) {
    // Compute the index in 64-bit arithmetic. blockIdx.x * blockDim.x is a
    // 32-bit unsigned product; narrowing it to int lets indices >= 2^31 wrap
    // to negative values that would slip past the `idx < N` guard and access
    // memory out of bounds when the grid is larger than the data.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver for the vector-add example.
//
// Fills two host arrays of N floats, copies them to the device, launches
// vectorAddKernel, copies the sum back and compares it against a host-side
// reference. Every CUDA runtime call and the kernel launch are checked; after
// the first failure the remaining GPU steps are skipped, but all memory is
// still released.
//
// Returns 0 when every step succeeds and the result verifies, 1 otherwise.
int main() {
    const int N = 1024;
    const size_t size = N * sizeof(float);

    // Allocate and initialize host memory
    float *h_A = new float[N];
    float *h_B = new float[N];
    float *h_C = new float[N];
    for (int i = 0; i < N; ++i) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(i * 2);
    }

    // Device pointers start as nullptr so the cleanup at the end stays valid
    // even when an allocation fails (cudaFree on a null pointer is a no-op).
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;

    // Allocate device memory. `ok` short-circuits: once one step has failed,
    // the steps after it are not attempted.
    bool ok = cudaSucceeded(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)") &&
              cudaSucceeded(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)") &&
              cudaSucceeded(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");

    // Copy data from host to device
    ok = ok &&
         cudaSucceeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(d_A <- h_A)") &&
         cudaSucceeded(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(d_B <- h_B)");

    if (ok) {
        // Define block and grid sizes; the ceiling division guarantees at
        // least N threads even when N is not a multiple of blockSize.
        const int blockSize = 256;
        const int gridSize = (N + blockSize - 1) / blockSize;

        // Launch the CUDA kernel
        vectorAddKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

        // A kernel launch has no return value; an invalid launch
        // configuration is only reported through cudaGetLastError().
        ok = cudaSucceeded(cudaGetLastError(), "vectorAddKernel launch");
    }

    // Copy the result from device to host. This blocking copy runs after the
    // kernel has finished, so an error raised while the kernel was executing
    // is reported here.
    ok = ok &&
         cudaSucceeded(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(h_C <- d_C)");

    // Verify the result (only meaningful when h_C was actually filled)
    if (ok) {
        for (int i = 0; i < N; ++i) {
            if (h_C[i] != h_A[i] + h_B[i]) {
                std::cerr << "Verification failed at index " << i << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Free device and host memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    // Make failures visible to the caller through the exit status.
    return ok ? 0 : 1;
}