CUDA
cudaMalloc(LOCATION, SIZE)
- LOCATION: Address of a pointer variable (passed as void**); cudaMalloc stores the address of the newly allocated device (GPU) memory in that pointer
- SIZE: number of bytes to allocate
De-Allocate:
cudaFree()
cudaMemcpy(DST, SRC, NUM_BYTES, DIRECTION)
- DST: An address of the memory to copy into
- SRC: An address of the memory to copy from
- NUM_BYTES: N * sizeof(type)
- DIRECTION:
- cudaMemcpyHostToDevice
- cudaMemcpyDeviceToHost
// Copies the first element of d_in into the first element of d_out.
//
// d_out: destination; only element 0 is written.
// d_in:  source; only element 0 is read.
// Both pointers are dereferenced in device code, so they must point to memory
// the GPU can access. No thread index is used: every launched thread performs
// the same single-element copy.
__global__ void kernel(int *d_out, int *d_in) {
    *d_out = *d_in;
}
In a kernel definition, the built-in variable
threadIdx
is accessible to get the index of each thread within its thread block. It has 3 dimensions:
threadIdx.x
, threadIdx.y
and threadIdx.z
. The index of a block within the grid is available in the same way:
blockIdx.x
, blockIdx.y
and blockIdx.z
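. With blockDim.x being the number of threads per block along x, the global index of a thread along x is: i = threadIdx.x + blockIdx.x * blockDim.x;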
To explicitly synchronize all threads in a block (adding a barrier), call
__syncthreads().
// Shift pattern: each thread replaces a[i] with its right-hand neighbour's value.
// Assumes i + 1 is a valid index into a — TODO confirm against the enclosing kernel.
// Read the neighbour into a thread-local variable before any thread overwrites it.
int temp = a[i + 1];
// Barrier: every thread of the block must finish its read before any thread writes.
// The call parentheses are required; a bare `__syncthreads;` only names the
// function without calling it, so no barrier would be executed.
// NOTE: the barrier covers threads of the same block only.
__syncthreads();
a[i] = temp;
// Barrier: all writes by the block are complete before a[] is read again.
__syncthreads();
// Host-side workflow: allocate, copy in, launch, wait, copy out, clean up.
// NOTE: return codes are omitted for brevity; real code should check malloc
// for NULL and every CUDA runtime call for cudaSuccess.

// Allocate the host value. It has to exist before it is copied to the device
// and before it is handed to free() at the end.
int *h_c = (int *)malloc(sizeof(int));
*h_c = 0;  // give the host value a defined state before it is uploaded
int *d_c = nullptr;
// Allocate memory on the device
cudaMalloc((void**)&d_c, sizeof(int));
// Copy content from host to device
cudaMemcpy(d_c, h_c, sizeof(int), cudaMemcpyHostToDevice);
// Launch the kernel with one block of one thread (arguments elided in these notes)
dim3 grid_size(1);
dim3 block_size(1);
kernel<<<grid_size, block_size>>>(...);
// Force host to wait on the completion of the kernel
cudaDeviceSynchronize();
// Copy data back to host
cudaMemcpy(h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
// Clean up memory: device allocation with cudaFree, host allocation with free
cudaFree(d_c);
free(h_c);
Last modified 5mo ago