The following source listing for incrementMappedArrayInPlace.cu is an adapted version of the incrementArrays.cu example from Part 2 to use the new mapped, pinned runtime API.
// incrementMappedArrayInPlace.cu
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
// define the problem and block size
#define NUMBER_OF_ARRAY_ELEMENTS 100000
#define N_THREADS_PER_BLOCK 256
void incrementArrayOnHost(float *a, int N)
{
int i;
for (i=0; i < N; i++) a[i] = a[i]+1.f;
}
__global__ void incrementArrayOnDevice(float *a, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N) a[idx] = a[idx]+1.f;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
int main(void)
{
float *a_m; // pointer to host memory
float *a_d; // pointer to mapped device memory
float *check_h; // pointer to host memory used to check results
int i, N = NUMBER_OF_ARRAY_ELEMENTS;
size_t size = N*sizeof(float);
cudaDeviceProp deviceProp;
#if CUDART_VERSION < 2020
#error "This CUDART version does not support mapped memory!\n"
#endif
// Get properties and verify device 0 supports mapped memory
cudaGetDeviceProperties(&deviceProp, 0);
checkCUDAError("cudaGetDeviceProperties");
if(!deviceProp.canMapHostMemory) {
fprintf(stderr, "Device %d cannot map host memory!\n", 0);
exit(EXIT_FAILURE);
}
// set the device flags for mapping host memory
cudaSetDeviceFlags(cudaDeviceMapHost);
checkCUDAError("cudaSetDeviceFlags");
// allocate mapped arrays
cudaHostAlloc((void **)&a_m, size, cudaHostAllocMapped);
checkCUDAError("cudaHostAllocMapped");
// Get the device pointers to the mapped memory
cudaHostGetDevicePointer((void **)&a_d, (void *)a_m, 0);
checkCUDAError("cudaHostGetDevicePointer");
// initialization of host data
for (i=0; i<N; i++) a_m[i] = (float)i;
// do calculation on device:
// Part 1 of 2. Compute execution configuration
int blockSize = N_THREADS_PER_BLOCK;
int nBlocks = N/blockSize + (N%blockSize > 0?1:0);
// Part 2 of 2. Call incrementArrayOnDevice kernel
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
checkCUDAError("incrementArrayOnDevice");
/* Note the allocation, initialization and call to incrementArrayOnHost
occurs asynchronously to the GPU */
check_h = (float *)malloc(size);
for (i=0; i<N; i++) check_h[i] = (float)i;
incrementArrayOnHost(check_h, N);
// Make certain that all threads are idle before proceeding
cudaThreadSynchronize();
checkCUDAError("cudaThreadSynchronize");
// check results
for (i=0; i<N; i++) assert(check_h[i] == a_m[i]);
// cleanup
free(check_h); // free host memory
cudaFreeHost(a_m); // free mapped memory (and device pointers)
}
CUDA 2.2 added the following two device properties to the cudaDeviceProp structure that is retrieved by cudaGetDeviceProperties so you can determine if a device can support the new mapped memory API (as well as check if the GPU is an integrated graphics processor):
The following code block utilizes a pre-processor check to make certain that a valid version of CUDA is being used to compile the mapped code plus the function cudaGetDeviceProperties is called so a runtime check can be made to ensure that the CUDA device supports mapped memory:
#if CUDART_VERSION < 2020
#error "This CUDART version does not support mapped memory!\n"
#endif
// Get properties and verify device 0 supports mapped memory
cudaGetDeviceProperties(&deviceProp, 0);
checkCUDAError("cudaGetDeviceProperties");
if(!deviceProp.canMapHostMemory) {
fprintf(stderr, "Device %d cannot map host memory!\n", 0);
exit(EXIT_FAILURE);
}
Host memory mapping is then enabled on the device:
// set the device flags for mapping host memory
cudaSetDeviceFlags(cudaDeviceMapHost);
checkCUDAError("cudaSetDeviceFlags");
A mapped array, a_m, is then allocated on the host. (Note: The memory is mapped at this point but there is no device pointer. Getting the device pointer occurs in the following step.)
// allocate host mapped arrays
cudaHostAlloc((void **)&a_m, size, cudaHostAllocMapped);
checkCUDAError("cudaHostAllocMapped");
Get the device pointer to the mapped memory:
// Get the device pointers to the mapped memory
cudaHostGetDevicePointer((void **)&a_d, (void *)a_m, 0);
checkCUDAError("cudaHostGetDevicePointer");
Data initialization occurs and the kernel is executed on the GPU. Unlike the original incrementArrays.cu example, no explicit programmer initiated data movement occurs with a cudaMemcpy. Note that the data movement and kernel execution occurs asynchronously to the host operations. As a result, the host creation and calculation of the the validation array, check_h, occurs while the GPU is simultaneously running the incrementArrayOnDevice kernel to update the host array a_m through the mapped device memory pointer a_d.
Synchronization occurs via the call to cudaThreadSynchronize after which the GPU results are validated against the host generated results.
Assuming the results from the host and GPU kernels agree, the program then cleans up after itself. The function cudaFreeHost is used to free up the mapped array on the host and pointer on the GPU.
Under Linux, the program can be compiled with the command-line:
nvcc –o incrementMappedArrayInPlace incrementMappedArrayInPlace.cu
The performance implications of performing in-place updates to mapped memory are not clear. To ensure the minimum number of PCIe operations occur, it seems prudent to stream data between separate arrays. In other words, use separate arrays where one is dedicated read operations and the other is dedicated to write operations.


