
CUDA: Unifying Host/Device Interactions with a Single C++ Macro


Eventually, CUDA will offer some form of high-performance cached mapped memory. Until then, the SHADOW_MACRO() code is useful because it encapsulates device-side data allocation and data movement between the host and device. Regardless of how the data moves between the devices, the programmer must ensure that every C++ class provides a data layout that is usable on both the host and the device. This implies that C++ type traits, and the compiler facilities for checking them, will remain important for the foreseeable future.
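As a concrete illustration (a minimal sketch, not part of the article's SHADOW_MACRO() code), the standard-layout requirement can be checked at compile time. The type_traits version below assumes a C++11-capable compiler; the test code later in this article uses the __is_standard_layout() compiler intrinsic to the same end.

#include <type_traits>
#include "ParallelCounter.hpp"

// Sketch: refuse to compile if the shadowed class does not have a data
// layout that both the host and the device can interpret identically.
static_assert(std::is_standard_layout< ParallelCounter<32> >::value,
              "ParallelCounter must be standard-layout to be shared "
              "between the host and the device");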

A Test Code

The following code demonstrates how to use ParallelCounter on both the host and GPU for several common scenarios:

  • Allocate an object on the stack on the host, use it on the device, and get the results on the host.
  • Dynamically allocate the object with new on the host and use it on the device.
  • Dynamically allocate an array of objects on the host and use one of them on the device.
  • Create an STL vector of objects and use one on the device.
  • Dynamically allocate an array of objects on the host and initialize them in parallel on the host prior to using one on the device.
  • Dynamically allocate an array of objects on the device and use one in a calculation on the device.

Listing Four: Complete source code for secondCounter.cu.

//Rob Farber
#include <iostream>
#include <vector>
using namespace std;
#include <cstdio>
#include <cassert>
#include "ParallelCounter.hpp"

#ifndef SPREAD
#define SPREAD 32
#endif

//*****************
inline __device__ void doTest(ParallelCounter<SPREAD> *ptCntr, uint32_t nSamples) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  while(tid < nSamples) {
    *ptCntr += 1;
    tid += blockDim.x * gridDim.x;
  }
}
__global__ void doTest1(ParallelCounter<SPREAD> *pt,uint32_t n) {doTest(pt,n); }
__global__ void doTest2(ParallelCounter<SPREAD> *pt,uint32_t n) {doTest(pt,n); }
__global__ void doTest3(ParallelCounter<SPREAD> *pt,uint32_t n) {doTest(pt,n); }
__global__ void doTest4(ParallelCounter<SPREAD> *pt,uint32_t n) {doTest(pt,n); }
__global__ void doTest5(ParallelCounter<SPREAD> *pt,uint32_t n) {doTest(pt,n); }

__device__ ParallelCounter<SPREAD> *myAra;
__global__ void allocDeviceTest(uint32_t n) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  if(tid==0) {
    myAra = new ParallelCounter<SPREAD>[n];
  }
}
__global__ void initDeviceTest(uint32_t n) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  while(tid < n) {
    myAra[tid].set(0);
    tid += blockDim.x * gridDim.x;
  }
}
__global__ void fcnDeviceTest(int index, uint32_t nSamples) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  while(tid < nSamples) {
    myAra[index] += 1;
    tid += blockDim.x * gridDim.x;
  }
}
__global__ void finiDeviceTest(int index, uint32_t *result) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  if(tid==0) {
    *result = myAra[index].getCount();
    if(myAra) delete [] myAra;
  }
}

//*****************
// Start of main()
int main(int argc, char *argv[])
{
  uint32_t nSamples;
  int device=0;
  
  try {
    if(argc < 3) {
      cerr << "Use: nSamples device" << endl;
      return -1;
    }
    
    nSamples=atoi(argv[1]);
    device=atoi(argv[2]);

    cudaSetDevice(device);
    if(cudaPeekAtLastError() != cudaSuccess) throw "failed to set device";

    int nThreadsPerBlock=256;
    int nBlocks = nSamples/nThreadsPerBlock + ((nSamples % nThreadsPerBlock)?1:0);
    if(nBlocks > 65535) nBlocks=65535;
    
    cout << "device " << device << " nSamples " << nSamples
	 << " spread " << SPREAD << " nBlocks " << nBlocks
	 << " threadsPerBlock " << nThreadsPerBlock << endl;

    cout << "ParallelCounter<SPREAD> is_standard_layout: " 
	 << __is_standard_layout(ParallelCounter<SPREAD>) << endl;
    assert(__is_standard_layout(ParallelCounter<SPREAD>));
    cout << "ParallelCounter is in standard layout" << endl;
    
    
    { // test 1: stack allocate an object
      ParallelCounter<SPREAD> foo;
      foo.set(0);
      doTest1<<<nBlocks, nThreadsPerBlock>>>(foo.d_ptr(),nSamples);
      uint32_t check = foo.getCount(); // read result on host
      assert(check == nSamples);
      cerr << "Passed test 1" << endl;
    }

    { // test 2: dynamically allocate an object
      ParallelCounter<SPREAD> *foo = new ParallelCounter<SPREAD>;
      foo->set(0);
      doTest2<<<nBlocks, nThreadsPerBlock>>>(foo->d_ptr(),nSamples);
      uint32_t check = foo->getCount(); // read result on host
      assert(check == nSamples);
      delete foo;
      cerr << "Passed test 2" << endl;
    }
    
    { // test 3: dynamically allocate an array of objects but use only one
      int n=1000000;
      int index=10;
      ParallelCounter<SPREAD> *foo = new ParallelCounter<SPREAD>[n];
      
      for(int i=0; i < n; i++) foo[i].set(0);
      doTest3<<<nBlocks, nThreadsPerBlock>>>(foo[index].d_ptr(),nSamples);
      uint32_t check = foo[index].getCount(); // read result on host
      assert(check == nSamples);
      delete [] foo;
      cerr << "Passed test 3" << endl;
    }

    { // test 4: create an STL vector of objects but use only one
      int n=1000000;
      int index=10;
      vector< ParallelCounter<SPREAD> > v;
      v.resize(n);
      
      v[index].set(0);
      doTest4<<<nBlocks, nThreadsPerBlock>>>(v[index].d_ptr(),nSamples);
      uint32_t check = v[index].getCount(); // read result on host
      assert(check == nSamples);
      cerr << "Passed test 4" << endl;
    }

    { // test 5: dynamically allocate an array of objects. Makes use
      // of set_d_ptr and the contiguous layout of an array of
      // shadowed objects.
      int n=1000000;
      int index=10;
      ParallelCounter<SPREAD> *foo = new ParallelCounter<SPREAD>[n];
      ParallelCounter<SPREAD> *d_foo;
      cudaMalloc(&d_foo, n*sizeof(ParallelCounter<SPREAD>));
      
      // Note: the ParallelCounters array must be contiguous to work!
#pragma omp parallel for
      for(int i=0; i < n; i++) {
	      foo[i].set(0);
	      foo[i].set_d_ptr(d_foo + i); 
      }
      
      //try index and adjacent indices to see if corruption occurs
      doTest5<<<nBlocks, nThreadsPerBlock>>>(foo[index].d_ptr(),nSamples);
      uint32_t check = foo[index].getCount(); // read result on host
      assert(check == nSamples);

      assert( (index-1) >=0);
      doTest5<<<nBlocks, nThreadsPerBlock>>>(foo[index-1].d_ptr(),nSamples);
      check = foo[index-1].getCount(); // read result on host
      assert(check == nSamples);

      assert( (index+1) < n);
      doTest5<<<nBlocks, nThreadsPerBlock>>>(foo[index+1].d_ptr(),nSamples);
      check = foo[index+1].getCount(); // read result on host
      assert(check == nSamples);

      delete [] foo;
      cudaFree(d_foo);
      cerr << "Passed test 5" << endl;
    }

    { // test 6: allocate and use on GPU
      int n=10000; //subject to internal GPU allocation limits
      int index=10;
      uint32_t *result;
      cudaMalloc(&result, sizeof(uint32_t));
      if(cudaPeekAtLastError() != cudaSuccess) throw "cudaMalloc failed";

      allocDeviceTest<<<1,1>>>(n);
      initDeviceTest<<<n/256+1,256>>>(n);
      fcnDeviceTest<<<nBlocks,nThreadsPerBlock>>>(index, nSamples);
      finiDeviceTest<<<1,1>>>(index, result);

      uint32_t check = 10;
      cudaMemcpy(&check, result, sizeof(uint32_t), cudaMemcpyDefault);
      if(cudaPeekAtLastError() != cudaSuccess) throw "memcpy failed";
      assert(check == nSamples);

      cudaFree(result);
      cerr << "Passed test 6" << endl;
    }
    
    cout << "***** Passed all sanity checks! *****" << endl;
    cudaDeviceReset();
  } catch (const char * e) {
    cerr << "caught exception: " << e << endl;
    cudaError err;
    if((err=cudaGetLastError()) != cudaSuccess)
      cerr << "CUDA reports: " << cudaGetErrorString(err) << endl;
    return -1;
  }
  cudaDeviceReset();
  return 0;
}

Walking through the code starting at main() shows that it uses C++ exceptions to catch errors. GPU kernels and CUDA library functions do not currently throw exceptions on errors, which is why this example uses cudaPeekAtLastError() to decide whether an error needs to be thrown. Because cudaPeekAtLastError() does not clear the error state, cudaGetLastError() can later retrieve the error for printing with cudaGetErrorString().
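That inline check-and-throw pattern appears throughout main(); a minimal helper capturing it might look like the sketch below (the listing above simply writes the checks inline):

// Sketch of the check-and-throw idiom used inline in main(): peek at the
// sticky CUDA error state and throw a C-string that the catch block in
// main() prints alongside cudaGetErrorString().
inline void throwOnCudaError(const char *msg) {
  if (cudaPeekAtLastError() != cudaSuccess) throw msg;
}

// Example use after a runtime call or kernel launch:
//   cudaSetDevice(device);
//   throwOnCudaError("failed to set device");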

The application requires two command-line arguments:

  • The number of times the counter will be incremented. For consistency with the histogram example, this number is referred to as nSamples.
  • The number of the CUDA device to use, which makes it easy to compare Fermi and Kepler GPU performance in mixed GPU systems.

For convenience, the application prints out information about the runtime configuration. In particular, note that the preprocessor symbol SPREAD can be defined at compile time to test the impact of distributing the atomic operations across various sizes of the internal ParallelCounter count vector. See the previous article for a detailed discussion of the ParallelCounter class.
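For example, a plausible build and run sequence (the source file name comes from Listing Four; the executable name and architecture flag are assumptions) looks something like:

nvcc -O3 -arch=sm_20 -Xcompiler -fopenmp -DSPREAD=64 secondCounter.cu -o secondCounter
./secondCounter 100000000 0

The -arch=sm_20 flag is needed because test 6 uses device-side new, and -Xcompiler -fopenmp enables the OpenMP pragma used in test 5.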

Instantiate an Object on the Host Stack, Use it on the Device, and Get the Results on the Host.

A very common scenario is to instantiate one or more objects on the stack, use them, and destroy them once they go out of scope.

In the following test, the object foo is instantiated on the stack and set to zero on the host. Aside from the CUDA kernel call, this example looks like standard C++ running on the host.

Listing Five: The test 1 code snippet.

 
    { // test 1: stack allocate an object
      ParallelCounter<SPREAD> foo;
      foo.set(0);
      doTest1<<<nBlocks, nThreadsPerBlock>>>(foo.d_ptr(),nSamples);
      uint32_t check = foo.getCount(); // read result on host
      assert(check == nSamples);
      cerr << "Passed test 1" << endl;
    }

The interface to the GPU starts with the call to the d_ptr() method, which passes a pointer to a device-side copy of foo to the doTest1() kernel. This method initiates a set of actions: it allocates space for a ParallelCounter object on the device, uses cudaMemcpy() to transfer the contents of the host object to the device, and returns the pointer to the device-side object. The host then queues the doTest1() kernel to run on the GPU.
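The exact implementation was presented in the previous article; in rough outline (a simplified sketch, not the actual SHADOW_MACRO() expansion, with d_self standing in for whatever member name the macro actually uses), d_ptr() behaves like this:

// Simplified sketch of the behavior described above: allocate device-side
// space on first use, copy the host object to the device, and hand back
// the device pointer for use as a kernel argument.
ParallelCounter<SPREAD>* d_ptr() {
  if (!d_self) cudaMalloc(&d_self, sizeof(*this));       // device allocation
  cudaMemcpy(d_self, this, sizeof(*this),
             cudaMemcpyHostToDevice);                    // host -> device
  return d_self;                                         // pass to kernels
}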

CUDA kernel calls are asynchronous, which means the host immediately proceeds to call the getCount() method. The first thing getCount() does is call cpyDtoH(), which ultimately calls cudaMemcpy(). CUDA programmers know that cudaMemcpy() implicitly synchronizes: it waits until all previously queued kernels have completed before performing the copy. Once the data is on the host, the getCount() method can finish and return the overall sum of how many times foo was incremented in parallel on the GPU. An assert() on the host then checks that this count is correct.
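In the same spirit (again a sketch, with cpyDtoH() reduced to its essential call and count[] standing in for the internal count vector), the read-back path looks roughly like:

// Sketch of the device-to-host path described above. cudaMemcpy() waits
// for previously queued kernels to finish, so the counts read back here
// are complete before getCount() sums them.
void cpyDtoH() {
  cudaMemcpy(this, d_self, sizeof(*this),
             cudaMemcpyDeviceToHost);                    // device -> host
}

uint32_t getCount() {
  cpyDtoH();                         // refresh the host copy from the device
  uint32_t sum = 0;
  for (int i = 0; i < SPREAD; i++)   // sum the distributed counter slots
    sum += count[i];
  return sum;
}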

Dynamically Allocate the Object with new on the Host and Use it on the Device.

A variation of the previous scenario is to dynamically allocate the object on the host with the new operator. Aside from the use of a pointer to foo, Listing Six (test 2) is identical to the first test.

Listing Six: The test 2 code snippet.

    { // test 2: dynamically allocate an object
      ParallelCounter<SPREAD> *foo = new ParallelCounter<SPREAD>;
      foo->set(0);
      doTest2<<<nBlocks, nThreadsPerBlock>>>(foo->d_ptr(),nSamples);
      uint32_t check = foo->getCount(); // read result on host
      assert(check == nSamples);
      delete foo;
      cerr << "Passed test 2" << endl;
    }

Dynamically Allocate an Array of Objects on the Host and Use One of Them on the Device.

In Listing Seven (test 3), an array of one million ParallelCounter objects is allocated on the host. Only the object located at index is passed to the GPU.

Listing Seven: The test 3 code snippet.

    { // test 3: dynamically allocate an array of objects but use only one
      int n=1000000;
      int index=10;
      ParallelCounter<SPREAD> *foo = new ParallelCounter<SPREAD>[n];
      
      for(int i=0; i < n; i++) foo[i].set(0);
      doTest3<<<nBlocks, nThreadsPerBlock>>>(foo[index].d_ptr(),nSamples);
      uint32_t check = foo[index].getCount(); // read result on host
      assert(check == nSamples);
      delete [] foo;
      cerr << "Passed test 3" << endl;
    }

Create an STL Vector of Objects to Use on the Device.

CUDA C++ does not support use of the STL (Standard Template Library) on the device. However, it is important that host-side STL containers be able to hold GPU-accelerated C++ objects. Listing Eight (test 4) uses a host-side STL vector rather than a C-style array to demonstrate that such containers work with objects that gain their convenience and GPU acceleration from SHADOW_MACRO().

Listing Eight: The test 4 code snippet.

    { // test 4: create an STL vector of objects but use only one
      int n=1000000;
      int index=10;
      vector< ParallelCounter<SPREAD> > v;
      v.resize(n);
      
      v[index].set(0);
      doTest4<<<nBlocks, nThreadsPerBlock>>>(v[index].d_ptr(),nSamples);
      uint32_t check = v[index].getCount(); // read result on host
      assert(check == nSamples);
      cerr << "Passed test 4" << endl;
    }

