Channels ▼
RSS

Design

A Robust Histogram for Massive Parallelism


A Dynamic Parallelism Histogram Example

It is fairly straightforward to create a histogram example program using shadowed classes. The source code in Listing Two, histogram.cu, demonstrates one such application. To make it interesting, dynamic parallelism was used to demonstrate how the ability to spawn device-side kernels cleans up the code and how useful it is to have the ability to specify different execution configurations in kernel calls on the device.

The example code increments only bin one when the preprocessor variable PATHOLOGICAL is defined. This provides a worst-case histogram data distribution, where all the data fits into a single bin.

Listing Two: Source code for histogram.cu.

#include <iostream>
#include <vector>
using namespace std;
#include <cstdio>
#include <cassert>
#include "ParallelCounter.hpp"

#ifndef SPREAD
#define SPREAD 32
#endif

//****************************
// Copy each ParallelCounter's total into the plain uint32_t output histogram.
// One thread per bin; launch with at least nBins threads.
__global__ void gatherHistogram(ParallelCounter<SPREAD> *counterAra, int nBins,  uint32_t *histo) 
{
  const uint32_t bin = blockDim.x * blockIdx.x + threadIdx.x;

  if(bin >= (uint32_t)nBins) return; // surplus threads in the last block do nothing
  histo[bin] = counterAra[bin].getCount();
}

// Tally nSamples increments into the nBins ParallelCounter objects using a
// grid-stride loop, so any grid size covers all samples.
// When PATHOLOGICAL is defined every sample lands in bin 1 (worst-case
// contention); otherwise samples spread uniformly across bins via tid % nBins.
// 64-bit indexing is used because the original uint32_t `tid += stride` can
// wrap around when nSamples approaches UINT32_MAX, silently re-counting
// samples.
__global__ void sampleHistogram(ParallelCounter<SPREAD> *counterAra, int nBins,  uint32_t nSamples) {
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for(size_t tid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
      tid < nSamples; tid += stride) {
#ifdef PATHOLOGICAL
    counterAra[1] += 1;
#else
    counterAra[tid % nBins] += 1;
#endif
  }
}

// Zero every ParallelCounter before sampling begins.  One thread per bin;
// launch with at least nBins threads.
__global__ void initCounters(ParallelCounter<SPREAD> *counterAra, int nBins) 
{
  const uint32_t bin = blockDim.x * blockIdx.x + threadIdx.x;

  if(bin >= (uint32_t)nBins) return; // guard the partial last block
  counterAra[bin].set(0);
}

// Parent kernel (launched <<<1,1>>> from the host): allocates the counters on
// the device heap, runs the init/sample/gather child kernels via dynamic
// parallelism, and frees the counters.  histo receives the per-bin totals.
// Requires -rdc=true and sm_35 or later.
__global__ void doHistoTest(uint32_t *histo, int nBins, uint32_t nSamples)
{
  // C++ new works in the kernel, but it draws from the limited device heap
  // and returns NULL (it does not throw) when the heap is exhausted.
  ParallelCounter<SPREAD> *counterAra = new ParallelCounter<SPREAD>[nBins];
  if(!counterAra) {
    printf("doHistoTest: device-heap allocation of %d counters failed\n", nBins);
    return;
  }

  initCounters<<<nBins/256+1, 256>>>(counterAra, nBins);
  cudaDeviceSynchronize();

  // Note we can increase the size of the grid in the kernel!  Cap at the
  // 65535 1D grid limit; sampleHistogram strides to cover the remainder.
  int nBlocks = (nSamples/256 + 1);
  nBlocks = (nBlocks > 65535)?65535:nBlocks;

  sampleHistogram<<<nBlocks,256>>>(counterAra, nBins, nSamples);
  cudaDeviceSynchronize();

  gatherHistogram<<<nBins/256+1, 256>>>(counterAra, nBins, histo);
  cudaDeviceSynchronize();

  delete [] counterAra;
}

// Host driver: parse nSamples/nBins/device, launch the single-thread parent
// kernel that runs the whole histogram test via dynamic parallelism, then
// verify that the per-bin totals sum to nSamples.
int main(int argc, char *argv[])
{
  uint32_t nSamples, nBins;
  int device=0;
  
  try {
    // Three positional arguments (argv[1..3]) are required, so argc must be
    // at least 4.  The previous test (argc < 3) let a two-argument run read
    // argv[3] past the end of the argument list.
    if(argc < 4) {
      cerr << "Use: nSamples nBins device" << endl;
      return -1;
    }
    
    nSamples=atoi(argv[1]);
    nBins=atoi(argv[2]);
    device=atoi(argv[3]);

    cudaSetDevice(device);
    if(cudaPeekAtLastError() != cudaSuccess) throw "failed to set device";

    uint32_t *histo; // alloc histo in pinned (page-locked) host memory
    cudaMallocHost(&histo, nBins * sizeof(uint32_t) );
    if(cudaPeekAtLastError() != cudaSuccess) throw "cudaMallocHost failed";
    
    // Single parent thread: all real parallelism comes from the child
    // kernels launched inside doHistoTest.
    doHistoTest<<<1,1>>>(histo, nBins, nSamples);
    
    cudaDeviceSynchronize();
    if(cudaPeekAtLastError() != cudaSuccess) throw "doHistoTest failed";

    // check results: every sample must land in exactly one bin
    uint32_t sum=0;
    for(uint32_t i=0; i < nBins; i++)
      sum += histo[i];

    cout << "sum " << sum << " " << nSamples << endl;
    assert(sum == nSamples);

    cudaFreeHost(histo);
    
  } catch (const char * e) {
    cerr << "caught exception: " << e << endl;
    cudaError err;
    if((err=cudaGetLastError()) != cudaSuccess)
      cerr << "CUDA reports: " << cudaGetErrorString(err) << endl;
    cudaDeviceReset();
    return -1;
  }

  cudaDeviceReset();
  return 0;
}

A Simple AtomicAdd() Integer Example

Listing Three modifies histogram.cu to use simple integer counters instead of a ParallelCounter object.

Listing Three: Source code for simpleHisto.cu.

#include <iostream>
#include <vector>
using namespace std;
#include <cstdio>
#include <cassert>
#include "ParallelCounter.hpp"

#ifndef SPREAD
#define SPREAD 32
#endif

//*****************************
// Copy the raw uint32_t counters into the output histogram.
// One thread per bin; launch with at least nBins threads.
__global__ void gatherHistogram(uint32_t *counterAra, int nBins,  uint32_t *histo) 
{
  const uint32_t bin = blockDim.x * blockIdx.x + threadIdx.x;

  if(bin >= (uint32_t)nBins) return; // surplus threads in the last block do nothing
  histo[bin] = counterAra[bin];
}

// Tally nSamples increments into nBins plain uint32_t counters with atomicAdd,
// using a grid-stride loop so any grid size covers all samples.
// When PATHOLOGICAL is defined every sample hammers bin 0 (worst-case
// contention); otherwise samples spread uniformly via tid % nBins.
// 64-bit indexing is used because the original uint32_t `tid += stride` can
// wrap around when nSamples approaches UINT32_MAX, silently re-counting
// samples.
__global__ void sampleHistogram(uint32_t *counterAra, int nBins,  uint32_t nSamples) {
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for(size_t tid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
      tid < nSamples; tid += stride) {
#ifdef PATHOLOGICAL
    atomicAdd(counterAra, 1); 
#else
    atomicAdd(counterAra + (tid % nBins), 1); 
#endif
  }
}

// Zero every counter before sampling begins.  One thread per bin; launch
// with at least nBins threads.
__global__ void initCounters(uint32_t *counterAra, int nBins) 
{
  const uint32_t bin = blockDim.x * blockIdx.x + threadIdx.x;

  if(bin >= (uint32_t)nBins) return; // guard the partial last block
  counterAra[bin] = 0;
}

// Parent kernel (launched <<<1,1>>> from the host): allocates the uint32_t
// counters on the device heap, runs the init/sample/gather child kernels via
// dynamic parallelism, and frees the counters.  histo receives the totals.
// Child kernels launched by the same thread into the NULL stream execute in
// launch order, so only the final sync -- before the delete -- is required.
__global__ void doHistoTest(uint32_t *histo, int nBins, uint32_t nSamples)
{
  // C++ new works in the kernel, but it draws from the limited device heap
  // and returns NULL (it does not throw) when the heap is exhausted.
  uint32_t *counterAra = new uint32_t[nBins];
  if(!counterAra) {
    printf("doHistoTest: device-heap allocation of %d counters failed\n", nBins);
    return;
  }

  initCounters<<<nBins/256+1, 256>>>(counterAra, nBins);

  // Note we can increase the size of the grid in the kernel!
  {
    int nBlocks = (nSamples/256 + 1);
    // cap at the 65535 1D grid limit; sampleHistogram strides to cover the rest
    nBlocks = (nBlocks > 65535)?65535:nBlocks;
    
    sampleHistogram<<<nBlocks,256>>>(counterAra, nBins, nSamples);
  }
  gatherHistogram<<<nBins/256+1, 256>>>(counterAra, nBins, histo);
  cudaDeviceSynchronize(); // wait for all kernel launches to complete before deleting memory!

  delete [] counterAra;
}

// Host driver: parse nSamples/nBins/device, launch the single-thread parent
// kernel that runs the whole histogram test via dynamic parallelism, then
// verify that the per-bin totals sum to nSamples.
int main(int argc, char *argv[])
{
  uint32_t nSamples, nBins;
  int device=0;
  
  try {
    // Three positional arguments (argv[1..3]) are required, so argc must be
    // at least 4.  The previous test (argc < 3) let a two-argument run read
    // argv[3] past the end of the argument list.
    if(argc < 4) {
      cerr << "Use: nSamples nBins device" << endl;
      return -1;
    }
    
    nSamples=atoi(argv[1]);
    nBins=atoi(argv[2]);
    device=atoi(argv[3]);

    cudaSetDevice(device);
    if(cudaPeekAtLastError() != cudaSuccess) throw "failed to set device";

    uint32_t *histo; // alloc histo in pinned (page-locked) host memory
    cudaMallocHost(&histo, nBins * sizeof(uint32_t) );
    if(cudaPeekAtLastError() != cudaSuccess) throw "cudaMallocHost failed";
    
    // Single parent thread: all real parallelism comes from the child
    // kernels launched inside doHistoTest.
    doHistoTest<<<1,1>>>(histo, nBins, nSamples);
    
    cudaDeviceSynchronize();
    if(cudaPeekAtLastError() != cudaSuccess) throw "doHistoTest failed";

    // check results: every sample must land in exactly one bin
    uint32_t sum=0;
    for(uint32_t i=0; i < nBins; i++)
      sum += histo[i];

    cout << "sum " << sum << " " << nSamples << endl;
    assert(sum == nSamples);

    cudaFreeHost(histo);
    
  } catch (const char * e) {
    cerr << "caught exception: " << e << endl;
    cudaError err;
    if((err=cudaGetLastError()) != cudaSuccess)
      cerr << "CUDA reports: " << cudaGetErrorString(err) << endl;
    cudaDeviceReset();
    return -1;
  }

  cudaDeviceReset();
  return 0;
}

Building the Examples

The nvcc commands in Listing Four were used to build the examples. Note the arguments -rdc=true and -arch=sm_35 were used to correctly compile for dynamic parallelism on Kepler.

Listing Four: Compilation commands for the histogram tests.

nvcc -O3 -arch=sm_35 -rdc=true -DSPREAD=32 histogram.cu -o histogram.exe -lcudadevrt
nvcc -O3 -arch=sm_35 -rdc=true simpleHisto.cu -o simpleHisto.exe -lcudadevrt

nvcc -O3 -arch=sm_35 -rdc=true -DPATHOLOGICAL -DSPREAD=32 histogram.cu -o histogramPathological.exe -lcudadevrt
nvcc -O3 -arch=sm_35 -rdc=true -DPATHOLOGICAL simpleHisto.cu -o simpleHistoPathological.exe -lcudadevrt

Performance Comparison

Table 1 shows that there is basically no difference between the ParallelCounter runtimes, demonstrating that the data distribution caused no performance degradation. The simpleHisto.cu example is just as susceptible as the NVIDIA histogram SDK example to performance degradation due to a small number of bins or non-uniform data distributions. However, the comparison of the two pathological cases reveals a more than 10x difference in performance!

Executable nSamples nBins Runtime
histogram.exe

4 billion

16

538.15ms

histogramPathological.exe

4 billion

16

538.37ms

simpleHisto.exe

4 billion

16

711.87ms

simpleHistoPathological.exe

4 billion

16

5710 ms

Table 1: Observed performance results using nvprof on a K20c GPU.

Conclusion

The example code in this tutorial demonstrates that a vector of ParallelCounter objects can provide more than an order-of-magnitude greater performance than a vector of atomically incremented counters when many or all the threads need to increment the same bin in the histogram. Meanwhile, performance for uniformly distributed datasets is as fast or slightly faster. Based on these performance results, I recommend that CUDA programmers utilize some form of this low-wait parallel counter. (Multicore programmers can benefit from this low-wait approach as well.)

Astute readers will note that the histogram.cu example had to preallocate storage for a known number of results. The next article in this series will provide the ability to generate results in a massively parallel manner on the device where the number of results is not known beforehand.

The integration of the general-purpose SHADOW_MACRO() into the ParallelCounter class adds much of the transparency and simplicity of mapped memory without sacrificing speed. Host-side STL classes can work with SHADOW_MACRO()-enabled classes to leverage the power and convenience of the STL, and potentially deliver significant performance gains compared with the host. For these performance and convenience reasons, developers should consider incorporating SHADOW_MACRO() into their classes.


Rob Farber is a frequent contributor to Dr. Dobb's on CPU and GPGPU programming topics.

Related Articles

Atomic Operations and Low-Wait Algorithms in CUDA

Unifying Host/Device Interactions with a Single C++ Macro


Related Reading


More Insights






Currently we allow the following HTML tags in comments:

Single tags

These tags can be used alone and don't need an ending tag.

<br> Defines a single line break

<hr> Defines a horizontal line

Matching tags

These require an ending tag - e.g. <i>italic text</i>

<a> Defines an anchor

<b> Defines bold text

<big> Defines big text

<blockquote> Defines a long quotation

<caption> Defines a table caption

<cite> Defines a citation

<code> Defines computer code text

<em> Defines emphasized text

<fieldset> Defines a border around elements in a form

<h1> This is heading 1

<h2> This is heading 2

<h3> This is heading 3

<h4> This is heading 4

<h5> This is heading 5

<h6> This is heading 6

<i> Defines italic text

<p> Defines a paragraph

<pre> Defines preformatted text

<q> Defines a short quotation

<samp> Defines sample computer code text

<small> Defines small text

<span> Defines a section in a document

<s> Defines strikethrough text

<strike> Defines strikethrough text

<strong> Defines strong text

<sub> Defines subscripted text

<sup> Defines superscripted text

<u> Defines underlined text

Dr. Dobb's encourages readers to engage in spirited, healthy debate, including taking us to task. However, Dr. Dobb's moderates all comments posted to our site, and reserves the right to modify or remove any content that it determines to be derogatory, offensive, inflammatory, vulgar, irrelevant/off-topic, racist or obvious marketing or spam. Dr. Dobb's further reserves the right to disable the profile of any commenter participating in said activities.

 
Disqus Tips To upload an avatar photo, first complete your Disqus profile. | View the list of supported HTML tags you can use to style comments. | Please read our commenting policy.
 

Video