Channels ▼
RSS

Parallel

Hot-Rodding Windows and Linux App Performance with CUDA-Based Plugins


The following source file, reduction.cc, demonstrates how to use init() and fini() to calculate the sum of either a float or double protobuf vector message on the host.

//reduction.cc (Rob Farber)
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include "tutorial.pb.h"
using namespace std;
#ifdef _WIN32
// tell the linker about needed libraries
#pragma comment(lib,"libprotobuf")
#endif

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* init(const char* progname, const char* sourcename, 
	 uint32_t *size, uint32_t *type) {
 return(NULL); 
}

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* func(const char* progname, const char* sourcename, 
	 uint32_t *size, uint32_t *type, char *blob)
{
  switch(*type) {
  case tutorial::PB_VEC_FLOAT: {
    tutorial::FloatVector vec;
    if(!vec.ParseFromArray(blob,*size)) {
       cerr << progname << "," << sourcename << "Illegal packet" << endl;
    } else {
       if(vec.has_name() == true) cerr << "vec_float " << vec.name() << " ";
       float sum=0.f;
       for(int i=0; i < vec.values_size(); i++) sum += vec.values(i);
       cerr << "sum of vector " << sum << endl;
       cerr << "\tlast value in vector is " << vec.values(vec.values_size()-1)
	     << endl;
       cerr << "\tvector size is " << vec.values_size() << endl;
    }
  } 
  break;
  case tutorial::PB_VEC_DOUBLE: {
    tutorial::DoubleVector vec;
    if(!vec.ParseFromArray(blob,*size)) {
      cerr << progname << "," << sourcename << "Illegal packet" << endl;
    } else {
      if(vec.has_name() == true) cerr << "vec_double " << vec.name() << " ";
      double sum=0.;
      for(int i=0; i < vec.values_size(); i++) sum += vec.values(i);
      cerr << "sum of vector " << sum << endl;
      cerr << "\tlast value in vector is " << vec.values(vec.values_size()-1)
	 << endl;
      cerr << "\tvector size is " << vec.values_size() << endl;
    }
  } 
  break;
  default:
    cerr << "Unknown packet type" << endl;
  }
  return(NULL);
}

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* fini(const char* progname, const char* sourcename, 
	 uint32_t *size, uint32_t *type) {
  return(NULL); 
}

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
void dynFree(char* pt) {
  if(pt) delete [] pt;
}

The next plugin, cutest2.cu, uses Thrust to multiply each element in the vector by a factor of two on a GPU. The resulting vector is then moved off the GPU and passed on to the next application in the pipeline. Note that the number of copies was not optimized in this example.

//cutest2.cu (Rob Farber)
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include "tutorial.pb.h"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
using namespace std;

#ifdef _WIN32
// tell the linker about needed libraries
#pragma comment(lib,"libprotobuf")
#endif

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* init(const char* progname, const char* sourcename, 
	 uint32_t *size, uint32_t *type) {
  return(NULL); 
}

struct double_functor 
{
  template<class T>
  __host__ __device__
  void operator() (T &t) const
  {
  t *= 2.;
  }
};

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* func(const char* progname, const char* sourcename, 
	 uint32_t *size, uint32_t *type, char *blob)
{
  char *pt_newblob = blob;
  switch(*type) {
    case tutorial::PB_VEC_FLOAT: {
      tutorial::FloatVector vec;
      if(!vec.ParseFromArray(blob,*size)) {
        cerr << progname << "," << sourcename << "Illegal packet" << endl;
      }
    // create thrust vectors for GPU computing
    thrust::host_vector<float> h_vec(vec.values_size());
    for(int i=0; i < vec.values_size(); i++) h_vec[i] = vec.values(i);
    thrust::device_vector<float> d_vec=h_vec;
    // double the values of the vector
    thrust::for_each(d_vec.begin(), d_vec.end(), double_functor() );
    h_vec = d_vec;
    for(int i=0; i < vec.values_size(); i++) vec.set_values(i, h_vec[i]);
    // repackage protobuf and specify new size (type is unchanged).
       *size = vec.ByteSize();

    // return the pointer to the serialized modified protobuf
    pt_newblob = new char[*size];
    vec.SerializeToArray(pt_newblob, *size);
    break;
    }
  default:
    break;
  }
  return(pt_newblob);
}

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
char* fini(const char* progname, const char* sourcename,
	 uint32_t *size, uint32_t *type) {
  return(NULL); 
}

extern "C" 
#ifdef _WIN32
__declspec(dllexport)
#endif
void dynFree(char* pt) {
 if(pt) delete [] pt;
}

The following commands are used to build and run some tests under Linux:

nvcc -Xcompiler "-rdynamic" -o dynFunc dynFunc.cc tutorial.pb.cc -l protobuf -ldl
echo "-------------STATIC LINK TEST -------------------------------"
nvcc prog.cc reduction.cc tutorial.pb.cc -l protobuf -o prog
../protobuf_examples/testWrite | ./prog

echo "--------------- showing a dynamic reduction ----------------------------"
../protobuf_examples/testWrite | ./dynFunc reduction.cc

echo "---------------- Pass through demo -------------------------------"
../protobuf_examples/testWrite | ./dynFunc passthrough.cc \
 | ./dynFunc reduction.cc

echo "------------- the float vector contains values*4 -----------------------"
# increase the float vector by a factor of four
../protobuf_examples/testWrite \
 	| ./dynFunc cutest2.cu | ./dynFunc cutest2.cu | ./dynFunc reduction.cc

These commands generated the following output under Linux. Note the cutest2.cu plugin is called twice in one of the tests, which increased the value of the floating-point vector by a factor of four as shown below:

$:~/DDJ023/combined_protobuf_dynamic$ sh BUILD.linux 
-------------STATIC LINK TEST -------------------------------
vec_float A sum of vector 4950
	last value in vector is 99
	vector size is 100
vec_double B sum of vector 19900
	last value in vector is 199
	vector size is 200
--------------- showing a dynamic reduction ----------------------------
Compiling with "nvcc -Xcompiler "-fPIC -shared" reduction.cc -o reduction.so "
vec_float A sum of vector 4950
	last value in vector is 99
	vector size is 100
vec_double B sum of vector 19900
	last value in vector is 199
	vector size is 200
---------------- Pass through demo -------------------------------
Compiling with "Compiling with "nvcc -Xcompiler "-fPIC -shared" reduction.cc -o reduction.so nvcc -Xcompiler "-fPIC -shared" passthrough.cc -o passthrough.so ""

vec_float A sum of vector 4950
	last value in vector is 99
	vector size is 100
vec_double B sum of vector 19900
	last value in vector is 199
	vector size is 200
------------- the float vector contains values*4 -----------------------
Compiling with "Compiling with "nvcc -Xcompiler "-fPIC -shared" cutest2.cu -o cutest2.so nvcc -Xcompiler "-fPIC -shared" cutest2.cu -o cutest2.so ""

Compiling with "nvcc -Xcompiler "-fPIC -shared" reduction.cc -o reduction.so "
ptxas /tmp/tmpxft_00000e66_00000000-2_cutest2.ptx, line 111; warning : Double is not supported. Demoting to float
ptxas /tmp/tmpxft_00000e68_00000000-2_cutest2.ptx, line 111; warning : Double is not supported. Demoting to float
vec_float A sum of vector 19800
	last value in vector is 396
	vector size is 100
vec_double B sum of vector 19900
	last value in vector is 199
	vector size is 200

Windows users running under Cygwin can use the following commands to build and run the examples:

PROTO_BASE=../protobuf_examples/protobuf-2.4.1/vsprojects
PB_INC=$PROTO_BASE/include
PB_LIB=$PROTO_BASE/Release

nvcc -Xcompiler "/MD /EHsc" -I $PB_INC -L $PB_LIB dynFunc.cc tutorial.pb.cc -llibprotobuf Ws2_32.lib -o dynFunc.exe

nvcc -Xcompiler "/MD /EHsc" -I $PB_INC -L $PB_LIB prog.cc tutorial.pb.cc reduction.cc -llibprotobuf Ws2_32.lib -o prog.exe

echo "------ BUILDING dlls for tests -----------------------"
nvcc -Xcompiler "/MD" -I $PB_INC -L $PB_LIB reduction.cc tutorial.pb.cc -llibprotobuf -Xlinker "/DLL /OUT:reduction.dll"
nvcc -Xcompiler "/MD" -I $PB_INC -L $PB_LIB passthrough.cc tutorial.pb.cc -llibprotobuf -Xlinker "/DLL /OUT:passthrough.dll"
nvcc -Xcompiler "/MD" -I $PB_INC -L $PB_LIB cutest2.cu tutorial.pb.cc -llibprotobuf -Xlinker "/DLL /OUT:cutest2.dll"


echo "-------------STATIC LINK TEST -------------------------------"
../protobuf_examples/testWrite.exe | ./prog.exe

echo "--------------- showing a dynamic reduction ----------------------------"
../protobuf_examples/testWrite.exe | ./dynFunc.exe reduction.cc


echo "---------------- Pass through demo -------------------------------"
../protobuf_examples/testWrite | ./dynFunc passthrough.cc \
 | ./dynFunc reduction.cc

echo "------------- the float vector contains values*4 -----------------------"
# increase the float vector by a factor of four
../protobuf_examples/testWrite \
 	| ./dynFunc cutest2.cu | ./dynFunc cutest2.cu | ./dynFunc reduction.cc

Summary

With the ability to create CUDA plugins, application programmers have the ability to write and support generic applications that will deliver accelerated performance when a GPU is present and CPU-based performance when a GPU is not available. These plugin architectures are well-understood and a convenient way to leverage existing applications and code bases. They also help preserve existing software investments.

The ability to dynamically compile CUDA source code and link it into a running application — just like OpenCL — opens a host of opportunities for optimizing code generators and transparently running a single CUDA application on multiple device back ends. With this technique, CUDA applications can deliver optimized code for specific problems and achieve very high performance far beyond what a single generic code can deliver. Just like OpenCL, such CUDA applications will also transparently benefit from any compiler improvements.

The workflow example in this article provides another way to use CUDA to exploit hybrid CPU/GPU computation. Incorporating CUDA into the flexibility, robustness, scalability, and performance of "click-together" pipeline workflows gives programmers the ability to capitalize on GPU acceleration and CPU capabilities in their production workflows.


Rob Farber is a visiting HPC expert at Irish Center for High-End Computing (ICHEC), supported by Science Foundation Ireland.


Related Reading


More Insights






Currently we allow the following HTML tags in comments:

Single tags

These tags can be used alone and don't need an ending tag.

<br> Defines a single line break

<hr> Defines a horizontal line

Matching tags

These require an ending tag - e.g. <i>italic text</i>

<a> Defines an anchor

<b> Defines bold text

<big> Defines big text

<blockquote> Defines a long quotation

<caption> Defines a table caption

<cite> Defines a citation

<code> Defines computer code text

<em> Defines emphasized text

<fieldset> Defines a border around elements in a form

<h1> This is heading 1

<h2> This is heading 2

<h3> This is heading 3

<h4> This is heading 4

<h5> This is heading 5

<h6> This is heading 6

<i> Defines italic text

<p> Defines a paragraph

<pre> Defines preformatted text

<q> Defines a short quotation

<samp> Defines sample computer code text

<small> Defines small text

<span> Defines a section in a document

<s> Defines strikethrough text

<strike> Defines strikethrough text

<strong> Defines strong text

<sub> Defines subscripted text

<sup> Defines superscripted text

<u> Defines underlined text

Dr. Dobb's encourages readers to engage in spirited, healthy debate, including taking us to task. However, Dr. Dobb's moderates all comments posted to our site, and reserves the right to modify or remove any content that it determines to be derogatory, offensive, inflammatory, vulgar, irrelevant/off-topic, racist or obvious marketing or spam. Dr. Dobb's further reserves the right to disable the profile of any commenter participating in said activities.

 
Disqus Tips To upload an avatar photo, first complete your Disqus profile. | View the list of supported HTML tags you can use to style comments. | Please read our commenting policy.
 

Video