Channels ▼
RSS

Parallel

CUDA, Supercomputing for the Masses: Part 17


The following code snippet specifies a new class drawable2DshaderARB that is derived from the original drawable2DTexture class defined in mandelbrotPBO.cu. In other words, drawable2DshaderARB inherits the characteristics of parent class and is free to add new variables and methods as well as redefine existing methods so they can provide capabilities not previously available in the parent class.


class drawable2DshaderARB : public drawable2DTexture {
 private:
  uchar4 *h_Src;
  GLuint shader;

 public:
 ~drawable2DshaderARB() {
    glDeleteProgramsARB( 1, &shader );
  }
 drawable2DshaderARB(ulong2 winSize, gpuFunctor userGPUfunc) :
  drawable2DTexture(winSize, userGPUfunc) {
    
    deleteTexture(); // get rid of inherited texture
    deletePBO(); // get rid of inherited PBO

    h_Src = new uchar4[size.x*size.y];
    
    glEnable( GL_TEXTURE_2D );
    glGenTextures( 1, &tex );
    glBindTexture( GL_TEXTURE_2D, tex );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, size.x, size.y, 0, 
                  GL_RGBA, GL_UNSIGNED_BYTE, h_Src );
    
    
    // create and bind buffer
    glGenBuffers( 1, &pbo );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, pbo );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, 
                  sizeof( uchar4 ) * size.x * size.y, h_Src, GL_STREAM_COPY );
#ifdef USE_CUDA3
    cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
    cudaGLRegisterBufferObject( pbo );
#endif
    
    // create shader
#ifdef COLORFUL
    const char * const code = 
      "!!ARBfp1.0\n"
      "PARAM c[1] = { { 0.95, 2, 0.1 } };\n"
      "TEMP R0;\n"
      "TEMP R1;\n"
      "TEX R0.xyz, fragment.texcoord[0], texture[0], 2D;\n"
      "SIN R0.w, fragment.texcoord[0].x;\n"
      "ADD R1.x, R0, -c[0].z;\n"
      "ABS R0.w, R0;\n"
      "CMP result.color.y, -R1.x, R0.w, R0.x;\n"
      "SIN R0.w, fragment.texcoord[0].y;\n"
      "MAD R0.x, fragment.texcoord[0].y, c[0].y, fragment.texcoord[0];\n"
      "COS R0.x, R0.x;\n"
      "ABS R0.x, R0;\n"
      "ADD R1.x, R0.y, -c[0].z;\n"
      "ABS R0.w, R0;\n"
      "CMP result.color.z, -R1.x, R0.w, R0.y;\n"
      "MOV result.color.w, R0.z;\n"
      "MUL result.color.x, R0, c[0];\n"
      "END" 
      ;
#else
    const char * const code =
      "!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";
#endif
    
    glGenProgramsARB( 1, &shader );
    glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
    glProgramStringARB( GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
                        static_cast< GLsizei >( strlen( code ) ),
                        ( GLubyte * )code );
    
    GLint err;
    glGetIntegerv( GL_PROGRAM_ERROR_POSITION_ARB, &err );
    if ( err != -1 ) shader = NULL;
  }
  void display( void ) {
    glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
    glEnable( GL_FRAGMENT_PROGRAM_ARB );
    glDisable( GL_DEPTH_TEST );
    
    glBegin( GL_QUADS );
    glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
    glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
    glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
    glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
    glEnd();
    
    glBindTexture( GL_TEXTURE_2D, 0 );
    glDisable( GL_FRAGMENT_PROGRAM_ARB );
  }
};

For simplicity, the variable imageTexture is defined to be of type drawable2DshaderARB as shown in the code snippet below:


drawable2DshaderARB *imageTexture=NULL;

Notice that the drawable2DshaderARB constructor contains a small shader program pointed to by the variable code. Defining the preprocessor variable COLORFUL selects one of two possible demonstration shaders:

  1. If COLORFUL is not defined, a simple shader is utilized that simply moves the data from the input to the output. In this case, the same black and white Mandelbrot image created by mandelbrotPBO.cu will be displayed.
  2. If COLORFUL is defined, a programmable shader will be used that changes the color of each pixel based on the original color values and position within the 2D texture.

The following character string defines the pass-through shader for drawable2DshaderARB:


const char * const code =
      "!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";

The string is actually the shader assembly instructions generated by the Cg compiler, cgc after compiling the following program in simple1.cg:


struct Output {
  float4 color : COLOR;
};

Output texture(float2 texCoord : TEXCOORD0, uniform sampler2D decal)
{

  Output OUT;

  OUT.color = tex2D(decal, texCoord);

  return OUT;

}

The following command-line compiled the Cg program under Linux:


cgc -profile arbfp1 -entry texture simple1.cg > try1.arbfp1

The cgc compiler and Cg information can be downloaded from the NVIDIA Cg toolkit site. Ubuntu users can get an slightly older version of the Cg software by typing:


apt-get install nvidia-cg-toolkit

The shader program used when the preprocessor variable COLORFUL is defined was generated by compiling the following source code, colorful.cg:


struct Output {
  float4 color : COLOR;
};

Output texture(float2 texCoord : TEXCOORD0, uniform sampler2D tex)
{
  Output OUT;

  OUT.color = float4(
	0.25*abs(cos(2*texCoord.y+texCoord.x)),
	(tex2D(tex,texCoord).x > 0.1)?
    		abs(sin(texCoord.x)):tex2D(tex,texCoord).x, 
	(tex2D(tex,texCoord).y > 0.1)?
    		abs(sin(texCoord.y)):tex2D(tex,texCoord).y, 
	tex2D(tex, texCoord).z);

  return OUT;

}

The following command-line generated the string used in the drawable2DshaderARB constructor:


cgc -profile arbfp1 -entry texture colorful.cg > colorful.arbfp1

The complete buildable source for mandelbrotShader.cu including the drawable2DshaderARB class and shader programs is as follows:


#include <GL/glew.h>
#include <GL/glut.h>
#define USE_CUDA3

#include <cuda.h>
#include <cuda_gl_interop.h>

__global__ void mandelbrot( uchar4 *d_ptr, ulong2 size, 
			    float2 rrange, float2 irange, unsigned long n )
{
  unsigned long x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned long y = blockIdx.y * blockDim.y + threadIdx.y;
  if ( x >= size.x || y >= size.y ) return;
  
  float2 C = make_float2( ( rrange.y - rrange.x ) / size.x * x + rrange.x,
			  ( irange.y - irange.x ) / size.y * y + irange.x );
  
  uchar4 buf = make_uchar4( 0, 0, 0, 0 );
  if ( C.x * C.x + C.y * C.y <= 4.0f ) {
    float2 z = make_float2( 0.0f, 0.0f );
    unsigned long cnt = 0;
    while ( cnt < n ) {
      ++cnt;
      z = make_float2( z.x * z.x - z.y * z.y + C.x, 2.0f * z.x * z.y + C.y );
      if ( z.x * z.x + z.y * z.y > 4.0f ) {
	unsigned char c = ( 255 - 15 ) * __powf( ( float )cnt / n, 0.8f )+15;
	buf = make_uchar4( c, c, c, 0 );
	break;
      }
    }
  }
  
  d_ptr[ x + y * size.x ] = buf;
}

// A functor is basically a function that maintains state
struct gpuFunctor {

protected:
  float2 rrange;
  float2 irange;

public:
gpuFunctor(float2 rrange, float2 irange) : rrange(rrange), irange(irange) {}
gpuFunctor() : rrange(make_float2(-2.f, 2.f)), irange(make_float2(-2.f, 2.f)) {}
  
  void callKernel(uchar4 *d_ptr, ulong2 gridSize, unsigned long convergence)
  {
    dim3 dimBlock( 16, 4 );
    dim3 dimGrid( ( gridSize.x + dimBlock.x - 1 ) / dimBlock.x,
		  ( gridSize.y + dimBlock.y - 1 ) / dimBlock.y );
    mandelbrot<<< dimGrid, dimBlock >>>( d_ptr, gridSize, rrange,
					 irange, convergence );
  }
  inline float2 getRrange() {return(rrange);}
  inline float2 getIrange() {return(irange);}
};

class drawable2DTexture {
 protected:
  GLuint tex,pbo;
  ulong2 size;
  gpuFunctor gpuFunc;

#ifdef USE_CUDA3
  struct cudaGraphicsResource *pboCUDA;
#endif

  void createPBO()
  {
    // set up vertex data parameter
    int num_texels = size.x * size.y;
    int num_values = num_texels * 4;
    int size_tex_data = sizeof(GLubyte) * num_values;
    
    // Generate a buffer ID called a PBO (Pixel Buffer Object)
    glGenBuffers(1,&pbo);
    // Make this the current UNPACK buffer (OpenGL is state-based)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
    // Allocate data for the buffer. 4-channel 8-bit image
    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);

#ifdef USE_CUDA3
    cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
    cudaGLRegisterBufferObject( pbo );
#endif
  }
  
  void deletePBO()
  {
    if(pbo) {
      // delete the PBO
#ifdef USE_CUDA3
      cudaGraphicsUnregisterResource( pboCUDA );
#else
      cudaGLUnregisterBufferObject( pbo );
#endif
      glDeleteBuffers( 1, &pbo );
      pbo=NULL;
      pboCUDA = NULL;
    }
  }
  
  void createTexture()
  {
    if(tex) deleteTexture();

    unsigned int image_height=size.y;
    unsigned int image_width=size.x;

    // Enable Texturing
    glEnable(GL_TEXTURE_2D);
    
    // Generate a texture identifier
    glGenTextures(1,&tex);
    
    // Make this the current texture (remember that GL is state-based)
    glBindTexture( GL_TEXTURE_2D, tex);
    
    // Allocate the texture memory. The last parameter is NULL since we only
    // want to allocate memory, not initialize it
    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA,
		  GL_UNSIGNED_BYTE, NULL);
    
    // Must set the filter mode, GL_LINEAR enables interpolation when scaling
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
    // Note: GL_TEXTURE_RECTANGLE_ARB may be used instead of
    // GL_TEXTURE_2D for improved performance if linear interpolation is
    // not desired. Replace GL_LINEAR with GL_NEAREST in the
    // glTexParameteri() call
  }

  void deleteTexture()
  {
    if(tex) {
      glDeleteTextures(1, &tex);
      tex = NULL;
    }
  }

  void cleanup (void) { deletePBO(); deleteTexture(); }

 public:

  ~drawable2DTexture() { cleanup(); }

  drawable2DTexture(ulong2 winSize, gpuFunctor userGPUfunc)
    {
      size = winSize;
      gpuFunc = userGPUfunc;
#ifdef USE_CUDA3
      pboCUDA = NULL;
#endif

      createTexture();
      createPBO();
    }
  void draw( unsigned long convergence )
  {
    uchar4 *d_ptr = NULL;
    
#ifdef USE_CUDA3
    size_t start;
    cudaGraphicsMapResources( 1, &pboCUDA, NULL );
    cudaGraphicsResourceGetMappedPointer( ( void ** )&d_ptr, &start, pboCUDA );
#else
    cudaGLMapBufferObject( ( void ** )&d_ptr, pbo );
#endif
    
    gpuFunc.callKernel(d_ptr, size, convergence);
    cudaThreadSynchronize();
    
#ifdef USE_CUDA3
    cudaGraphicsUnmapResources( 1, &pboCUDA, NULL );
#else
    cudaGLUnmapBufferObject( pbo );
#endif

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);

    glBindTexture( GL_TEXTURE_2D, tex );
    glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, size.x, size.y,
		     GL_RGBA, GL_UNSIGNED_BYTE, NULL );
    
  }

  void display( void )
  {
    glDisable( GL_DEPTH_TEST );
    
    glBegin( GL_QUADS );
    glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
    glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
    glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
    glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
    glEnd();
  }

  void reshape(ulong2 size)
  {
    float2 irange = gpuFunc.getIrange();
    float2 rrange = gpuFunc.getRrange();

    float icenter = ( irange.x + irange.y ) * 0.5f;
    float iwidth = ( rrange.y - rrange.x ) / size.x * size.y * 0.5f;
    irange = make_float2( icenter - iwidth, icenter + iwidth );
    gpuFunc = gpuFunctor(rrange, irange);
  }
};

class drawable2DshaderARB: public drawable2DTexture {
 private:
  uchar4 *h_Src;
  GLuint shader;

 public:
 ~drawable2DshaderARB() {
    glDeleteProgramsARB( 1, &shader );
  }
 drawable2DshaderARB(ulong2 winSize, gpuFunctor userGPUfunc) :
  drawable2DTexture(winSize, userGPUfunc) {
    
    deleteTexture(); // get rid of inherited texture
    deletePBO(); // get rid of inherited PBO

    h_Src = new uchar4[size.x*size.y];
    
    glEnable( GL_TEXTURE_2D );
    glGenTextures( 1, &tex );
    glBindTexture( GL_TEXTURE_2D, tex );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, size.x, size.y, 0, 
                  GL_RGBA, GL_UNSIGNED_BYTE, h_Src );
    
    
    // create and bind buffer
    glGenBuffers( 1, &pbo );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, pbo );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, 
                  sizeof( uchar4 ) * size.x * size.y, h_Src, GL_STREAM_COPY );
#ifdef USE_CUDA3
    cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
    cudaGLRegisterBufferObject( pbo );
#endif
    
    // create shader
#ifdef COLORFUL
    const char * const code = 
      "!!ARBfp1.0\n"
      "PARAM c[1] = { { 0.95, 2, 0.1 } };\n"
      "TEMP R0;\n"
      "TEMP R1;\n"
      "TEX R0.xyz, fragment.texcoord[0], texture[0], 2D;\n"
      "SIN R0.w, fragment.texcoord[0].x;\n"
      "ADD R1.x, R0, -c[0].z;\n"
      "ABS R0.w, R0;\n"
      "CMP result.color.y, -R1.x, R0.w, R0.x;\n"
      "SIN R0.w, fragment.texcoord[0].y;\n"
      "MAD R0.x, fragment.texcoord[0].y, c[0].y, fragment.texcoord[0];\n"
      "COS R0.x, R0.x;\n"
      "ABS R0.x, R0;\n"
      "ADD R1.x, R0.y, -c[0].z;\n"
      "ABS R0.w, R0;\n"
      "CMP result.color.z, -R1.x, R0.w, R0.y;\n"
      "MOV result.color.w, R0.z;\n"
      "MUL result.color.x, R0, c[0];\n"
      "END" 
      ;
#else
    const char * const code =
      "!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";
#endif
    
    glGenProgramsARB( 1, &shader );
    glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
    glProgramStringARB( GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
                        static_cast< GLsizei >( strlen( code ) ),
                        ( GLubyte * )code );
    
    GLint err;
    glGetIntegerv( GL_PROGRAM_ERROR_POSITION_ARB, &err );
    if ( err != -1 ) shader = NULL;
  }
  void display( void ) {
    glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
    glEnable( GL_FRAGMENT_PROGRAM_ARB );
    glDisable( GL_DEPTH_TEST );
    
    glBegin( GL_QUADS );
    glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
    glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
    glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
    glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
    glEnd();
    
    glBindTexture( GL_TEXTURE_2D, 0 );
    glDisable( GL_FRAGMENT_PROGRAM_ARB );
  }
};


drawable2DshaderARB *imageTexture=NULL;

void reshapeCallback( int w, int h )
{
  glViewport( 0, 0, w, h );
  glMatrixMode( GL_MODELVIEW );
  glLoadIdentity();
  
  glMatrixMode( GL_PROJECTION );
  glLoadIdentity();
  glOrtho( 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f );
  
  imageTexture->reshape( make_ulong2( h, w ) );
}

void displayCallback( void )
{
  imageTexture->draw( 1024 );
  imageTexture->display();

  glutSwapBuffers();
}

void cleanupCallback( void) 
{
}

int main(int argc, char *argv[] )
{
  int height=1024, width = 768;
  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_RGBA | GLUT_DOUBLE );
  glutInitWindowSize( height, width);
  glutCreateWindow( *argv );
  
  glewInit();
  
  glutDisplayFunc( displayCallback );
  glutReshapeFunc( reshapeCallback );
  
  cudaGLSetGLDevice( 0 );

  // create the pixel image object
  imageTexture = new drawable2DshaderARB(make_ulong2(height,width), gpuFunctor());

  atexit( cleanupCallback );
  
  glutMainLoop();
  
  cleanupCallback();
  cudaThreadExit();
  return 0;
}

The black-and-white executable mandelbrotShaderBW (that uses the pass-through shader simple1.cg) was built with the following under Linux:


SDK_PATH=/$HOME/NVIDIA_GPU_Computing_SDK/C
INC=$SDK_PATH/common/inc
LIB=$SDK_PATH/lib
nvcc -DCOLORFUL -O3 -I$INC -L$LIB mandelbrotShader.cu -lcutil_x86_64 -lglut -lGLEW -o mandelbrotShaderColor

Adding the following preprocessor specification -DCOLORFUL to the command line creates the mandelbrotShaderColor executable:


SDK_PATH=$HOME/NVIDIA_GPU_Computing_SDK/C
INC=$SDK_PATH/common/inc
LIB=$SDK_PATH/lib
nvcc -DCOLORFUL -O3 -I$INC -L$LIB mandelbrotShader.cu -lcutil_x86_64 -lglut -lGLEW -o mandelbrotShaderColor

Running mandelbrotShaderColor produces Figure 1 on the screen:

[Click image to view at full size]
Figure 1

Summary

CUDA 3.0 is a major revision number release that delivers important benefits for debugging; C++; OpenCL; CUDA driver and runtime developers; CUBLAS and CUFFT users, and many other areas that make this a "must install" update. The expanded capabilities discussed in this brief two-part article series should provide outstanding food-for-thought how the 3.0 release creates new opportunities for code development and integration of existing software projects.

In addition, this article provided examples that touched on a number of exciting capabilities that will be discussed in greater depth in future articles.


Rob Farber is a senior scientist at Pacific Northwest National Laboratory. He has worked in massively parallel computing at several national laboratories and as co-founder of several startups. He can be reached at rmfarber@gmail.com.


Related Reading


More Insights






Currently we allow the following HTML tags in comments:

Single tags

These tags can be used alone and don't need an ending tag.

<br> Defines a single line break

<hr> Defines a horizontal line

Matching tags

These require an ending tag - e.g. <i>italic text</i>

<a> Defines an anchor

<b> Defines bold text

<big> Defines big text

<blockquote> Defines a long quotation

<caption> Defines a table caption

<cite> Defines a citation

<code> Defines computer code text

<em> Defines emphasized text

<fieldset> Defines a border around elements in a form

<h1> This is heading 1

<h2> This is heading 2

<h3> This is heading 3

<h4> This is heading 4

<h5> This is heading 5

<h6> This is heading 6

<i> Defines italic text

<p> Defines a paragraph

<pre> Defines preformatted text

<q> Defines a short quotation

<samp> Defines sample computer code text

<small> Defines small text

<span> Defines a section in a document

<s> Defines strikethrough text

<strike> Defines strikethrough text

<strong> Defines strong text

<sub> Defines subscripted text

<sup> Defines superscripted text

<u> Defines underlined text

Dr. Dobb's encourages readers to engage in spirited, healthy debate, including taking us to task. However, Dr. Dobb's moderates all comments posted to our site, and reserves the right to modify or remove any content that it determines to be derogatory, offensive, inflammatory, vulgar, irrelevant/off-topic, racist or obvious marketing or spam. Dr. Dobb's further reserves the right to disable the profile of any commenter participating in said activities.

 
Disqus Tips To upload an avatar photo, first complete your Disqus profile. | View the list of supported HTML tags you can use to style comments. | Please read our commenting policy.
 

Video