The following code snippet specifies a new class drawable2DshaderARB that is derived from the original drawable2DTexture class defined in mandelbrotPBO.cu. In other words, drawable2DshaderARB inherits the characteristics of parent class and is free to add new variables and methods as well as redefine existing methods so they can provide capabilities not previously available in the parent class.
class drawable2DshaderARB : public drawable2DTexture {
private:
uchar4 *h_Src;
GLuint shader;
public:
~drawable2DshaderARB() {
glDeleteProgramsARB( 1, &shader );
}
drawable2DshaderARB(ulong2 winSize, gpuFunctor userGPUfunc) :
drawable2DTexture(winSize, userGPUfunc) {
deleteTexture(); // get rid of inherited texture
deletePBO(); // get rid of inherited PBO
h_Src = new uchar4[size.x*size.y];
glEnable( GL_TEXTURE_2D );
glGenTextures( 1, &tex );
glBindTexture( GL_TEXTURE_2D, tex );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, size.x, size.y, 0,
GL_RGBA, GL_UNSIGNED_BYTE, h_Src );
// create and bind buffer
glGenBuffers( 1, &pbo );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, pbo );
glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB,
sizeof( uchar4 ) * size.x * size.y, h_Src, GL_STREAM_COPY );
#ifdef USE_CUDA3
cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
cudaGLRegisterBufferObject( pbo );
#endif
// create shader
#ifdef COLORFUL
const char * const code =
"!!ARBfp1.0\n"
"PARAM c[1] = { { 0.95, 2, 0.1 } };\n"
"TEMP R0;\n"
"TEMP R1;\n"
"TEX R0.xyz, fragment.texcoord[0], texture[0], 2D;\n"
"SIN R0.w, fragment.texcoord[0].x;\n"
"ADD R1.x, R0, -c[0].z;\n"
"ABS R0.w, R0;\n"
"CMP result.color.y, -R1.x, R0.w, R0.x;\n"
"SIN R0.w, fragment.texcoord[0].y;\n"
"MAD R0.x, fragment.texcoord[0].y, c[0].y, fragment.texcoord[0];\n"
"COS R0.x, R0.x;\n"
"ABS R0.x, R0;\n"
"ADD R1.x, R0.y, -c[0].z;\n"
"ABS R0.w, R0;\n"
"CMP result.color.z, -R1.x, R0.w, R0.y;\n"
"MOV result.color.w, R0.z;\n"
"MUL result.color.x, R0, c[0];\n"
"END"
;
#else
const char * const code =
"!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";
#endif
glGenProgramsARB( 1, &shader );
glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
glProgramStringARB( GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
static_cast< GLsizei >( strlen( code ) ),
( GLubyte * )code );
GLint err;
glGetIntegerv( GL_PROGRAM_ERROR_POSITION_ARB, &err );
if ( err != -1 ) shader = NULL;
}
void display( void ) {
glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
glEnable( GL_FRAGMENT_PROGRAM_ARB );
glDisable( GL_DEPTH_TEST );
glBegin( GL_QUADS );
glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
glEnd();
glBindTexture( GL_TEXTURE_2D, 0 );
glDisable( GL_FRAGMENT_PROGRAM_ARB );
}
};
For simplicity, the variable imageTexture is defined to be of type drawable2DshaderARB as shown in the code snippet below:
drawable2DshaderARB *imageTexture=NULL;
Notice that the drawable2DshaderARB constructor contains a small shader program pointed to by the variable code. Defining the preprocessor variable COLORFUL selects one of two possible demonstration shaders:
- If COLORFUL is not defined, a simple shader is utilized that simply moves the data from the input to the output. In this case, the same black and white Mandelbrot image created by mandelbrotPBO.cu will be displayed.
- If COLORFUL is defined, a programmable shader will be used that changes the color of each pixel based on the original color values and position within the 2D texture.
The following character string defines the pass-through shader for drawable2DshaderARB:
const char * const code =
"!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";
The string is actually the shader assembly instructions generated by the Cg compiler, cgc after compiling the following program in simple1.cg:
struct Output {
float4 color : COLOR;
};
Output texture(float2 texCoord : TEXCOORD0, uniform sampler2D decal)
{
Output OUT;
OUT.color = tex2D(decal, texCoord);
return OUT;
}
The following command-line compiled the Cg program under Linux:
cgc -profile arbfp1 -entry texture simple1.cg > try1.arbfp1
The cgc compiler and Cg information can be downloaded from the NVIDIA Cg toolkit site. Ubuntu users can get an slightly older version of the Cg software by typing:
apt-get install nvidia-cg-toolkit
The shader program used when the preprocessor variable COLORFUL is defined was generated by compiling the following source code, colorful.cg:
struct Output {
float4 color : COLOR;
};
Output texture(float2 texCoord : TEXCOORD0, uniform sampler2D tex)
{
Output OUT;
OUT.color = float4(
0.25*abs(cos(2*texCoord.y+texCoord.x)),
(tex2D(tex,texCoord).x > 0.1)?
abs(sin(texCoord.x)):tex2D(tex,texCoord).x,
(tex2D(tex,texCoord).y > 0.1)?
abs(sin(texCoord.y)):tex2D(tex,texCoord).y,
tex2D(tex, texCoord).z);
return OUT;
}
The following command-line generated the string used in the drawable2DshaderARB constructor:
cgc -profile arbfp1 -entry texture colorful.cg > colorful.arbfp1
The complete buildable source for mandelbrotShader.cu including the drawable2DshaderARB class and shader programs is as follows:
#include <GL/glew.h>
#include <GL/glut.h>
#define USE_CUDA3
#include <cuda.h>
#include <cuda_gl_interop.h>
__global__ void mandelbrot( uchar4 *d_ptr, ulong2 size,
float2 rrange, float2 irange, unsigned long n )
{
unsigned long x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned long y = blockIdx.y * blockDim.y + threadIdx.y;
if ( x >= size.x || y >= size.y ) return;
float2 C = make_float2( ( rrange.y - rrange.x ) / size.x * x + rrange.x,
( irange.y - irange.x ) / size.y * y + irange.x );
uchar4 buf = make_uchar4( 0, 0, 0, 0 );
if ( C.x * C.x + C.y * C.y <= 4.0f ) {
float2 z = make_float2( 0.0f, 0.0f );
unsigned long cnt = 0;
while ( cnt < n ) {
++cnt;
z = make_float2( z.x * z.x - z.y * z.y + C.x, 2.0f * z.x * z.y + C.y );
if ( z.x * z.x + z.y * z.y > 4.0f ) {
unsigned char c = ( 255 - 15 ) * __powf( ( float )cnt / n, 0.8f )+15;
buf = make_uchar4( c, c, c, 0 );
break;
}
}
}
d_ptr[ x + y * size.x ] = buf;
}
// A functor is basically a function that maintains state
struct gpuFunctor {
protected:
float2 rrange;
float2 irange;
public:
gpuFunctor(float2 rrange, float2 irange) : rrange(rrange), irange(irange) {}
gpuFunctor() : rrange(make_float2(-2.f, 2.f)), irange(make_float2(-2.f, 2.f)) {}
void callKernel(uchar4 *d_ptr, ulong2 gridSize, unsigned long convergence)
{
dim3 dimBlock( 16, 4 );
dim3 dimGrid( ( gridSize.x + dimBlock.x - 1 ) / dimBlock.x,
( gridSize.y + dimBlock.y - 1 ) / dimBlock.y );
mandelbrot<<< dimGrid, dimBlock >>>( d_ptr, gridSize, rrange,
irange, convergence );
}
inline float2 getRrange() {return(rrange);}
inline float2 getIrange() {return(irange);}
};
class drawable2DTexture {
protected:
GLuint tex,pbo;
ulong2 size;
gpuFunctor gpuFunc;
#ifdef USE_CUDA3
struct cudaGraphicsResource *pboCUDA;
#endif
void createPBO()
{
// set up vertex data parameter
int num_texels = size.x * size.y;
int num_values = num_texels * 4;
int size_tex_data = sizeof(GLubyte) * num_values;
// Generate a buffer ID called a PBO (Pixel Buffer Object)
glGenBuffers(1,&pbo);
// Make this the current UNPACK buffer (OpenGL is state-based)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
// Allocate data for the buffer. 4-channel 8-bit image
glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
#ifdef USE_CUDA3
cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
cudaGLRegisterBufferObject( pbo );
#endif
}
void deletePBO()
{
if(pbo) {
// delete the PBO
#ifdef USE_CUDA3
cudaGraphicsUnregisterResource( pboCUDA );
#else
cudaGLUnregisterBufferObject( pbo );
#endif
glDeleteBuffers( 1, &pbo );
pbo=NULL;
pboCUDA = NULL;
}
}
void createTexture()
{
if(tex) deleteTexture();
unsigned int image_height=size.y;
unsigned int image_width=size.x;
// Enable Texturing
glEnable(GL_TEXTURE_2D);
// Generate a texture identifier
glGenTextures(1,&tex);
// Make this the current texture (remember that GL is state-based)
glBindTexture( GL_TEXTURE_2D, tex);
// Allocate the texture memory. The last parameter is NULL since we only
// want to allocate memory, not initialize it
glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA,
GL_UNSIGNED_BYTE, NULL);
// Must set the filter mode, GL_LINEAR enables interpolation when scaling
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
// Note: GL_TEXTURE_RECTANGLE_ARB may be used instead of
// GL_TEXTURE_2D for improved performance if linear interpolation is
// not desired. Replace GL_LINEAR with GL_NEAREST in the
// glTexParameteri() call
}
void deleteTexture()
{
if(tex) {
glDeleteTextures(1, &tex);
tex = NULL;
}
}
void cleanup (void) { deletePBO(); deleteTexture(); }
public:
~drawable2DTexture() { cleanup(); }
drawable2DTexture(ulong2 winSize, gpuFunctor userGPUfunc)
{
size = winSize;
gpuFunc = userGPUfunc;
#ifdef USE_CUDA3
pboCUDA = NULL;
#endif
createTexture();
createPBO();
}
void draw( unsigned long convergence )
{
uchar4 *d_ptr = NULL;
#ifdef USE_CUDA3
size_t start;
cudaGraphicsMapResources( 1, &pboCUDA, NULL );
cudaGraphicsResourceGetMappedPointer( ( void ** )&d_ptr, &start, pboCUDA );
#else
cudaGLMapBufferObject( ( void ** )&d_ptr, pbo );
#endif
gpuFunc.callKernel(d_ptr, size, convergence);
cudaThreadSynchronize();
#ifdef USE_CUDA3
cudaGraphicsUnmapResources( 1, &pboCUDA, NULL );
#else
cudaGLUnmapBufferObject( pbo );
#endif
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
glBindTexture( GL_TEXTURE_2D, tex );
glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, size.x, size.y,
GL_RGBA, GL_UNSIGNED_BYTE, NULL );
}
void display( void )
{
glDisable( GL_DEPTH_TEST );
glBegin( GL_QUADS );
glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
glEnd();
}
void reshape(ulong2 size)
{
float2 irange = gpuFunc.getIrange();
float2 rrange = gpuFunc.getRrange();
float icenter = ( irange.x + irange.y ) * 0.5f;
float iwidth = ( rrange.y - rrange.x ) / size.x * size.y * 0.5f;
irange = make_float2( icenter - iwidth, icenter + iwidth );
gpuFunc = gpuFunctor(rrange, irange);
}
};
class drawable2DshaderARB: public drawable2DTexture {
private:
uchar4 *h_Src;
GLuint shader;
public:
~drawable2DshaderARB() {
glDeleteProgramsARB( 1, &shader );
}
drawable2DshaderARB(ulong2 winSize, gpuFunctor userGPUfunc) :
drawable2DTexture(winSize, userGPUfunc) {
deleteTexture(); // get rid of inherited texture
deletePBO(); // get rid of inherited PBO
h_Src = new uchar4[size.x*size.y];
glEnable( GL_TEXTURE_2D );
glGenTextures( 1, &tex );
glBindTexture( GL_TEXTURE_2D, tex );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, size.x, size.y, 0,
GL_RGBA, GL_UNSIGNED_BYTE, h_Src );
// create and bind buffer
glGenBuffers( 1, &pbo );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, pbo );
glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB,
sizeof( uchar4 ) * size.x * size.y, h_Src, GL_STREAM_COPY );
#ifdef USE_CUDA3
cudaGraphicsGLRegisterBuffer( &pboCUDA, pbo, cudaGraphicsMapFlagsNone );
#else
cudaGLRegisterBufferObject( pbo );
#endif
// create shader
#ifdef COLORFUL
const char * const code =
"!!ARBfp1.0\n"
"PARAM c[1] = { { 0.95, 2, 0.1 } };\n"
"TEMP R0;\n"
"TEMP R1;\n"
"TEX R0.xyz, fragment.texcoord[0], texture[0], 2D;\n"
"SIN R0.w, fragment.texcoord[0].x;\n"
"ADD R1.x, R0, -c[0].z;\n"
"ABS R0.w, R0;\n"
"CMP result.color.y, -R1.x, R0.w, R0.x;\n"
"SIN R0.w, fragment.texcoord[0].y;\n"
"MAD R0.x, fragment.texcoord[0].y, c[0].y, fragment.texcoord[0];\n"
"COS R0.x, R0.x;\n"
"ABS R0.x, R0;\n"
"ADD R1.x, R0.y, -c[0].z;\n"
"ABS R0.w, R0;\n"
"CMP result.color.z, -R1.x, R0.w, R0.y;\n"
"MOV result.color.w, R0.z;\n"
"MUL result.color.x, R0, c[0];\n"
"END"
;
#else
const char * const code =
"!!ARBfp1.0\nTEX result.color, fragment.texcoord, texture[ 0 ], 2D;\nEND";
#endif
glGenProgramsARB( 1, &shader );
glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
glProgramStringARB( GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
static_cast< GLsizei >( strlen( code ) ),
( GLubyte * )code );
GLint err;
glGetIntegerv( GL_PROGRAM_ERROR_POSITION_ARB, &err );
if ( err != -1 ) shader = NULL;
}
void display( void ) {
glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
glEnable( GL_FRAGMENT_PROGRAM_ARB );
glDisable( GL_DEPTH_TEST );
glBegin( GL_QUADS );
glTexCoord2f( 0.0f, 0.0f ); glVertex2f( 0.0f, 0.0f );
glTexCoord2f( 1.0f, 0.0f ); glVertex2f( 1.0f, 0.0f );
glTexCoord2f( 1.0f, 1.0f ); glVertex2f( 1.0f, 1.0f );
glTexCoord2f( 0.0f, 1.0f ); glVertex2f( 0.0f, 1.0f );
glEnd();
glBindTexture( GL_TEXTURE_2D, 0 );
glDisable( GL_FRAGMENT_PROGRAM_ARB );
}
};
drawable2DshaderARB *imageTexture=NULL;
void reshapeCallback( int w, int h )
{
glViewport( 0, 0, w, h );
glMatrixMode( GL_MODELVIEW );
glLoadIdentity();
glMatrixMode( GL_PROJECTION );
glLoadIdentity();
glOrtho( 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f );
imageTexture->reshape( make_ulong2( h, w ) );
}
void displayCallback( void )
{
imageTexture->draw( 1024 );
imageTexture->display();
glutSwapBuffers();
}
void cleanupCallback( void)
{
}
int main(int argc, char *argv[] )
{
int height=1024, width = 768;
glutInit( &argc, argv );
glutInitDisplayMode( GLUT_RGBA | GLUT_DOUBLE );
glutInitWindowSize( height, width);
glutCreateWindow( *argv );
glewInit();
glutDisplayFunc( displayCallback );
glutReshapeFunc( reshapeCallback );
cudaGLSetGLDevice( 0 );
// create the pixel image object
imageTexture = new drawable2DshaderARB(make_ulong2(height,width), gpuFunctor());
atexit( cleanupCallback );
glutMainLoop();
cleanupCallback();
cudaThreadExit();
return 0;
}
The black-and-white executable mandelbrotShaderBW (that uses the pass-through shader simple1.cg) was built with the following under Linux:
SDK_PATH=/$HOME/NVIDIA_GPU_Computing_SDK/C INC=$SDK_PATH/common/inc LIB=$SDK_PATH/lib nvcc -DCOLORFUL -O3 -I$INC -L$LIB mandelbrotShader.cu -lcutil_x86_64 -lglut -lGLEW -o mandelbrotShaderColor
Adding the following preprocessor specification -DCOLORFUL to the command line creates the mandelbrotShaderColor executable:
SDK_PATH=$HOME/NVIDIA_GPU_Computing_SDK/C INC=$SDK_PATH/common/inc LIB=$SDK_PATH/lib nvcc -DCOLORFUL -O3 -I$INC -L$LIB mandelbrotShader.cu -lcutil_x86_64 -lglut -lGLEW -o mandelbrotShaderColor
Running mandelbrotShaderColor produces Figure 1 on the screen:
Summary
CUDA 3.0 is a major revision number release that delivers important benefits for debugging; C++; OpenCL; CUDA driver and runtime developers; CUBLAS and CUFFT users, and many other areas that make this a "must install" update. The expanded capabilities discussed in this brief two-part article series should provide outstanding food-for-thought how the 3.0 release creates new opportunities for code development and integration of existing software projects.
In addition, this article provided examples that touched on a number of exciting capabilities that will be discussed in greater depth in future articles.
- CUDA, Supercomputing for the Masses: Part 16
- CUDA, Supercomputing for the Masses: Part 15
- CUDA, Supercomputing for the Masses: Part 14
- CUDA, Supercomputing for the Masses: Part 13
- CUDA, Supercomputing for the Masses: Part 12
- CUDA, Supercomputing for the Masses: Part 11
- CUDA, Supercomputing for the Masses: Part 10
- CUDA, Supercomputing for the Masses: Part 9
- CUDA, Supercomputing for the Masses: Part 8
- CUDA, Supercomputing for the Masses: Part 7
- CUDA, Supercomputing for the Masses: Part 6
- CUDA, Supercomputing for the Masses: Part 5
- CUDA, Supercomputing for the Masses: Part 4
- CUDA, Supercomputing for the Masses: Part 3
- CUDA, Supercomputing for the Masses: Part 2
- CUDA, Supercomputing for the Masses: Part 1
Rob Farber is a senior scientist at Pacific Northwest National Laboratory. He has worked in massively parallel computing at several national laboratories and as co-founder of several startups. He can be reached at rmfarber@gmail.com.


