From Window Creation to CUDA Buffer Registration
Before mapping the OpenGL buffer to CUDA, the following steps must be taken:
- Create a window (OS specific)
- Create a GL context (also OS specific)
- Set up the GL viewport and coordinate system
- Generate one or more GL buffers to be shared with CUDA
- Register these buffers with CUDA
Figure 2 illustrates these steps.
Let's begin walking quickly through simpleGLmain.cpp to see how this happens. Prior to the definition of
As you can see in the
Without going into too much detail, we see the GLUT initialization of the window and registration of the callbacks:
It is also important to check that the OpenGL version supports the capabilities and extensions needed to run the application. The cross-platform GLEW (OpenGL Extension Wrangler) library is used to load this functionality and expose it to the user. The check occurs through the call to
Now we set up our viewport, which defines which portion of the window is to be used; set the clear color; and disable depth testing:
Finally, we specify a simple orthogonal view and define the OpenGL coordinates.
For more detailed information about the OpenGL coordinates, viewing, and transforms used thus far, I recommend one of the many excellent resources available on the web, including the tutorial by Song Ho Ahn.
Steps 4 and 5 occur via the call to
The routine
As noted in the source code,
The routine
As can be seen in
Completing our discussion of
This completes our discussion of simpleGLmain.cpp. For more information, please refer to the GLUT and GLEW documentation as well as one of the many excellent OpenGL references on the Internet.
Here is the complete source code for simplePBO.cpp.
Aside from the previously discussed routine
initGL() is specified to simplify the discussion in this article.
fpsDisplay() and computeFPS() are defined to use a timer and display the calculated frames per second in the window title so we can get a sense of how fast our code is running. (Note: Windows users may need to disable v-sync to see full performance, because buffer swapping normally occurs at the vertical refresh interval (commonly 60 Hz) to avoid tearing. V-sync can be turned off in the control panel when benchmarking.)
// simpleGLmain.cpp (Rob Farber)
// includes
#include <GL/glew.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cutil_gl_error.h>
#include <cuda_gl_interop.h>
#include <rendercheck_gl.h>
// The user must create the following routines:
// CUDA methods
// initCuda() is called once from main() after initGL() has succeeded.
extern void initCuda(int argc, char** argv);
extern void runCuda();
// NOTE(review): renderCuda() is declared here but is not referenced anywhere
// in the visible code -- confirm whether it is still needed.
extern void renderCuda(int);
// callbacks
// display() is invoked through the fpsDisplay() wrapper; keyboard(), mouse()
// and motion() are handed directly to GLUT as callbacks.
extern void display();
extern void keyboard(unsigned char key, int x, int y);
extern void mouse(int button, int state, int x, int y);
extern void motion(int x, int y);
// GLUT specific variables
// window_width/window_height are used for both the initial GLUT window size
// and the GL viewport in initGL().
unsigned int window_width = 512;
unsigned int window_height = 512;
unsigned int timer = 0; // a timer for FPS calculations
// Forward declarations of GL functionality
// Returns CUTTrue on success and CUTFalse on failure.
CUTBoolean initGL(int argc, char** argv);
// Simple method to display the Frames Per Second in the window title
// Counts rendered frames and, once every 100 calls, writes the frame rate
// derived from the CUTIL timer's average value into the GLUT window title,
// then resets both the frame counter and the timer.
void computeFPS()
{
    static int framesPerUpdate = 100;
    static int framesSeen = 0;

    framesSeen += 1;
    if (framesSeen != framesPerUpdate) {
        return; // not yet time to refresh the title
    }

    // The average timer value is divided by 1000 before being inverted
    // (presumably converting milliseconds per frame to seconds per frame).
    const float avgTimerValue = cutGetAverageTimerValue(timer);
    const float framesPerSecond = 1.f / (avgTimerValue / 1000.f);

    char title[256];
    sprintf(title, "Cuda GL Interop Wrapper: %3.1f fps ", framesPerSecond);
    glutSetWindowTitle(title);

    // Begin a fresh measurement window.
    framesSeen = 0;
    cutilCheckError(cutResetTimer(timer));
}
// GLUT display callback wrapper: brackets one call to the user-supplied
// display() routine with the CUTIL timer, then lets computeFPS() update the
// frame count and, periodically, the window title.
void fpsDisplay()
{
// start timing this frame
cutilCheckError(cutStartTimer(timer));
display();
// stop the timer before the FPS bookkeeping so that work is not timed
cutilCheckError(cutStopTimer(timer));
computeFPS();
}
// Main program
// Program entry point.
// Creates the CUTIL frame timer, sets up the window and GL state through
// initGL(), initializes CUDA/OpenGL interop through initCuda(), registers the
// GLUT callbacks and enters the GLUT main loop.
// Returns 0 on success and a non-zero exit status if initGL() fails.
int main(int argc, char** argv)
{
    // Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer));

    if (CUTFalse == initGL(argc, argv)) {
        // CUTFalse is a boolean failure flag (0 in CUTIL), not an exit status:
        // returning it would report success to the caller. Use a non-zero
        // status so scripts can detect the failed initialization.
        return 1;
    }

    initCuda(argc, argv);
    CUT_CHECK_ERROR_GL();

    // register callbacks
    // (display/keyboard/motion are also registered in initGL(); the mouse
    // callback is only registered here)
    glutDisplayFunc(fpsDisplay);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mouse);
    glutMotionFunc(motion);

    // start rendering mainloop
    glutMainLoop();

    // clean up
    // NOTE(review): classic GLUT never returns from glutMainLoop(), in which
    // case the lines below are unreachable -- confirm for the GLUT in use.
    cudaThreadExit();
    cutilExit(argc, argv);
    return 0;
}
// Creates the GLUT window and GL context, loads the OpenGL entry points via
// GLEW, verifies the required OpenGL version, and sets up the viewport and an
// orthographic projection over the unit square.
//   argc, argv - command-line arguments, forwarded to glutInit().
// Returns CUTTrue on success; CUTFalse if GLEW fails to initialize or the
// required OpenGL support is missing (a message is written to stderr).
CUTBoolean initGL(int argc, char **argv)
{
    //Steps 1-2: create a window and GL context (also register callbacks)
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
    glutInitWindowSize(window_width, window_height);
    glutCreateWindow("Cuda GL Interop Demo (adapted from NVIDIA's simpleGL");
    glutDisplayFunc(fpsDisplay);
    glutKeyboardFunc(keyboard);
    glutMotionFunc(motion);

    // Load the GL entry points. glewInit() must succeed before any
    // glewIsSupported() query or extension call is meaningful, so its result
    // is checked instead of being discarded.
    const GLenum glewStatus = glewInit();
    if (GLEW_OK != glewStatus) {
        fprintf(stderr, "ERROR: glewInit() failed: %s\n",
                (const char*)glewGetErrorString(glewStatus));
        return CUTFalse;
    }

    // check for necessary OpenGL extensions
    if (! glewIsSupported( "GL_VERSION_2_0 " ) ) {
        // trailing newline added so the message is not run into later output
        fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing.\n");
        return CUTFalse;
    }

    // Step 3: Setup our viewport and viewing modes
    glViewport(0, 0, window_width, window_height);
    glClearColor(0.0, 0.0, 0.0, 1.0);
    glDisable(GL_DEPTH_TEST);

    // set view matrix: identity model-view, orthographic projection mapping
    // x and y in [0,1] onto the viewport
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);

    return CUTTrue;
}
main() routine, a timer is first initialized and the routine initGL() is called. As mentioned previously, GLUT is utilized to create the window in a portable fashion for Windows and Linux users. (It is likely that these same callbacks and CUDA code can be adapted to work with other windowing systems as well.)
//Steps 1-2: create a window and GL context (also register callbacks)
// (excerpt from the body of initGL())
glutInit(&argc, argv);
// request an RGBA, double-buffered framebuffer
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowSize(window_width, window_height);
glutCreateWindow("Cuda GL Interop Demo (adapted from NVIDIA's simpleGL");
// fpsDisplay() wraps the user's display() with the FPS timer
glutDisplayFunc(fpsDisplay);
glutKeyboardFunc(keyboard);
glutMotionFunc(motion);
glewIsSupported().
// check for necessary OpenGL extensions
// (excerpt from the body of initGL(); the return value of glewInit() is not
// inspected here)
glewInit();
// initGL() reports failure when OpenGL 2.0 support is not available
if (! glewIsSupported( "GL_VERSION_2_0 " ) ) {
fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing.");
return CUTFalse;
}
// Step 3: Setup our viewport and viewing modes
// (excerpt from the body of initGL())
// use the whole window as the viewport
glViewport(0, 0, window_width, window_height);
// clear color is opaque black
glClearColor(0.0, 0.0, 0.0, 1.0);
glDisable(GL_DEPTH_TEST);
// set view matrix
// identity model-view matrix
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
// orthographic projection: x and y in [0,1]
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);
initCuda(), which defines the CUDA and OpenGL buffers and interoperability contexts. The CUDA/OpenGL interoperability functions are defined for the runtime API with the line:
#include <cuda_gl_interop.h>
initCuda() is defined in simplePBO.cpp, which maximizes the flexibility of the framework by isolating simpleGLmain.cpp from any special application requirements such as changing the names and numbers of the PBOs.
void initCuda(int argc, char** argv)
{
// First initialize OpenGL context, so we can properly set the GL
// for CUDA. NVIDIA notes this is necessary in order to achieve
// optimal performance with OpenGL/CUDA interop. use command-line
// specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
cutilGLDeviceInit(argc, argv);
} else {
cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
}
createPBO(&pbo);
createTexture(&textureID,image_width,image_height);
// Clean up on program exit
atexit(cleanupCuda);
runCuda();
}
initCuda() sets the CUDA and OpenGL contexts to get the best performance for the given hardware configuration.
createPBO() is called to create the OpenGL buffer(s), and a texture that can be used for rendering is created via createTexture(). We also perform some housekeeping so all our buffers and textures get cleaned up on program exit.
createPBO(), we finally complete steps 4 and 5 by generating the buffer name through a call to glGenBuffers(), binding it with glBindBuffer(), allocating its storage with glBufferData(), and registering it for use with CUDA with cudaGLRegisterBufferObject(). Please note that it is the OpenGL call to glBufferData() that performs the actual memory allocation. Since the data pointer is NULL, the memory is just allocated and not initialized.
// Creates a pixel buffer object sized for an image_width x image_height
// 4-channel 8-bit image and registers it with CUDA.
//   pbo - receives the generated buffer name; nothing is done if NULL.
// The buffer is left bound to GL_PIXEL_UNPACK_BUFFER on return.
void createPBO(GLuint* pbo)
{
    if (!pbo) {
        return;
    }

    // set up pixel data size: 4 bytes (one GLubyte per channel) per texel.
    // Computed in GLsizeiptr, the type glBufferData() expects, rather than int.
    const GLsizeiptr size_tex_data =
        (GLsizeiptr)image_width * image_height * 4 * sizeof(GLubyte);

    // Generate a buffer ID called a PBO (Pixel Buffer Object)
    glGenBuffers(1,pbo);
    // Make this the current UNPACK buffer (OpenGL is state-based)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
    // Allocate data for the buffer. 4-channel 8-bit image.
    // The NULL data pointer means the storage is allocated but not initialized.
    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);

    // Registration must succeed before the buffer can be mapped by CUDA, so
    // the result is checked rather than ignored.
    cutilSafeCall( cudaGLRegisterBufferObject( *pbo ) );
}
main(), we see that the GLUT main loop is called. Once that completes, we clean up and exit both main() and the application.
// start rendering mainloop
// (excerpt from the end of main(); GLUT dispatches the registered callbacks
// from inside this call)
glutMainLoop();
// clean up
// NOTE(review): classic GLUT does not return from glutMainLoop(), so these
// calls may be unreachable -- confirm for the GLUT implementation in use.
cudaThreadExit();
cutilExit(argc, argv);
}
// simplePBO.cpp (Rob Farber)
// includes
#include <GL/glew.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda_gl_interop.h>
#include <rendercheck_gl.h>
// external variables
// (animTime is forwarded to the kernel launcher in runCuda(); the window
// dimensions are defined in simpleGLmain.cpp)
extern float animTime;
extern unsigned int window_width;
extern unsigned int window_height;
// constants (the following should be a const in a header file)
// The image has the same dimensions as the window.
unsigned int image_width = window_width;
unsigned int image_height = window_height;
// Kernel launcher: (output pixels, image width, image height, time)
extern "C" void launch_kernel(uchar4* , unsigned int, unsigned int, float);
// variables
// GLuint object names are integers, not pointers: 0 (not NULL) is the
// "no object" value, and NULL does not convert to an integer on every compiler.
GLuint pbo=0;
GLuint textureID=0;
// Creates a pixel buffer object sized for an image_width x image_height
// 4-channel 8-bit image and registers it with CUDA.
//   pbo - receives the generated buffer name; nothing is done if NULL.
// The buffer is left bound to GL_PIXEL_UNPACK_BUFFER on return.
void createPBO(GLuint* pbo)
{
    if (!pbo) {
        return;
    }

    // set up pixel data size: 4 bytes (one GLubyte per channel) per texel.
    // Computed in GLsizeiptr, the type glBufferData() expects, rather than int.
    const GLsizeiptr size_tex_data =
        (GLsizeiptr)image_width * image_height * 4 * sizeof(GLubyte);

    // Generate a buffer ID called a PBO (Pixel Buffer Object)
    glGenBuffers(1,pbo);
    // Make this the current UNPACK buffer (OpenGL is state-based)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
    // Allocate data for the buffer. 4-channel 8-bit image.
    // The NULL data pointer means the storage is allocated but not initialized.
    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);

    // Registration must succeed before the buffer can be mapped by CUDA, so
    // the result is checked rather than ignored.
    cutilSafeCall( cudaGLRegisterBufferObject( *pbo ) );
}
// Unregisters a pixel buffer object from CUDA, deletes it, and resets the
// caller's handle to 0.
//   pbo - pointer to the buffer name; nothing is done if the pointer is NULL
//         or the name is already 0.
void deletePBO(GLuint* pbo)
{
    if (!pbo || *pbo == 0) {
        return;
    }

    // unregister this buffer object with CUDA.
    // The result is deliberately discarded (best effort): this routine is
    // reached from cleanupCuda(), which is registered with atexit(), so an
    // error check that terminates the process must not be used here.
    (void)cudaGLUnregisterBufferObject(*pbo);

    glBindBuffer(GL_ARRAY_BUFFER, *pbo);
    glDeleteBuffers(1, pbo);

    // GLuint names are integers; 0 (not NULL) marks "no buffer".
    *pbo = 0;
}
// Creates a 2D RGBA8 texture of size_x by size_y texels with linear
// filtering and leaves it bound to GL_TEXTURE_2D.
//   textureID - receives the generated texture name.
//   size_x    - texture width in texels.
//   size_y    - texture height in texels.
void createTexture(GLuint* textureID, unsigned int size_x, unsigned int size_y)
{
    // Enable Texturing
    glEnable(GL_TEXTURE_2D);

    // Generate a texture identifier
    glGenTextures(1,textureID);

    // Make this the current texture (remember that GL is state-based)
    glBindTexture( GL_TEXTURE_2D, *textureID);

    // Allocate the texture memory using the dimensions supplied by the caller
    // (previously the parameters were ignored in favor of the global image
    // size). The last parameter is NULL since we only want to allocate
    // memory, not initialize it.
    // NOTE(review): when a buffer is bound to GL_PIXEL_UNPACK_BUFFER, OpenGL
    // treats this pointer as an offset into that buffer, and createPBO()
    // leaves the PBO bound -- confirm which behavior is intended.
    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0,
                  GL_BGRA,GL_UNSIGNED_BYTE, NULL);

    // Must set the filter mode, GL_LINEAR enables interpolation when scaling
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);

    // Note: GL_TEXTURE_RECTANGLE_ARB may be used instead of
    // GL_TEXTURE_2D for improved performance if linear interpolation is
    // not desired. Replace GL_LINEAR with GL_NEAREST in the
    // glTexParameteri() call
}
// Deletes a texture and resets the caller's handle to 0.
//   tex - pointer to the texture name; nothing is done if the pointer is NULL.
void deleteTexture(GLuint* tex)
{
    if (!tex) {
        return; // nothing to delete, and *tex must not be dereferenced
    }
    glDeleteTextures(1, tex);
    // GLuint names are integers; 0 (not NULL) marks "no texture".
    *tex = 0;
}
// Exit-time housekeeping: releases the pixel buffer object and the texture
// if their handles are non-zero.
void cleanupCuda()
{
    if (pbo) {
        deletePBO(&pbo);
    }
    if (textureID) {
        deleteTexture(&textureID);
    }
}
// Run the Cuda part of the computation
void runCuda()
{
uchar4 *dptr=NULL;
// map OpenGL buffer object for writing from CUDA on a single GPU
// no data is moved (Win & Linux). When mapped to CUDA, OpenGL
// should not use this buffer
cudaGLMapBufferObject((void**)&dptr, pbo);
// execute the kernel
launch_kernel(dptr, image_width, image_height, animTime);
// unmap buffer object
cudaGLUnmapBufferObject(pbo);
}
void initCuda(int argc, char** argv)
{
// First initialize OpenGL context, so we can properly set the GL
// for CUDA. NVIDIA notes this is necessary in order to achieve
// optimal performance with OpenGL/CUDA interop. use command-line
// specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
cutilGLDeviceInit(argc, argv);
} else {
cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
}
createPBO(&pbo);
createTexture(&textureID,image_width,image_height);
// Clean up on program exit
atexit(cleanupCuda);
runCuda();
}
initCuda(), we have several straightforward housekeeping routines cleanupCuda(), deletePBO(), and deleteTexture() that will not be discussed in this article. Please refer to the CUDA documentation for more information about the calls used in these routines.


