Comparing Primitive Restart to Other OpenGL Rendering Methods
For convenience, a zip file of the Visual Studio project used in this example can be downloaded from GPUcomputing.net here. Once it is downloaded, perform the following steps:
- Expand the zip file, which creates the directory VS_DDJ018_VBO.
- Copy the common directory from the NVIDIA GPU Computing SDK distribution (shown below) to VS_DDJ018_VBO. This places DLLs, include files, and packages where the Visual Studio project expects them as seen in the annotated screenshot below.
- Double-click on the Visual Studio project file in VS_DDJ018_VBO\VS_DDJ018_VBO\DDJ018_VBO\DDJ018_VBO/ to start.
- Build the project by clicking on Build | Build Project.
- Right-click on the project in the Solution Explorer window and select Nsight User Properties.
- Change localhost in Connection Name to the remote IP address (e.g. 10.37.130.3) or the hostname of the target machine that is running the Parallel Nsight monitor.
Note that even the compressed zip files for Visual Studio projects can contain megabytes of data. Many of these are too large to email to a colleague. However, the following two steps can significantly reduce the amount of data that needs to be transmitted:
- Before creating a zip file, delete any Debug directories in the project folders. These will be transparently recreated when the solution is rebuilt.
- When possible, define post-build copy operations to bring needed libraries from commonly available locations. Some of the DLL files can be quite large.
The following example has modified the simpleVBO.cpp from Part 18 of this series to support three different OpenGL rendering techniques and NVTX labeling. Exactly one of the rendering techniques is selected at compile time by removing the comment slashes '//' from one of the #define statements in the body of the code.
The following three OpenGL rendering methods have been defined:
- PRIMITIVE_RESTART: This utilizes the primitive restart as described in Part 18 of this article series.
- SIMPLE_ONE_BY_ONE: Draws each TRIANGLE_FAN separately.
- MULTI_DRAW: Utilizes the OpenGL
glMultiDrawElements() API call.
By default, PRIMITIVE_RESTART is defined. Select one of the other preprocessor defines to use a different rendering method. Once changed, the code needs to be rebuilt (Build | Rebuild Project).
The full source code for simpleVBO.cpp is included below. The other files required to build this program, as well as a discussion about how OpenGL and CUDA work together in the same application, can be found in Part 18 of this article series as well as in the Visual Studio project.
/*
This wrapper demonstrates how to use the Cuda OpenGL bindings to
dynamically modify data using a Cuda kernel and display it with opengl.
The steps are:
1. Create an empty vertex buffer object (VBO)
2. Register the VBO with Cuda
3. Map the VBO for writing from Cuda
4. Run Cuda kernel to modify the vertex positions
5. Unmap the VBO
6. Render the results using OpenGL
Host code
*/
// includes, GL
#include <GL/glew.h>
#include <GL/gl.h>
#include <GL/glext.h>
// includes
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda_gl_interop.h>
#include <rendercheck_gl.h>
#include <nvToolsExt.h>
// Animation time value; declared here and defined elsewhere. It is forwarded
// to launch_kernel() as the `time` argument on each runCuda() call.
extern float animTime;
////////////////////////////////////////////////////////////////////////////////
// VBO specific code
#include <cuda_runtime.h>
#include <cutil_inline.h>
// constants
// Mesh dimensions in vertices; each VBO is sized for
// mesh_width * mesh_height elements (see createVBO).
const unsigned int mesh_width = 128;
const unsigned int mesh_height = 128;
// Index value written after every 4-vertex fan in the primitive-restart index
// list and handed to glPrimitiveRestartIndexNV() in renderCuda().
const unsigned int RestartIndex = 0xffffffff;
// Kernel launcher with C linkage, implemented outside this file. runCuda()
// passes it the mapped position and color buffers, the mesh dimensions and
// the current animation time.
extern "C"
void launch_kernel(float4* pos, uchar4* posColor,
unsigned int mesh_width, unsigned int mesh_height, float time);
// vbo variables
// Buffer object created with sizeof(float4) per vertex; bound as the vertex
// array in renderCuda().
GLuint vbo;
// Buffer object created with sizeof(uchar4) per vertex; bound as the color
// array in renderCuda().
GLuint colorVBO;
////////////////////////////////////////////////////////////////////////////////
//! Create VBO
////////////////////////////////////////////////////////////////////////////////
// Generates a GL buffer object sized for one element per mesh vertex,
// allocates its storage without initial data, and registers it with CUDA.
//   vbo      - out: receives the generated buffer object name
//   typeSize - size in bytes of a single per-vertex element
// NOTE(review): the return code of cudaGLRegisterBufferObject is not checked.
void createVBO(GLuint* vbo, unsigned int typeSize)
{
    // total storage: one typeSize-byte element for every mesh vertex
    const unsigned int numBytes = mesh_width * mesh_height * typeSize;

    // generate the buffer name and allocate (but do not fill) its storage
    glGenBuffers(1, vbo);
    glBindBuffer(GL_ARRAY_BUFFER, *vbo);
    glBufferData(GL_ARRAY_BUFFER, numBytes, 0, GL_DYNAMIC_DRAW);

    // leave no array buffer bound on return
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // make the buffer object accessible to CUDA
    cudaGLRegisterBufferObject(*vbo);
}
////////////////////////////////////////////////////////////////////////////////
//! Delete VBO
////////////////////////////////////////////////////////////////////////////////
// Releases a buffer object previously set up by createVBO().
//   vbo - in/out: pointer to the buffer object name; set to 0 on return so a
//         repeated call on the same handle is a harmless no-op.
//
// Changes relative to the earlier version:
//  * The buffer is unregistered from CUDA *before* the GL object is deleted,
//    so CUDA never refers to a buffer name that no longer exists.
//  * The glBindBuffer(1, ...) call was dropped: 1 is not a valid buffer
//    target, and glDeleteBuffers does not require the buffer to be bound.
//  * The handle is reset with the integer 0 rather than the pointer
//    constant NULL, since GLuint is an integer type.
// NOTE(review): the return code of cudaGLUnregisterBufferObject is not checked.
void deleteVBO(GLuint* vbo)
{
    // nothing to release for a missing handle or an already-deleted buffer
    if (!vbo || *vbo == 0)
        return;

    // drop the CUDA registration while the GL buffer object still exists
    cudaGLUnregisterBufferObject(*vbo);

    // delete the GL buffer object itself
    glDeleteBuffers(1, vbo);

    // mark the handle as released
    *vbo = 0;
}
void cleanupCuda()
{
deleteVBO(&vbo);
deleteVBO(&colorVBO);
}
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
// Maps both buffer objects into CUDA's address space, runs the kernel that
// writes vertex positions and colors for the current animTime, and unmaps
// the buffers again so OpenGL can render from them.
//
// The mapped pointers are initialized so that they hold a defined value even
// if a map call fails (they were previously left indeterminate), and an
// unused local index pointer has been removed.
// NOTE(review): the return codes of the map/unmap calls are not checked.
void runCuda()
{
    // device pointers filled in by the map calls below
    float4* dptr = 0;
    uchar4* cptr = 0;

    // map OpenGL buffer objects for writing from CUDA
    cudaGLMapBufferObject((void**)&dptr, vbo);
    cudaGLMapBufferObject((void**)&cptr, colorVBO);

    // execute the kernel
    launch_kernel(dptr, cptr, mesh_width, mesh_height, animTime);

    // unmap the buffer objects
    cudaGLUnmapBufferObject(vbo);
    cudaGLUnmapBufferObject(colorVBO);
}
// Selects the CUDA device used for OpenGL interop, creates the position and
// color buffers, registers exit-time cleanup, and runs the kernel once.
//   argc, argv - program command line; a "device" flag selects the CUDA device
void initCuda(int argc, char** argv)
{
    // Per the original note, the OpenGL context should be initialized before
    // the GL device is set for CUDA; NVIDIA notes this is necessary for
    // optimal OpenGL/CUDA interop performance.
    if (!cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        // no device given on the command line: use the one with highest Gflops/s
        cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
    } else {
        // honor the device specified on the command line
        cutilGLDeviceInit(argc, argv);
    }

    // positions: one float4 per vertex; colors: one uchar4 per vertex
    createVBO(&vbo, sizeof(float4));
    createVBO(&colorVBO, sizeof(uchar4));

    // make certain the VBOs get cleaned up on program exit
    atexit(cleanupCuda);

    // populate the buffers once before the first render
    runCuda();
}
// Renders the mesh held in vbo (positions) and colorVBO (colors).
//   drawMode - GL_LINE_STRIP draws line strips, GL_TRIANGLE_FAN draws one
//              4-vertex fan per grid cell using the compile-time selected
//              method below; any other value draws the vertices as points.
// The GL_TRIANGLE_FAN path is chosen at compile time by defining one of
// PRIMITIVE_RESTART, SIMPLE_ONE_BY_ONE or MULTI_DRAW. The variants declare the
// same local names (qIndices, size) in the same scope, so only one of them
// may be defined at a time.
void renderCuda(int drawMode)
{
// positions: 4 floats per vertex, tightly packed, sourced from vbo
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_VERTEX_ARRAY);
// colors: 4 unsigned bytes per vertex, tightly packed, sourced from colorVBO
glBindBuffer(GL_ARRAY_BUFFER, colorVBO);
glColorPointer(4, GL_UNSIGNED_BYTE, 0, 0);
glEnableClientState(GL_COLOR_ARRAY);
//glColor3f(1.0, 0.5, 0.0);
switch(drawMode) {
case GL_LINE_STRIP:
// one line strip for each consecutive run of mesh_width vertices
for(int i=0 ; i < mesh_width*mesh_height; i+= mesh_width)
glDrawArrays(GL_LINE_STRIP, i, mesh_width);
break;
case GL_TRIANGLE_FAN: {
// I left these in to show some alternative drawing methods
// NOTE(review): a #define placed inside the function body still stays
// defined for the remainder of the translation unit.
#define PRIMITIVE_RESTART
//#define SIMPLE_ONE_BY_ONE
//#define MULTI_DRAW
#ifdef PRIMITIVE_RESTART
// Variant 1: a single glDrawElements call; RestartIndex after every four
// indices ends the current fan and starts the next one.
nvtxRangePushA("Primitive Restart");
// index list is built on the first call and kept for later calls
// (static pointer; no matching free in this function)
static GLuint* qIndices=NULL;
// 4 vertex indices + 1 restart index per grid cell
int size = 5*(mesh_height-1)*(mesh_width-1);
if(qIndices == NULL) { // allocate and assign triangle fan indices
// NOTE(review): the malloc result is not checked before use
qIndices = (GLuint *) malloc(size*sizeof(GLint));
int index=0;
nvtxRangePushA("Mesh Init");
for(int i=1; i < mesh_height; i++) {
for(int j=1; j < mesh_width; j++) {
// the four corners of cell (i,j): (i,j), (i,j-1), (i-1,j-1), (i-1,j)
qIndices[index++] = (i)*mesh_width + j;
qIndices[index++] = (i)*mesh_width + j-1;
qIndices[index++] = (i-1)*mesh_width + j-1;
qIndices[index++] = (i-1)*mesh_width + j;
qIndices[index++] = RestartIndex;
}
}
nvtxRangePop();
}
// enable primitive restart only for the duration of this draw call
glPrimitiveRestartIndexNV(RestartIndex);
glEnableClientState(GL_PRIMITIVE_RESTART_NV);
glDrawElements(GL_TRIANGLE_FAN, size, GL_UNSIGNED_INT, qIndices);
glDisableClientState(GL_PRIMITIVE_RESTART_NV);
nvtxRangePop();
#endif
#ifdef SIMPLE_ONE_BY_ONE
// Variant 2: one glDrawElements call per grid cell.
// index list is built on the first call and kept for later calls
static GLuint* qIndices=NULL;
// 4 vertex indices per grid cell
int size = 4*(mesh_height-1)*(mesh_width-1);
nvtxRangePushA("ONE_BY_ONE");
if(qIndices == NULL) { // allocate and assign triangle fan indices
nvtxRangePushA("Mesh Init");
// NOTE(review): the malloc result is not checked before use
qIndices = (GLuint *) malloc(size*sizeof(GLint));
int index=0;
for(int i=1; i < mesh_height; i++) {
for(int j=1; j < mesh_width; j++) {
// the four corners of cell (i,j): (i,j), (i,j-1), (i-1,j-1), (i-1,j)
qIndices[index++] = (i)*mesh_width + j;
qIndices[index++] = (i)*mesh_width + j-1;
qIndices[index++] = (i-1)*mesh_width + j-1;
qIndices[index++] = (i-1)*mesh_width + j;
}
}
nvtxRangePop();
// report the allocated index count next to the number of indices written
fprintf(stderr,"size %d index %d\n",size,index);
}
nvtxRangePushA("Iteratively draw elements");
// draw each group of four indices as its own fan
for(int i=0; i < size; i +=4)
glDrawElements(GL_TRIANGLE_FAN, 4, GL_UNSIGNED_INT, &qIndices[i]);
nvtxRangePop();
nvtxRangePop();
#endif
#ifdef MULTI_DRAW
// Variant 3: all fans submitted through one glMultiDrawElements call.
nvtxRangePushA("MULTI_DRAW");
// qIndices: 4 indices per cell; qCounts: index count of each fan;
// qIndex: pointer to the first index of each fan. All three are built on the
// first call and kept for later calls.
// NOTE(review): indices are stored as GLint but drawn as GL_UNSIGNED_INT;
// the values written below are non-negative.
static GLint* qIndices=NULL;
static GLint* qCounts=NULL;
static GLint** qIndex=NULL;
// number of grid cells, i.e. number of fans
int size = (mesh_height-1)*(mesh_width-1);
if(qIndices == NULL) { // allocate and assign triangle fan indices
nvtxRangePushA("Mesh Init");
// NOTE(review): the malloc results are not checked before use
qIndices = (GLint *) malloc(4*size*sizeof(GLint));
qCounts = (GLint *) malloc(size*sizeof(GLint));
qIndex = (GLint **) malloc(size*sizeof(GLint*));
int index=0;
for(int i=1; i < mesh_height; i++)
for(int j=1; j < mesh_width; j++) {
// the four corners of cell (i,j): (i,j), (i,j-1), (i-1,j-1), (i-1,j)
qIndices[index++] = ((i)*mesh_width + j);
qIndices[index++] = ((i)*mesh_width + j-1);
qIndices[index++] = ((i-1)*mesh_width + j-1);
qIndices[index++] = ((i-1)*mesh_width + j);
}
// every fan has four vertices and starts at its own offset in qIndices
for(int i=0; i < size; i++) qCounts[i] = 4;
for(int i=0; i < size; i++) qIndex[i] = &qIndices[i*4];
nvtxRangePop();
}
nvtxRangePushA("multidraw elements");
glMultiDrawElements(GL_TRIANGLE_FAN, qCounts,
GL_UNSIGNED_INT, (const GLvoid**)qIndex, size);
nvtxRangePop();
nvtxRangePop();
#endif
} break;
default:
// any other mode: draw every vertex as a point
glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
break;
}
// restore the client state enabled at the top of the function
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_COLOR_ARRAY);
}


