Channels ▼
RSS

C/C++

CUDA, Supercomputing for the Masses: Part 18


Artificial Terrain With a Perlin Noise Generator

Now let's have some fun and create a 3D virtual terrain world!

The following example adapts the Improved Perlin Noise generator from Part 15 to generate 3D coordinates and color based on elevation. For simplicity, fBm (Fractal Brownian Motion) was chosen to generate the fractal terrain. Better methods exist to create more realistic landscapes as noted here.

Essentially four changes were made to the original Part 15 code:

  • The call to k_perlin was modified so the position and color buffers can be passed to the kernel.
  • launch_kernel was modified to pass the position and color buffers.
  • The routine fBm was added to calculate Fractal Brownian Motion.
  • The terrain was colored according to elevation.


 colorPos[idx] = colorElevation(w);
 float u = ((float) (idx%width))/(float) width;
 float v = ((float) (idx/width))/(float) height;
 u = u*2.f - 1.f; // center the mesh
 v = v*2.f - 1.f;
 w = (w>0.f)?w:0.f; // don't show region underwater
 pos[idx] = make_float4( u, w, v, 1.0f);

The complete listing for perlinKernelVBO.cu is as follows:



// perlinKernelVBO.cu (Rob Farber)
#include <cutil_math.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda_gl_interop.h>

float gain=0.75f;
float xStart=2.f;
float yStart=1.f;
float zOffset = 0.0f;
float octaves = 2.f;
float lacunarity = 2.0f;
#define Z_PLANE 50.f

__constant__ unsigned char c_perm[256];
__shared__ unsigned char s_perm[256]; // shared memory copy of permuation array
unsigned char* d_perm=NULL; // global memory copy of permutation array
// host version of permutation array
const static unsigned char h_perm[] = {151,160,137,91,90,15,
   131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
   190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
   88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
   77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
   102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
   135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
   5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
   223,183,170,213,119,248,152,2,44,154,163, 70,221,153,101,155,167, 43,172,9,
   129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
   251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
   49,192,214, 31,181,199,106,157,184,84,204,176,115,121,50,45,127, 4,150,254,
   138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
   };

__device__ inline int perm(int i) { return(s_perm[i&0xff]); }
__device__ inline float fade(float t) { return t * t * t * (t * (t * 6.f - 15.f) + 10.f); }
__device__ inline float lerpP(float t, float a, float b) { return a + t * (b - a); }
__device__ inline float grad(int hash, float x, float y, float z) {
  int h = hash & 15;                      // CONVERT LO 4 BITS OF HASH CODE
  float u = h<8 ? x : y,                 // INTO 12 GRADIENT DIRECTIONS.
    v = h<4 ? y : h==12||h==14 ? x : z;
  return ((h&1) == 0 ? u : -u) + ((h&2) == 0 ? v : -v);
}

__device__ float inoise(float x, float y, float z) {
  int X = ((int)floorf(x)) & 255, // FIND UNIT CUBE THAT
    Y = ((int)floorf(y)) & 255,   // CONTAINS POINT.
    Z = ((int)floorf(z)) & 255;
  x -= floorf(x);               // FIND RELATIVE X,Y,Z
  y -= floorf(y);               // OF POINT IN CUBE.
  z -= floorf(z);
  float u = fade(x),            // COMPUTE FADE CURVES
    v = fade(y),                // FOR EACH OF X,Y,Z.
    w = fade(z);
  int A = perm(X)+Y, AA = perm(A)+Z, AB = perm(A+1)+Z, // HASH COORDINATES OF
    B = perm(X+1)+Y, BA = perm(B)+Z, BB = perm(B+1)+Z; // THE 8 CUBE CORNERS,
  
  return lerpP(w, lerpP(v, lerpP(u, grad(perm(AA), x  , y  , z   ), // AND ADD
				 grad(perm(BA), x-1.f, y  , z   )),   // BLENDED
			lerpP(u, grad(perm(AB), x  , y-1.f, z   ),    // RESULTS
			      grad(perm(BB), x-1.f, y-1.f, z   ))),     // FROM  8
	       lerpP(v, lerpP(u, grad(perm(AA+1), x  , y  , z-1.f ),  // CORNERS
			      grad(perm(BA+1), x-1.f, y  , z-1.f )),    // OF CUBE
		     lerpP(u, grad(perm(AB+1), x  , y-1.f, z-1.f ),
			   grad(perm(BB+1), x-1.f, y-1.f, z-1.f ))));
#ifdef ORIG
  return(perm(X));
#endif
 
}

__device__ float fBm(float x, float y, int octaves,
		     float lacunarity = 2.0f, float gain = 0.5f)
{
  float freq = 1.0f, amp = 0.5f;
  float sum = 0.f;  
  for(int i=0; i<octaves; i++) {
    sum += inoise(x*freq, y*freq, Z_PLANE)*amp;
    freq *= lacunarity;
    amp *= gain;
  }
  return sum;
}


__device__ inline uchar4 colorElevation(float texHeight)
{
  uchar4 pos;

  // color textel (r,g,b,a)
       if (texHeight < -1.000f) pos = make_uchar4(000, 000, 128, 255); //deeps
  else if (texHeight < -.2500f) pos = make_uchar4(000, 000, 255, 255); //shallow
  else if (texHeight < 0.0000f) pos = make_uchar4(000, 128, 255, 255); //shore
  else if (texHeight < 0.0125f) pos = make_uchar4(240, 240, 064, 255); //sand
  else if (texHeight < 0.1250f) pos = make_uchar4(032, 160, 000, 255); //grass
  else if (texHeight < 0.3750f) pos = make_uchar4(224, 224, 000, 255); //dirt
  else if (texHeight < 0.7500f) pos = make_uchar4(128, 128, 128, 255); //rock
  else                          pos = make_uchar4(255, 255, 255, 255); //snow

  return(pos);
}

void checkCUDAError(const char *msg) {
  cudaError_t err = cudaGetLastError();
  if( cudaSuccess != err) {
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
    exit(EXIT_FAILURE); 
  }
} 

//Simple kernel fills an array with perlin noise
__global__ void k_perlin(float4* pos, uchar4 *colorPos,
			  unsigned int width, unsigned int height, 
			 float2 start, float2 delta, float gain, float zOffset,
			 unsigned char* d_perm, float octaves, float lacunarity)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float xCur = start.x + ((float) (idx%width)) * delta.x;
  float yCur = start.y + ((float) (idx/width)) * delta.y;

  if(threadIdx.x < 256)
    // Optimization: this causes bank conflicts
    s_perm[threadIdx.x] = d_perm[threadIdx.x];
  // this synchronization can be important if there are more that 256 threads
  __syncthreads();

  
  // Each thread creates one pixel location in the texture (textel)
  if(idx < width*height) {
    float w = fBm(xCur, yCur, octaves, lacunarity, gain) + zOffset;
    
    colorPos[idx] = colorElevation(w);
    float u = ((float) (idx%width))/(float) width;
    float v = ((float) (idx/width))/(float) height;
    u = u*2.f - 1.f;  // center the mesh
    v = v*2.f - 1.f;
    w = (w>0.f)?w:0.f; // don't show region underwater
    pos[idx] = make_float4( u, w, v, 1.0f);
  }
}


uchar4 *eColor=NULL;
// Wrapper for the __global__ call that sets up the kernel call
extern "C" void launch_kernel(float4 *pos, uchar4* posColor,
			      unsigned int image_width, unsigned int image_height,
			      float time)
{
  int nThreads=256; // must be equal or larger than 256! (see s_perm)
  int totalThreads = image_height * image_width;
  int nBlocks = totalThreads/nThreads; 
  nBlocks += ((totalThreads%nThreads)>0)?1:0;
  
  float xExtent = 10.f;
  float yExtent = 10.f;
  float xDelta = xExtent/(float)image_width;
  float yDelta = yExtent/(float)image_height;
  
  if(!d_perm) { // for convenience allocate and copy d_perm here
    cudaMalloc((void**) &d_perm,sizeof(h_perm));
    cudaMemcpy(d_perm,h_perm,sizeof(h_perm),cudaMemcpyHostToDevice);
    checkCUDAError("d_perm malloc or copy failed!");
  }

  k_perlin<<< nBlocks, nThreads>>>(pos, posColor, image_width, image_height,
				   make_float2(xStart, yStart),
				   make_float2(xDelta, yDelta),
				   gain, zOffset, d_perm,
				   octaves, lacunarity);
  
  // make certain the kernel has completed 
  cudaThreadSynchronize();
  checkCUDAError("kernel failed!");
}

Minor changes were also made to the callbacksVBO.cpp file to so the mouse movements with the middle button pressed translate the image along the Z-axis. In addition support for the keyboard commands in Table 2 was added.

Table 2: Keyboard commands.

The following is the complete source for perlinCallbacksVBO.cpp.


// perlinCallbacksVBO.cpp (Rob Farber)
// includes, GL
#include <GL/glew.h>

// includes
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda_gl_interop.h>
#include <rendercheck_gl.h>

extern float animTime;

// The user must create the following routines:
void initCuda(int argc, char** argv);
void runCuda();
void renderCuda(int);

// Callbacks

int drawMode=GL_TRIANGLE_FAN; // the default draw mode

// mouse controls
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

//! Display callback for GLUT
//! Keyboard events handler for GLUT
//! Display callback for GLUT
void display()
{
  glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

  // set view matrix
  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();
  glTranslatef(0.0, 0.0, translate_z);
  glRotatef(rotate_x, 1.0, 0.0, 0.0);
  glRotatef(rotate_y, 0.0, 1.0, 0.0);

  // run CUDA kernel to generate vertex positions
  runCuda();

  // render the data
  renderCuda(drawMode);
  
  glutSwapBuffers();
  glutPostRedisplay();
  
  animTime += 0.01;
}

extern float xStart,yStart,zOffset;
extern float gain, octaves, lacunarity;

//! Keyboard events handler for GLUT
void keyboard(unsigned char key, int x, int y)
{
  switch(key) {
  case(27) : // exit
  case('q') :
    exit(0);
    break;
  case '+': // lower the ocean level
    zOffset += 0.01;
    zOffset = (zOffset > 1.0)? 1.0:zOffset; // guard input
    break;
  case '-': // raise the ocean level
    zOffset -= 0.01;
    zOffset = (zOffset < -1.0)? -1.0:zOffset; // guard input
    break;
  case 'k': // move withing Perlin function
    yStart -= 0.1;
    break;
  case 'j': 
    yStart += 0.1;
    break;
  case 'l': 
    xStart += 0.1;
    break;
  case 'h': 
    xStart -= 0.1;
    break;
  case 'd':
  case 'D':
    switch(drawMode) {
    case GL_POINTS: drawMode = GL_LINE_STRIP; break;
    case GL_LINE_STRIP: drawMode = GL_TRIANGLE_FAN; break;
    default: drawMode=GL_POINTS;
    } break;
  case 'I': // change gain
    gain += 0.25;
    break;
  case 'i': // change gain
    gain -= 0.25;
    gain = (gain < 0.25)?0.25:gain; // guard input
    break;
  case 'O': // change octaves
    octaves += 1.0f;
    octaves = (octaves > 8)?8:octaves; // guard input
    break;
  case 'o': // change octaves
    octaves -= 1.0f;
    octaves = (octaves<2)?2:octaves; // guard input
    break;
  case 'P': // change lacunarity
    lacunarity += 0.25;
    break;
  case 'p': // change lacunarity
    lacunarity -= 0.25;
    lacunarity = (lacunarity<0.2)?0.2:lacunarity; // guard input
    break;
  }
  glutPostRedisplay();
}

// Mouse event handlers for GLUT
void mouse(int button, int state, int x, int y)
{
  if (state == GLUT_DOWN) {
    mouse_buttons |= 1<<button;
  } else if (state == GLUT_UP) {
    mouse_buttons = 0;
  }
  
  mouse_old_x = x;
  mouse_old_y = y;
  glutPostRedisplay();
}

void motion(int x, int y)
{
  float dx, dy;
  dx = x - mouse_old_x;
  dy = y - mouse_old_y;
  
  if (mouse_buttons & 1) {
    rotate_x += dy * 0.2;
    rotate_y += dx * 0.2;
  } else if (mouse_buttons & 4) {
    translate_z += dy * 0.01;
  }
 rotate_x = (rotate_x < -60.)?-60.:(rotate_x > 60.)?60:rotate_x;
 rotate_y = (rotate_y < -60.)?-60.:(rotate_y > 60.)?60:rotate_y;
  
  mouse_old_x = x;
  mouse_old_y = y;
}

The code can be built with the following command after the complete source has been saved to appropriately named files:


#/bin/bash
SDK_PATH=PATH_TO_NVIDIA_GPU_Computing_SDK

nvcc -O3 -L $SDK_PATH/C/lib -I $SDK_PATH/C/common/inc simpleGLmain.cpp simpleVBO
.cpp perlinCallbacks.cpp perlinKernelVBO.cu -lglut -lGLEW -lcutil_x86_64 -o test
GL

Figure 13 is an example terrain that can be created after using the defined keys and mouse movements to make a pleasing picture.

Figure 13: An example 3D surface created using Perlin Noise

Summary

Mixing CUDA and visualization opens tremendous opportunities for commercial games and visual products as well as scientific applications. The examples in this article demonstrate that the current generation of CUDA-enabled graphics processors can both render and generate very complex data at hundreds of frames per second.

In particular, this article attempts to point the way to an extraordinarily simple and flexible way for CUDA developers to generate and render 3D images using the OpenGL standards compliant primitive restart capability so that minimal host processor interaction is required. As a result, PCIe bottlenecks and latencies can be avoided to deliver high-performance high-quality graphics even when the images require irregular meshes and/or computationally expensive data generation. Of course, this is of interest when generating very realistic images on high-end GPUs but do not forget that this same technique can enable product penetration into the mid- and lower-performance markets as well!

Finally, the simple OpenGL software framework introduced in Part 15 of this series that was initially used to generate and view 2D pictures has been adapted -- with minor modification -- to support fully animated and interactive 3D images and landscapes. Hopefully, this software will provide the starting point for many new projects.

As shown in Part 17 of this series, this same software framework can easily be adapted to exploit the full class and inheritance capabilities of C++ plus a vast collection of Cg libraries and other existing visualization software!

References


Rob Farber is a senior scientist at Pacific Northwest National Laboratory. He has worked in massively parallel computing at several national laboratories and as co-founder of several startups. He can be reached at rmfarber@gmail.com.


Related Reading


More Insights






Currently we allow the following HTML tags in comments:

Single tags

These tags can be used alone and don't need an ending tag.

<br> Defines a single line break

<hr> Defines a horizontal line

Matching tags

These require an ending tag - e.g. <i>italic text</i>

<a> Defines an anchor

<b> Defines bold text

<big> Defines big text

<blockquote> Defines a long quotation

<caption> Defines a table caption

<cite> Defines a citation

<code> Defines computer code text

<em> Defines emphasized text

<fieldset> Defines a border around elements in a form

<h1> This is heading 1

<h2> This is heading 2

<h3> This is heading 3

<h4> This is heading 4

<h5> This is heading 5

<h6> This is heading 6

<i> Defines italic text

<p> Defines a paragraph

<pre> Defines preformatted text

<q> Defines a short quotation

<samp> Defines sample computer code text

<small> Defines small text

<span> Defines a section in a document

<s> Defines strikethrough text

<strike> Defines strikethrough text

<strong> Defines strong text

<sub> Defines subscripted text

<sup> Defines superscripted text

<u> Defines underlined text

Dr. Dobb's encourages readers to engage in spirited, healthy debate, including taking us to task. However, Dr. Dobb's moderates all comments posted to our site, and reserves the right to modify or remove any content that it determines to be derogatory, offensive, inflammatory, vulgar, irrelevant/off-topic, racist or obvious marketing or spam. Dr. Dobb's further reserves the right to disable the profile of any commenter participating in said activities.

 
Disqus Tips To upload an avatar photo, first complete your Disqus profile. | View the list of supported HTML tags you can use to style comments. | Please read our commenting policy.
 

Video