CUDA, Supercomputing for the Masses: Part 13


An Example

Let's take a look at the following simple example, readTexels.cu, which demonstrates how to bind a texture to a CUDA array and set the filterMode attribute to cudaFilterModeLinear.


//readTexels.cu
#include <stdio.h>

void checkCUDAError(const char *msg) {
  cudaError_t err = cudaGetLastError();
  if( cudaSuccess != err) {
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
    exit(EXIT_FAILURE); 
  }
} 

texture<float, 1, cudaReadModeElementType> texRef;

__global__ void readTexels(int n, float *d_out)
{
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  
  if(idx < n) {
    //Note: Appendix D.2 gives formula for interpolation
    float x = tex1D(texRef, float(idx));
    d_out[idx] = x;
  }
}

#define NUM_THREADS 256

int main()
{
  int N = 10; // 10 is illustrative and should be larger in practice
  int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0);
  float *d_out;
  
  // allocate space on the device for the results
  cudaMalloc((void**)&d_out, sizeof(float) * N);

  // allocate space on the host for the results
  float *h_out = (float*)malloc(sizeof(float)*N);

  // fill the data array with increasing values
  float *data = (float*)malloc(N*sizeof(float));
  for (int i = 0; i < N; i++) data[i] = float(i);
  
  // create a CUDA array on the device
  cudaArray* cuArray;
  cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);
  cudaMemcpyToArray(cuArray, 0, 0, data, sizeof(float)*N, cudaMemcpyHostToDevice);
  
  // bind a texture to the CUDA array
  cudaBindTextureToArray (texRef, cuArray);

  // host side settable texture attributes
  texRef.normalized = false;
  texRef.filterMode = cudaFilterModeLinear;
  
  // read texels from texture
  readTexels<<<nBlocks, NUM_THREADS>>>(N, d_out);
  
  // copy texels to host
  cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost);
  
  // look at them
  for (int i = 0; i < N; i++) {
    printf("%f\n",h_out[i]);
  }
  
  free(h_out);
  free(data);

  cudaFree(d_out);
  cudaFreeArray(cuArray);
  cudaUnbindTexture(texRef);
  checkCUDAError("cuda free operations");
}

Under Linux, the following nvcc command-line can be used to build this program:


nvcc readTexels.cu -o readTexels

On the host side, the texture reference, texRef, is created with:


texture<float, 1, cudaReadModeElementType> texRef;

A CUDA array, cuArray, is allocated and initialized:


  // create a CUDA array on the device
  cudaArray* cuArray;
  cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);

The texRef texture is then bound to cuArray and the texture attributes are set. In this case, we specify linear interpolation and we will not be using normalized texture coordinates.


// bind a texture to the CUDA array
  cudaBindTextureToArray (texRef, cuArray);

  // host side settable texture attributes
  texRef.normalized = false;
  texRef.filterMode = cudaFilterModeLinear;

The kernel, readTexels(), simply fetches values from the texture unit and places them in the d_out array.


    //Note: Appendix D.2 gives formula for interpolation
    float x = tex1D(texRef, float(idx));
    d_out[idx] = x;

The d_out array is then copied back to the host and printed out on the screen. Finally, the texture is released with the call:


  cudaUnbindTexture(texRef);

Playing with the attributes and data in this simple example might help clarify the processing capabilities of texture memory. For this example, you should see the following output demonstrating that the texture is interpolating between data points.


0.000000
0.500000
1.500000
2.500000
3.500000
4.500000
5.500000
6.500000
7.500000
8.500000
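
Where do the half-integer values come from? With unnormalized coordinates and cudaFilterModeLinear, tex1D(texRef, x) returns (1 - a)*T[i] + a*T[i+1], where xB = x - 0.5, i = floor(xB) and a = frac(xB); this is the formula from the appendix cited in the kernel comment (the hardware stores a as a 9-bit fixed-point fraction, which makes no difference for this data). The following host-only sketch (the file and function names are ours, not part of the original example) reproduces the output above from that formula:


// interpolationCheck.cu: host-only sketch of the 1D linear-filtering
// formula with clamp addressing and unnormalized coordinates (illustrative only)
#include <stdio.h>
#include <math.h>

static float expectedTexel(const float *T, int N, float x)
{
  float xB = x - 0.5f;                // shift to texel centers
  int   i  = (int)floorf(xB);         // left texel index
  float a  = xB - (float)i;           // interpolation fraction
  int lo = (i     < 0) ? 0 : ((i     > N-1) ? N-1 : i);      // clamp to [0, N-1]
  int hi = (i + 1 < 0) ? 0 : ((i + 1 > N-1) ? N-1 : i + 1);  // clamp to [0, N-1]
  return (1.0f - a)*T[lo] + a*T[hi];
}

int main()
{
  float data[10];
  for (int i = 0; i < 10; i++) data[i] = float(i);
  // prints 0.000000, 0.500000, 1.500000, ... matching the device output above
  for (int i = 0; i < 10; i++) printf("%f\n", expectedTexel(data, 10, float(i)));
  return 0;
}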

Example 1: Binding a texture to linear memory that is updated in-place.

The following simple example, negateArray.cu, binds a 1D texture to linear memory. The texture is used to fetch floating-point values from that linear memory, which is then updated in place. The results are brought back to the host and checked for correctness.


#include <stdio.h>
#include <assert.h>

void checkCUDAError(const char *msg) {
  cudaError_t err = cudaGetLastError();
  if( cudaSuccess != err) {
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
    exit(EXIT_FAILURE); 
  }
} 

texture<float, 1, cudaReadModeElementType> texRef;

__global__ void kernel(int n, float *d_out)
{
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

  if(idx < n) {
    d_out[idx] = -tex1Dfetch(texRef, idx);
  }
}

#define NUM_THREADS 256
int main()
{
  int N = 2560;
  int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0);
  int memSize = N*sizeof(float);
  
  // fill the data array with increasing values
  float *data;
  data = (float*) malloc(memSize);
  for (int i = 0; i < N; i++) data[i] = float(i);
  
  float *d_a;
  cudaMalloc( (void **) &d_a, memSize );
  cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice );
  
  cudaBindTexture(0,texRef,d_a,memSize);
  checkCUDAError("bind");
  
  kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

  float *h_out = (float*)malloc(memSize);
  cudaMemcpy(h_out, d_a, memSize, cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy");
  
  for (int i = 0; i < N; i++) {
    assert(data[i] == -h_out[i]);
  }
  printf("Correct\n");
  
  cudaUnbindTexture(texRef);
  checkCUDAError("cudaUnbindTexture");

  cudaFree(d_a);
  free(h_out);
  free(data);
}

There are a few minor but important differences between negateArray.cu and the previous readTexels.cu example.

The first real difference is that we allocate a linear region of memory, d_a, with cudaMalloc():


  float *d_a;
  cudaMalloc( (void **) &d_a, memSize );

This linear memory is bound to a texture with the following:


  cudaBindTexture(0,texRef,d_a,memSize);
  checkCUDAError("bind");

On the device, tex1Dfetch() is used to fetch the data, which is then negated and written to d_out:


    d_out[idx] = -tex1Dfetch(texRef, idx);

Note that the kernel call passes d_a, which means that the data is updated in place:


  kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

Example 2: Revisiting the reverseArray_multiblock.cu example

Finally, let's revisit the reverseArray_multiblock.cu example, which was discussed in detail in Part 3 of this series, and adapt it to use texture memory. As can be seen in the source for reverseArray_multiblockTexture.cu below, only a few minor changes were needed to switch from reading a linear array directly to reading through a texture reference bound to the linear region of memory, d_a, allocated with cudaMalloc(). For convenience, changes from reverseArray_multiblock.cu are marked with "* Texture Specific *" comments.


// reverseArray_multiblockTexture.cu

// includes, system
#include <stdio.h>
#include <assert.h>

// Simple utility function to check for CUDA runtime errors 
void checkCUDAError(const char* msg);

<font color="#FF0000">// ****************** Texture Specific *******************
// Note: default mode is cudaReadModeElementType
// section 4.3.4.1 of the NVIDIA CUDA Programming Guide</font>
texture<int, 1> tex_d_a;

// Part3: implement the kernel 
// Note: d_in is kept from the original kernel's signature, but reads now go through the tex_d_a texture
__global__ void reverseArrayTexture(int *d_out, int *d_in) 
{
  int inOffset = blockDim.x * blockIdx.x;
  int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
  int in = inOffset + threadIdx.x;
  int out = outOffset + (blockDim.x - 1 - threadIdx.x);

 <font color="#FF0000"> // ****************** Texture Specific *******************</font>
  d_out[out] = tex1Dfetch(tex_d_a,in);
}

// Program main 
int main( int argc, char** argv) 
{
  // pointer for host memory and size
  int *h_a; int dimA = 256 * 1024; // 256K elements (1MB total)
  
  // pointer for device memory
  int *d_b, *d_a;
  
  // define grid and block size
  int numThreadsPerBlock = 256;
  
  // Part 1: compute number of blocks needed based on
  // array size and desired block size 
  int numBlocks = dimA / numThreadsPerBlock;
  
  // allocate host and device memory
  size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
  h_a = (int *) malloc(memSize);
  cudaMalloc( (void **) &d_a, memSize );
  cudaMalloc( (void **) &d_b, memSize );

 <font color="#FF0000">// ****************** Texture Specific *******************
  // Bind the device array d_a to a texture object tex_d_a</font>
  cudaBindTexture(NULL,tex_d_a,d_a);
  checkCUDAError("Bind Texture");
  
  // Initialize input array on host
  for (int i = 0; i < dimA; ++i) { h_a[i] = i; }
  
  // Copy host array to device array
  cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
  
  // launch kernel
  dim3 dimGrid(numBlocks);
  dim3 dimBlock(numThreadsPerBlock);
  reverseArrayTexture<<< dimGrid, dimBlock >>>( d_b, d_a );
  
  // block until the device has completed 
  cudaThreadSynchronize();
  
  // check if kernel execution generated an error 
  // Check for any CUDA errors 
  checkCUDAError("kernel invocation");
  
  // device to host copy
  cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
  
  // Check for any CUDA errors
  checkCUDAError("memcpy");
  
  // verify the data returned to the host is correct
  for (int i = 0; i < dimA; i++) { assert(h_a[i] == dimA - 1 - i ); }
  
 <font color="#FF0000">// ****************** Texture Specific *******************</font>
  cudaUnbindTexture(tex_d_a);
  checkCUDAError("Unbind Texture");

  // free device memory 
  cudaFree(d_a); cudaFree(d_b);
  
  // free host memory
  free(h_a);
  
  // If the program makes it this far, then the results are
  // correct and there are no run-time errors. Good work!
  printf("Correct!\n"); return 0;
}

void checkCUDAError(const char *msg) {
  cudaError_t err = cudaGetLastError();
  if( cudaSuccess != err) {
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
    exit(EXIT_FAILURE); 
  }
} 

In summary, CUDA texturing requires the following steps:

  • Host (CPU) code
    • Allocate/obtain memory (linear memory, pitch linear memory, or CUDA array)
    • Create a texture reference object
      • Currently must be at file-scope
    • Bind the texture reference to the memory/array
    • When done: unbind the texture reference and free resources
  • Device (kernel) code
    • Fetch using the texture reference
      • Linear memory textures: tex1Dfetch
      • Array textures and pitch linear memory: tex1D, tex2D, or tex3D
This structure can be seen in reverseArray_multiblockTexture.cu:

  • Host (CPU) code:


// reverseArray_multiblockTexture.cu
   ...
<font color="#FF0000">// ****************** Texture Specific *******************
// Note: default mode is cudaReadModeElementType
// section 4.3.4.1 of the NVIDIA CUDA Programming Guide</font>
texture<int, 1> tex_d_a;

// Program main 
int main( int argc, char** argv) 
{
    ...
    // pointer for device memory
  int *d_b, *d_a;
  ...
  cudaMalloc( (void **) &d_a, memSize );
...
  <font color="#FF0000">// ****************** Texture Specific *******************
  // Bind the device array d_a to a texture object tex_d_a</font>
  cudaBindTexture(NULL,tex_d_a,d_a);
  checkCUDAError("Bind Texture");
...
     <font color="#FF0000">// ****************** Texture Specific *******************</font>
  cudaUnbindTexture(tex_d_a);
  checkCUDAError("Unbind Texture");
...
}

  • Device (kernel) code:


// Part3: implement the kernel 
__global__ void reverseArrayTexture(int *d_out, int *d_in) 
{
  <font color="#FF0000">// ****************** Texture Specific *******************</font>
  d_out[out] = tex1Dfetch(tex_d_a,in);
}
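
All of the examples in this article use 1D textures. As the summary list notes, a texture bound to a CUDA array can also be read with tex2D() or tex3D(). For completeness, here is a minimal sketch of the 2D case using the same texture-reference API as the examples above; the file name, the copyTexels2D kernel, the 64x64 size, and the choice of point filtering are all illustrative and not taken from the original article:


// sketch2DTexture.cu: illustrative 2D texture example (not from the article)
#include <stdio.h>
#include <stdlib.h>

texture<float, 2, cudaReadModeElementType> texRef2D;

// copy each texel of a width x height texture into a linear output array
__global__ void copyTexels2D(int width, int height, float *d_out)
{
  int x = blockIdx.x*blockDim.x + threadIdx.x;
  int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x < width && y < height) {
    // unnormalized coordinates; +0.5f addresses the texel center
    d_out[y*width + x] = tex2D(texRef2D, x + 0.5f, y + 0.5f);
  }
}

int main()
{
  const int width = 64, height = 64;
  size_t memSize = width*height*sizeof(float);

  float *h_data = (float*)malloc(memSize);
  for (int i = 0; i < width*height; i++) h_data[i] = float(i);

  // allocate a 2D CUDA array and copy the host data into it
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  cudaArray *cuArray;
  cudaMallocArray(&cuArray, &desc, width, height);
  cudaMemcpy2DToArray(cuArray, 0, 0, h_data, width*sizeof(float),
                      width*sizeof(float), height, cudaMemcpyHostToDevice);

  // host side settable texture attributes, then bind to the CUDA array
  texRef2D.normalized = false;
  texRef2D.filterMode = cudaFilterModePoint;
  cudaBindTextureToArray(texRef2D, cuArray);

  float *d_out;
  cudaMalloc((void**)&d_out, memSize);

  // launch one thread per texel
  dim3 dimBlock(16, 16);
  dim3 dimGrid(width/dimBlock.x, height/dimBlock.y);
  copyTexels2D<<<dimGrid, dimBlock>>>(width, height, d_out);

  float *h_out = (float*)malloc(memSize);
  cudaMemcpy(h_out, d_out, memSize, cudaMemcpyDeviceToHost);
  printf("h_out[0]=%f h_out[last]=%f\n", h_out[0], h_out[width*height - 1]);

  cudaUnbindTexture(texRef2D);
  cudaFreeArray(cuArray);
  cudaFree(d_out);
  free(h_out); free(h_data);
  return 0;
}

cudaMemcpy2DToArray() is used for the host-to-array copy because a CUDA array is opaque and its internal pitch need not match the packed row width of the host buffer.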

