An Example
Let's take a look at the following very simple example, readTexels.cu, which demonstrates how to bind a texture to a CUDA array and sets the filterMode attribute to cudaFilterModeLinear.
//readTexels.cu
#include <stdio.h>
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
texture<float, 1, cudaReadModeElementType> texRef;
__global__ void readTexels(int n, float *d_out)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < n) {
//Note: Appendix D.2 gives formula for interpolation
float x = tex1D(texRef, float(idx));
d_out[idx] = x;
}
}
#define NUM_THREADS 256
int main()
{
int N = 10; // 10 is illustrative and should be larger in practice
int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0);
float *d_out;
// allocate space on the device for the results
cudaMalloc((void**)&d_out, sizeof(float) * N);
// allocate space on the host for the results
float *h_out = (float*)malloc(sizeof(float)*N);
// data fill array with increasing values
float *data = (float*)malloc(N*sizeof(float));
for (int i = 0; i < N; i++) data[i] = float(i);
// create a CUDA array on the device
cudaArray* cuArray;
cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);
cudaMemcpyToArray(cuArray, 0, 0, data, sizeof(float)*N, cudaMemcpyHostToDevice);
// bind a texture to the CUDA array
cudaBindTextureToArray (texRef, cuArray);
// host side settable texture attributes
texRef.normalized = false;
texRef.filterMode = cudaFilterModeLinear;
// read texels from texture
readTexels<<<nBlocks, NUM_THREADS>>>(N, d_out);
// copy texels to host
cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost);
// look at them
for (int i = 0; i << N; i++) {
printf("%f\n",h_out[i]);
}
free(h_out);
cudaFree(d_out);
cudaFreeArray(cuArray);
cudaUnbindTexture(texRef);
checkCUDAError("cuda free operations");
}
Under Linux, the following nvcc command-line can be used to build this program:
nvcc readTexel.cu –o readTexel
On the host side, the texture reference, texRef, is created with:
texture<float, 1, cudaReadModeElementType> texRef;
A CUDA array, cuArray, is allocated and initialized:
// create a CUDA array on the device cudaArray* cuArray; cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);
The texRef texture is then bound to cuArray and the texture attributes are set. In this case, we specify linear interpolation and we will not be using normalized texture coordinates.
// bind a texture to the CUDA array cudaBindTextureToArray (texRef, cuArray); // host side settable texture attributes texRef.normalized = false; texRef.filterMode = cudaFilterModeLinear;
The kernel, readTexels(), simply fetches values from the texture unit and places them in the d_out array.
//Note: Appendix D.2 gives formula for interpolation
float x = tex1D(texRef, float(idx));
d_out[idx] = x;
The d_out array is then copied back to the host and printed out on the screen. Finally, the texture is released with the call:
cudaUnbindTexture(texRef);
Playing with the attributes and data in this simple example might help clarify the processing capabilities of texture memory. For this example, you should see the following output demonstrating that the texture is interpolating between data points.
0.000000 0.500000 1.500000 2.500000 3.500000 4.500000 5.500000 6.500000 7.500000 8.500000
The following simple example, negateArray.cu, binds a 1D texture to linear memory. The texture is used to fetch floating-point values from the linear memory and the texture is then updated in-place. The results are then brought back to the host and checked for correctness.
#include <stdio.h>
#include <assert.h>
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
texture<float, 1, cudaReadModeElementType> texRef;
__global__ void kernel(int n, float *d_out)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < n) {
d_out[idx] = -tex1Dfetch(texRef, idx);
}
}
#define NUM_THREADS 256
int main()
{
int N = 2560;
int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0);
int memSize = N*sizeof(float);
// data fill array with increasing values
float *data;
data = (float*) malloc(memSize);
for (int i = 0; i < N; i++) data[i] = float(i);
float *d_a;
cudaMalloc( (void **) &d_a, memSize );
cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice );
cudaBindTexture(0,texRef,d_a,memSize);
checkCUDAError("bind");
kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);
float *h_out = (float*)malloc(memSize);
cudaMemcpy(h_out, d_a, memSize, cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy");
for (int i = 0; i <<N; i++) {
assert(data[i] == -h_out[i]);
}
printf("Correct\n");
cudaUnbindTexture(texRef);
checkCUDAError("cudaUnbindTexture");
free(h_out);
free(data);
}
There are a few minor but important differences between negateArray.cu and the previous readTexels.cu example.
The first real difference is that we allocate a linear region of memory, d_a, with cudaMalloc():
float *d_a; cudaMalloc( (void **) &d_a, memSize );
This linear memory is bound to a texture with the following:
cudaBindTexture(0,texRef,d_a,memSize);
checkCUDAError("bind");
On the device, tex1Dfetch() is used to fetch the data, which is then negated and written to d_out:
d_out[idx] = -tex1Dfetch(texRef, idx);
Please note that the kernel call passed d_a, which means that the data is updated in-place:
kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);
Finally, let's revisit the reverseArray_multiblock.cu example, which was discussed in detail in Part 3 of this series and adapt it to use texture memory. As can be seen in the source for reverseArray_multiblockTexture.cu below, only a few minor changes were needed to change from using a linear array to a texture object bound to the linear region of memory, d_a, allocated with cudaMalloc. For convenience, changes from reverseArray_multiblock.cu are highlighted with red and the "* Texture Specific *" string.
// reverseArray_multiblockTexture.cu
// includes, system
#include <stdio.h>
#include <assert.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
<font color="#FF0000">// ****************** Texture Specific *******************
// Note: default mode is cudaReadModeElementType
// section 4.3.4.1 of the NVIDIA CUDA Programming Guide</font>
texture<int, 1> tex_d_a;
// Part3: implement the kernel
__global__ void reverseArrayTexture(int *d_out, int *d_in)
{
int inOffset = blockDim.x * blockIdx.x;
int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
int in = inOffset + threadIdx.x;
int out = outOffset + (blockDim.x - 1 - threadIdx.x);
<font color="#FF0000"> // ****************** Texture Specific *******************</font>
d_out[out] = tex1Dfetch(tex_d_a,in);
}
// Program main
int main( int argc, char** argv)
{
// pointer for host memory and size
int *h_a; int dimA = 256 * 1024; // 256K elements (1MB total)
// pointer for device memory
int *d_b, *d_a;
// define grid and block size
int numThreadsPerBlock = 256;
// Part 1: compute number of blocks needed based on
// array size and desired block size
int numBlocks = dimA / numThreadsPerBlock;
// allocate host and device memory
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc( (void **) &d_a, memSize );
cudaMalloc( (void **) &d_b, memSize );
<font color="#FF0000">// ****************** Texture Specific *******************
// Bind the device array d_a to a texture object tex_d_a</font>
cudaBindTexture(NULL,tex_d_a,d_a);
checkCUDAError("Bind Texture");
// Initialize input array on host
for (int i = 0; i < dimA; ++i) { h_a[i] = i; }
// Copy host array to device array
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayTexture<<< dimGrid, dimBlock >>>( d_b, d_a );
// block until the device has completed
cudaThreadSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// device to host copy
cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// verify the data returned to the host is correct
for (int i = 0; i < dimA; i++) { assert(h_a[i] == dimA - 1 - i ); }
<font color="#FF0000">// ****************** Texture Specific *******************</font>
cudaUnbindTexture(tex_d_a);
checkCUDAError("Unbind Texture");
// free device memory
cudaFree(d_a); cudaFree(d_b);
// free host memory
free(h_a);
// If the program makes it this far, then the results are
// correct and there are no run-time errors. Good work!
printf("Correct!\n"); return 0;
}
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
In short summary form, CUDA texturing requires the following steps:
- Host (CPU) code
- Allocate/obtain memory (linear memory, pitch linear memory, or CUDA array)
- Create a texture reference object
- Currently must be at file-scope
- Bind the texture reference to memory/array
- When done
- Unbind the texture reference, free resources
- Device (kernel) code
- Fetch using texture reference
- Linear memory textures
tex1Dfetch
- Array textures and pitch linear memory
tex1D,tex2D, or tex3D
This structure can be seen in reverseArray_multiblockTexture.cu:
- Host (CPU) code:
// reverseArray_multiblockTexture.cu
...
<font color="#FF0000">// ****************** Texture Specific *******************
// Note: default mode is cudaReadModeElementType
// section 4.3.4.1 of the NVIDIA CUDA Programming Guide</font>
texture<int, 1> tex_d_a;
// Program main
int main( int argc, char** argv)
{
...
// pointer for device memory
int *d_b, *d_a;
...
cudaMalloc( (void **) &d_a, memSize );
...
<font color="#FF0000">// ****************** Texture Specific *******************
// Bind the device array d_a to a texture object tex_d_a</font>
cudaBindTexture(NULL,tex_d_a,d_a);
checkCUDAError("Bind Texture");
...
<font color="#FF0000">// ****************** Texture Specific *******************</font>
cudaUnbindTexture(tex_d_a);
checkCUDAError("Unbind Texture");
...
}
- Device (kernel) code:
// Part3: implement the kernel
__global__ void reverseArrayTexture(int *d_out, int *d_in)
{
<font color="#FF0000">// ****************** Texture Specific *******************</font>
d_out[out] = tex1Dfetch(tex_d_a,in);
}


