Creating and Using Libraries with OpenACC


To interface with CUBLAS, the following CUDA source file, cublasInterface.cu, defines the CUDA methods required to initialize the CUBLAS library, call the sgemm() routine, and wait for completion of asynchronous CUDA calls. The NVIDIA nvcc C++ compiler is used to compile the cublasInterface.cu source file, which means that each of the CUDA methods must be prefaced with extern "C" to prevent name mangling.

// cublasInterface.cu
#include <cublas_v2.h>
#include <stdio.h>
#include <stdlib.h>

cublasHandle_t handle;

// Create the CUBLAS context. Must be called once before any cublas_sgemm() call.
extern "C"
void cublas_Init()
{
  cublasStatus_t status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    exit(1);
  }
}

// Wrapper around cublasSgemm(). Note that transa and transb are accepted for
// interface compatibility but ignored: the operation is hard-coded to CUBLAS_OP_N.
extern "C"
void cublas_sgemm(char transa, char transb, int l, int n, int m, float alpha,
		  float *A, int lda, float *B, int ldb, float beta, float *C, int ldc)
{
  cublasStatus_t status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, l, n, m,
                                      &alpha, A, lda, B, ldb, &beta, C, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! Sgemm error\n");
    exit(1);
  }
}

// Block the host until all previously issued CUDA work has completed.
// (cudaThreadSynchronize() is deprecated in later CUDA releases in favor of
// cudaDeviceSynchronize(), but it is what the CUDA 4.2 toolchain used here provides.)
extern "C"
void cuda_WaitForCompletion()
{
  cudaThreadSynchronize();
}
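
The matrix-lib.c source that follows includes a small header, testcublas.h, that is not listed here. A minimal version would simply declare the three C-linkage wrappers defined in cublasInterface.cu; the sketch below is an assumption about its contents, written to match the definitions above.

/* testcublas.h -- assumed prototypes for the cublasInterface.cu wrappers */
#ifndef TESTCUBLAS_H
#define TESTCUBLAS_H

void cublas_Init(void);
void cublas_sgemm(char transa, char transb, int l, int n, int m, float alpha,
                  float *A, int lda, float *B, int ldb, float beta, float *C, int ldc);
void cuda_WaitForCompletion(void);

#endif /* TESTCUBLAS_H */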

Following is the complete source for the test code matrix-lib.c.

/* matrix-lib.c */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <acml.h>
#include <openacc.h>
#include "testcublas.h"

// An OpenACC simple matrix multiply.
void doMult_acc(int size, float (* restrict A)[size], float (* restrict B)[size], float (* restrict C)[size]) 
{
  // Compute matrix multiplication.
#pragma acc kernels pcopyin(A[0:size][0:size],B[0:size][0:size]) pcopyout(C[0:size][0:size])
  {
    for (int i = 0; i < size; ++i)
      for (int j = 0; j < size; ++j)
	C[i][j] =0.f;

    for (int i = 0; i < size; ++i) {
      for (int k = 0; k < size; ++k) {
	for (int j = 0; j < size; ++j) {
	  C[i][j] += A[i][k] * B[k][j];
	}
      }
    }
  }
}

// An OpenMP simple matrix multiply
void doMult_omp(int size, float (* restrict A)[size], float (* restrict B)[size], float (* restrict C)[size]) 
{
#pragma omp parallel for default(none) shared(A,B,C,size)
  for (int i = 0; i < size; ++i)
    for (int j = 0; j < size; ++j)
      C[i][j]=0.f;

  // Compute matrix multiplication.
#pragma omp parallel for default(none) shared(A,B,C,size)
  for (int i = 0; i < size; ++i) {
    for (int k = 0; k < size; ++k) {
      for (int j = 0; j < size; ++j) {
	C[i][j] += A[i][k] * B[k][j];
      }
    }
  }
}

// Initialization required when CUBLAS is going to be used on an Nvidia GPU
void initSgemm()
{
#ifdef _OPENACC
  if( acc_get_device_type() == acc_device_nvidia) {
    cublas_Init();
  }
#endif
}

//A generic sgemm that calls cublas when ACC_DEVICE_TYPE=nvidia and acml otherwise
void doSgemm(int size, float (* restrict A)[size], float (* restrict B)[size], float (* restrict C)[size]) 
{
#ifdef _OPENACC
  if( acc_get_device_type() == acc_device_nvidia) {

    // workaround while waiting for the host_data use_device pragma and clause to work
    // allocate space at a known device pointer location
    float (*restrict At)[size] = acc_malloc(sizeof(float)*size*size);
    float (*restrict Bt)[size] = acc_malloc(sizeof(float)*size*size);
    float (*restrict Ct)[size] = acc_malloc(sizeof(float)*size*size);

#pragma acc data pcopyin(A[0:size][0:size],B[0:size][0:size]) pcopyout(C[0:size][0:size]) deviceptr(At,Bt,Ct)
    {
      // copy data on the device to the known pointer locations 
#pragma acc kernels loop 
      for(int i=0; i< size; i++)
        for(int j=0; j < size; j++) {
          At[i][j] = A[j][i];
          Bt[i][j] = B[j][i];
        }

      cublas_sgemm('N', 'N', size, size, size, 1.0, (float*)At, size, (float*)Bt, size, 0.0, (float*)Ct, size);
      cuda_WaitForCompletion(); // A production code would probably not wait for completion here

#pragma acc kernels loop 
      for(int i=0; i< size; i++)
	for(int j=0; j < size; j++)
	  C[i][j] = Ct[j][i];
    }
    acc_free(At); acc_free(Bt); acc_free(Ct);
    return;
  }
#endif
  { // use host based sgemm

    float (*restrict At)[size] = malloc(sizeof(float)*size*size);
    float (*restrict Bt)[size] = malloc(sizeof(float)*size*size);
    float (*restrict Ct)[size] = malloc(sizeof(float)*size*size);
    
    // use transposed matrices
    for(int i=0; i < size; i++)
      for(int j=0; j < size; j++) {
	At[i][j] = A[j][i];
	Bt[i][j] = B[j][i];
      }
    
    sgemm('N', 'N', size, size, size, 1.0, (float*)At, size, (float*)Bt, size, 0.0, (float*)Ct, size);
    
    // transpose the matrix back for calculation of the error
    for(int i=0; i < size; i++)
      for(int j=0; j < size; j++) {
	C[i][j] = Ct[j][i];
      }
    
    free(At); free(Bt); free(Ct);
  }
}

void fill(int size, float (* restrict A)[size], float (* restrict B)[size], float (* restrict C)[size]) 
{
  for (int i = 0; i < size; ++i) {
    for (int j = 0; j < size; ++j) {
      //A[i][j] = ((float)i + j); B[i][j] = ((float)i - j);
      A[i][j] = random()/(double)RAND_MAX; B[i][j] = random()/(double)RAND_MAX;
      C[i][j]=0.f;
      }
    }
}

float rmsdError( int size, float (* restrict M1)[size], float (* restrict M2)[size]) 
{
    double sum=0.;
#pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < size; ++i) 
      for (int j = 0; j < size; ++j) {
	sum += (M1[i][j]- M2[i][j])*(M1[i][j]- M2[i][j]);
      }
    
    return(sqrt(sum/(size*size)));
}

void checkMult(int size, float (* restrict A)[size], float (* restrict B)[size], 
	       float (* restrict C)[size], float runtime) 
{
  char *deviceName="host";
  double startTime, endTime;
  float (*restrict Chost)[size] = malloc(sizeof(float)*size*size);

#ifdef _OPENACC
  if( acc_get_device_type() == acc_device_nvidia) 
    deviceName = "Nvidia";
#endif

  printf("check %d %d matrix using %s optimized methods\n",size, size, deviceName);
  printf("\tACC \t%s\t runtime %8.5g\n",deviceName, runtime);

  // compare results and runtimes against an optimized matrix multiply
  initSgemm();
  startTime = omp_get_wtime();
  doSgemm(size, A, B,Chost);
  endTime = omp_get_wtime();

  printf("\tSgemm \t%s\t runtime %8.5g,\t rmsdError %8.5g\n",
	 deviceName,(endTime-startTime),rmsdError(size,C,Chost));

  // Check results and runtime against the simple matrix multiply using OpenMP
  startTime = omp_get_wtime();
  doMult_omp(size,A,B,Chost);
  endTime = omp_get_wtime();

  printf("\tOMP \t%s\t runtime %8.5g,\t rmsdError %8.5g\n",
	 "host",(endTime-startTime),rmsdError(size,C,Chost));
  free(Chost);
}

int main(int argc, char *argv[])
{

  if(argc != 4) {
    fprintf(stderr,"Use: %s size nIter doCheck\n",argv[0]);
    return -1;
  }

  int size=atoi(argv[1]);
  int nIter=atoi(argv[2]);
  int doCheck=atoi(argv[3]);
  
  if(nIter <= 0) {
    fprintf(stderr,"%s: Invalid nIter (%d)\n",argv[0],nIter);
    return -1;
  }

  // print out some OMP info
#pragma omp parallel
#pragma omp master
  printf("%s running with OMP_NUM_THREADS=%d\n", argv[0], omp_get_num_threads());

  float (*restrict A)[size] = malloc(sizeof(float)*size*size);
  float (*restrict B)[size] = malloc(sizeof(float)*size*size);
  float (*restrict C)[size] = malloc(sizeof(float)*size*size);

  fill(size,A,B,C);
  
  double aveRuntime=0.;
#pragma acc data pcopyin(A[0:size][0:size],B[0:size][0:size]) pcopyout(C[0:size][0:size])
  {
#pragma omp single
    for(int i=0; i < nIter; i++) {
      double startTime = omp_get_wtime();
      doMult_acc(size,A,B,C);
      double endTime = omp_get_wtime();
      aveRuntime += endTime-startTime;
    }
    aveRuntime /= nIter;

    printf("%s matrix %d %d average runtime over %d iter is %g\n",argv[0],size, size, nIter, aveRuntime);
  }

  if(doCheck) {
    checkMult(size,A,B,C,aveRuntime);
  }

  free(A); free(B); free(C);

  return 0;
}
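
The acc_malloc() buffers and transpose kernels in doSgemm() are, as the comment in that routine notes, only a workaround for the host_data use_device directive not yet working in the PGI 12.6 compiler. With a compiler that supports the directive, the GPU branch could look something like the sketch below, which passes the device addresses of the OpenACC-managed arrays straight to the CUBLAS wrapper and handles the row-major/column-major mismatch by swapping the A and B operands, since (A*B)^T = B^T * A^T. This is an untested illustration, not a drop-in replacement verified with the toolchain used here.

#ifdef _OPENACC
  if( acc_get_device_type() == acc_device_nvidia) {
#pragma acc data pcopyin(A[0:size][0:size],B[0:size][0:size]) pcopyout(C[0:size][0:size])
    {
#pragma acc host_data use_device(A,B,C)
      {
        // cublas_sgemm() hard-codes CUBLAS_OP_N, so swap the operands: the row-major
        // buffers are column-major transposes, and B^T * A^T = (A*B)^T lands the
        // product in C with row-major layout, with no transpose copies or acc_malloc().
        cublas_sgemm('N', 'N', size, size, size, 1.0, (float*)B, size, (float*)A, size, 0.0, (float*)C, size);
        cuda_WaitForCompletion();
      }
    }
    return;
  }
#endif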

Running the binaries requires three command-line arguments (an example invocation follows the list):

  1. The size of the square matrix.
  2. The number of times the OpenACC matrix multiplication should be called to determine an average runtime.
  3. A flag to indicate if the application needs to verify the results.
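
For example, an invocation such as the following (the argument values are illustrative) multiplies two 1000 x 1000 matrices, averages the OpenACC runtime over three calls, and checks the result against the library and OpenMP versions:

./test-mmult-lib.nvidia 1000 3 1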

Figure 3 shows how the size of the matrix affects the runtime behavior of the following matrix multiplication implementations:

  • The sgemm() in the optimized multithreaded x86 ACML BLAS library running on all the host cores.
  • The sgemm() in the optimized GPU CUBLAS library.
  • A fast, simple OpenMP matrix multiplication (doMult_omp()) running on all the host cores.
  • A fast, simple OpenACC matrix multiplication (doMult_acc()) running on both the GPU and all four Xeon x86 processor cores.


Figure 3: Runtimes of various square matrix multiplications on both host and GPU devices.

The graph shows that the OpenACC GPU implementation outperforms the optimized, multithreaded ACML SGEMM library call (running on four Xeon cores) and appears to compare well to the optimized CUBLAS implementation. Both the host-based OpenMP and OpenACC binaries clearly run slower than the GPU and ACML implementations at larger matrix sizes. It will be interesting to see if the OpenACC host-based performance improves relative to the OpenMP performance in the future as the compiler technology matures.

Note that data transfer time across the PCIe bus was not counted in the CUBLAS and OpenACC runtimes, as the data is assumed to reside on the device due to the use of OpenACC conditional data clauses. However, the time to transpose the matrices was considered part of the sgemm() runtime for both the host-based ACML and GPU-based CUBLAS methods. Adventurous readers can instrument the code to measure the speed of the sgemm() functions without counting matrix transposition times. BLAS also provides options to work directly with transposed matrices.
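
As one illustration of that last point, the same operand swap used in the host_data sketch above also removes the transpose copies from the host path; this is a hedged sketch assuming the ACML sgemm() C interface shown earlier is used unchanged:

    // Sketch: transpose-free host path, relying on (A*B)^T = B^T * A^T.
    // The row-major buffers, viewed column-major, are already the transposes,
    // so passing B as the first operand leaves the product in C in row-major order.
    sgemm('N', 'N', size, size, size, 1.0, (float*)B, size, (float*)A, size, 0.0, (float*)C, size);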

The following bash script was used to compile the CUDA and OpenACC source files and to run the survey that generated the results shown in Figure 3. The script builds dedicated executables for both host and NVIDIA OpenACC devices, as shown by the -ta=nvidia and -ta=host command-line arguments passed to pgcc. In addition, the ACC_DEVICE_TYPE environment variable was used to specify the intended device prior to running the executable as a workaround for binaries created with the PGI 12.6 compiler. Later versions should allow compilation of a single binary with -ta=host,nvidia and specification of the appropriate device solely through ACC_DEVICE_TYPE.

INC=/usr/include/x86_64-linux-gnu
PGI=/opt/pgi/linux86-64/2012
ACML=$PGI/acml/5.1.0
CUDA_PATH=$PGI/cuda/4.2

nvcc -c cublasInterface.cu
pgcc -acc -O3 -ta=nvidia -mp -Minfo -fast -I $CUDA_PATH/include -L $CUDA_PATH/lib64 -I $INC -L $ACML/lib matrix-lib.c cublasInterface.o -lacml_mp -lcublas -lcudart -pgf90libs -lacc1 -o test-mmult-lib.nvidia

pgcc -acc -O3 -ta=host -mp -Minfo -fast -I $CUDA_PATH/include -L $CUDA_PATH/lib64 -I $INC -L $ACML/lib matrix-lib.c cublasInterface.o -lacml_mp -lcublas -lcudart -pgf90libs -lacc1 -o test-mmult-lib.host

export OMP_NUM_THREADS=4
export N_ITER=3

for SIZE in 100 200 300 400 500 600 700 800 900 1000 1500 2000 2500 3000 3500 4000 4500 5000 6000 7000 8000 9000 10000
do
	echo $SIZE
	echo "--------- HOST --------------"
	export ACC_DEVICE_TYPE=host
	./test-mmult-lib.$ACC_DEVICE_TYPE $SIZE $N_ITER 1
	echo "--------- NVIDIA --------------"
	export ACC_DEVICE_TYPE=nvidia
	./test-mmult-lib.$ACC_DEVICE_TYPE $SIZE $N_ITER 1
done

Conclusion

The OpenACC specification provides the features needed to create transparent, yet efficient, scientific and commercial applications for both CPUs and GPUs from a single source tree. It does not matter whether the preferred source language is C, Fortran, or some mix of languages. Conditional execution based on device type, together with the ability to call external methods written in other languages, means developers can write generic libraries that can select optimized algorithms at runtime for each device. Even so, methods written entirely in OpenACC show promising performance compared to optimized libraries.

Performance regression testing found the Xeon processor was sensitive to the order of the loops that perform the matrix multiplication. Changing the code slightly resulted in a 20-times speed-up of the OpenMP code on a quad-core Xeon processor. In contrast, an NVIDIA C2050 running OpenACC proved to be robust against performance degradation caused by the loop ordering. This indicates that latency hiding on the GPU due to thread-level parallelism and hardware scheduling is working. The reader is encouraged to investigate further to see if these results can be generalized to support (or refute) the hypothesis that TLP-based computer architectures are more robust in supporting nested parallelism than traditional SMP/vector processor designs. Correctly structuring nested loops to most efficiently exploit hardware parallelism is an important problem in computer science and a key question in high-performance parallel application design.
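
To make the loop-ordering observation concrete, the sketch below contrasts the textbook i-j-k ordering with the i-k-j ordering that doMult_omp() above actually uses; a plausible reading of the change the author describes is that moving j to the innermost position lets the Xeon stream through contiguous rows of B and C instead of striding down columns.

// Textbook i-j-k ordering: the innermost loop walks down a column of B, so
// nearly every iteration touches a new cache line on a row-major array.
for (int i = 0; i < size; ++i)
  for (int j = 0; j < size; ++j)
    for (int k = 0; k < size; ++k)
      C[i][j] += A[i][k] * B[k][j];

// i-k-j ordering, as in doMult_omp(): the innermost loop walks rows of B and C
// contiguously, which is cache-friendly and easy for the compiler to vectorize.
for (int i = 0; i < size; ++i)
  for (int k = 0; k < size; ++k)
    for (int j = 0; j < size; ++j)
      C[i][j] += A[i][k] * B[k][j];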

The ability to run on both the host and GPU devices without recompilation provides a convenient and necessary capability for portable, high-performance applications. It is likely that this "one source tree that can create one binary for all devices" characteristic will motivate many to consider OpenACC for all their application needs. HPC users, in particular, will certainly look to exploit the ability to generate executable functions for host and GPU devices from one source file when building hybrid applications. Such applications will be able to concurrently utilize all the host and device capabilities in each node of a supercomputer. The freedom to mix both OpenMP and OpenACC pragmas in the same source code will also motivate many to start exploring OpenACC as a path to accelerator programming.


Rob Farber is an analyst who writes frequently on high-performance computing hardware topics.

