A First Implementation
The following code is a complete C-language implementation of a serial, OpenMP, and OpenACC CGS test code. This code fills a matrix with random numbers and then performs a Classical Gram-Schmidt orthogonalization. The orthogonality of the results is checked. If desired, the user can call the printOctave() method to check the results with Octave or Matlab.
// Classical Gram-Schmidt Rob Farber
#include <stdio.h>
#include <stdlib.h>
#include <accelmath.h>
#include <omp.h>
#ifndef ROWS
#define ROWS 640
#endif
#ifndef COLS
#define COLS 640
#endif
// The _OPENACC variable defined by the OpenACC compiler.
#ifdef _OPENACC
// Use the OpenACC implementation if _OPENACC is defined
void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols)
{
#pragma acc data copy(Q[0:rows][0:cols])
for(int k=0; k < cols; k++) {
double tmp = 0.;
#pragma acc parallel reduction(+:tmp)
for(int i=0; i < rows; i++) tmp += (Q[i][k] * Q[i][k]);
tmp = sqrt(tmp);
#pragma acc parallel loop
for(int i=0; i < rows; i++) Q[i][k] /= tmp;
#pragma acc parallel loop
for(int j=k+1; j < cols; j++) {
tmp=0.;
for(int i=0; i < rows; i++) tmp += Q[i][k] * Q[i][j];
for(int i=0; i < rows; i++) Q[i][j] -= tmp * Q[i][k];
}
}
}
#else
// If _OPENACC is not defined then use OMP/serial code for performance testing
void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols)
{
for(int k=0; k < cols; k++) {
double tmp = 0.;
#pragma omp parallel for reduction(+:tmp)
for(int i=0; i < rows; i++) tmp += (Q[i][k] * Q[i][k]);
tmp = sqrt(tmp);
#pragma omp parallel for
for(int i=0; i < rows; i++) Q[i][k] /= tmp;
#pragma omp parallel for reduction(+:tmp)
for(int j=k+1; j < cols; j++) {
tmp=0.;
for(int i=0; i < rows; i++) tmp += Q[i][k] * Q[i][j];
for(int i=0; i < rows; i++) Q[i][j] -= tmp * Q[i][k];
}
}
}
#endif
//Allows loading of the matrices into octave/matlab for testing
void printOctave(char* var, float A[][COLS], int rows, int cols)
{
// if((rows*cols) > 1000) return;
printf("%s=[\n",var);
for(int i=0; i < rows; i++) {
for(int j=0; j < cols; j++) printf("%c%e", (j==0)?' ':',', A[i][j]);
printf(";");
}
printf("];\n");
}
// checks that the results are orthonogal where they should be
void checkOctave(restrict float A[][COLS], int rows, int cols)
{
int found_error=0;
for(int c1=0; c1 < cols; c1++)
for(int c2=c1; c2 < cols; c2++) {
double sum=0.;
for(int i=0; i < rows; i++) sum += A[i][c1] * A[i][c2];
if(c1 == c2) { // should be near 1 (unit length)
if(sum < 0.9) {
printf("Failed unit length: %d %d %g\n", c1,c2,sum);
found_error = 1;
exit(1);
}
} else { // should be very small (orthogonal)
if(sum > 0.1) {
printf("Failed orthonogonal %d %d %g\n", c1,c2,sum);
found_error = 1;
exit(1);
}
}
}
if(!found_error) printf("Check OK!\n");
}
//Simple driver using random values
int main(int argc, char *argv[])
{
int rows=ROWS;
int cols=COLS;
float (*A)[COLS] = (float(*)[COLS])malloc(sizeof(float)*rows*cols);
// fill matrix A with random numbers
for(int i=0; i < rows; i++)
for(int j=0; j < cols; j++)
A[i][j] = (float)rand()/ (float)RAND_MAX;
printf("Done with init!\n");
//printOctave("A", A, rows, cols);
double startTime = omp_get_wtime();
gramSchmidt(A,rows, cols);
double endTime = omp_get_wtime();
printf("runtime %d %d matrix %g\n",rows, cols, (endTime-startTime));
checkOctave(A, rows, cols);
//printOctave("Q", A, rows, cols);
free(A);
}
Example 6: Source for cgs-01.c>
Note that the OpenACC implementation closely mirrors the OpenMP implementation. The only real difference between the OpenMP and OpenACC code (aside from slight syntax differences in the pragmas) is the data clause that tells the OpenACC compiler to copy the matrix Q to the accelerator at the beginning of the gramSchmidt() method and return the modified Q matrix just prior to the return. Since the scope of the k loop encompasses the entire gramSchmidt() method, no additional curly brackets are required. The j loop demonstrates how the parallel loop can enclose multiple work-sharing loops. (A final note on the copy pragma: do not be tempted to eliminate the "0:" in the copy clause as "copy(Q[rows][cols]" tells the OpenACC compiler to copy just one element. Some compilers will recognize this as an error and perform a copy operation on the entire array. For portability reasons, make certain the copy clause for C code correctly specifies the entire region.)
// Use the OpenACC implementation if _OPENACC is defined
void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols)
{
#pragma acc data copy(Q[0:rows][0:cols])
for(int k=0; k < cols; k++) {
double tmp = 0.;
#pragma acc parallel reduction(+:tmp)
for(int i=0; i < rows; i++) tmp += (Q[i][k] * Q[i][k]);
tmp = sqrt(tmp);
#pragma acc parallel loop
for(int i=0; i < rows; i++) Q[i][k] /= tmp;
#pragma acc parallel loop
for(int j=k+1; j < cols; j++) {
tmp=0.;
for(int i=0; i < rows; i++) tmp += Q[i][k] * Q[i][j];
for(int i=0; i < rows; i++) Q[i][j] -= tmp * Q[i][k];
}
}
}
Example 7: The OpenACC gramSchmidt method.
Combining both OpenMP and OpenACC pragmas in the implementation of the gramSchmidt() method clearly demonstrates the similarity between the pragmas and that it is not necessary to write separate OpenMP and OpenACC methods. The following code compiles correctly for OpenMP and OpenACC without the need for an #ifdef _OPENACC preprocessor conditional:
void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols)
{
#pragma acc data copy(Q[0:rows][0:cols])
for(int k=0; k < cols; k++) {
double tmp = 0.;
#pragma omp parallel for reduction(+:tmp)
#pragma acc parallel reduction(+:tmp)
for(int i=0; i < rows; i++) tmp += (Q[i][k] * Q[i][k]);
tmp = sqrt(tmp);
#pragma omp parallel for
#pragma acc parallel loop
for(int i=0; i < rows; i++) Q[i][k] /= tmp;
#pragma omp parallel for reduction(+:tmp)
#pragma acc parallel loop
for(int j=k+1; j < cols; j++) {
tmp=0.;
for(int i=0; i < rows; i++) tmp += Q[i][k] * Q[i][j];
for(int i=0; i < rows; i++) Q[i][j] -= tmp * Q[i][k];
}
}
}
Example 8: Combining OpenACC and OpenMP pragmas in a single method.


