Scaling Behavior According to Matrix Size
The source code for matrix.c (Listing Five) can be compiled without modification by the Intel compiler to run in the following modes:
Native: The entire application runs on the Intel Xeon Phi.Offload: The host processor runs the application and offloads compute intensive codeand associated data to the device as specified by the programmer via pragmas in the source code.Host: Run the code as a traditional OpenMP application on the host.
Regression testing and verification of results is an essential part of any programming project, and so to validate the correctness of the tutorial matrix multiplication (Listing One), the doCheck() method has been added to call the highly optimized, known-to-be-correctly working Intel MKL (Math Kernel Library) sgemm() matrix multiplication routine.
Unfortunately, it is not possible to perform a bit-wise comparison between the matrices calculated by doMult() and sgemm(). Floating-point arithmetic is only approximate, which means that even slight variations in the order in which the arithmetic operations are performed (say when compiling with different optimization flags or as a result of running on different devices) can cause applications compiled from the same source code to produce slightly different numerical results. For this reason, the nrmsdError() calculates a Normalized Root–Mean-Squared Deviation (NRMSD) as a measure of similarity. The NRMSD value will be small when the matrices are similar.
The doCheck() method also returns the average runtime of the MKL sgemm(). Just like our example code, the the MKL library can run in three modes: natively on the coprocessor, in offload mode, and on the host processor. The MKL runtime reported by doCheck() provides that ability to observe how fast highly-optimized code can run a coprocessor and compare performance against a similarly highly optimized code running on the host processor.
Listing Five: Complete source code for matrix.c.
/* matrix.c (Rob Farber) */
#ifndef MIC_DEV
#define MIC_DEV 0
#endif
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <mkl.h>
#include <math.h>
// An OpenMP simple matrix multiply
void doMult(int size, float (* restrict A)[size],
float (* restrict B)[size], float (* restrict C)[size])
{
#pragma offload target(mic:MIC_DEV) \
in(A:length(size*size)) in( B:length(size*size)) \
out(C:length(size*size))
{
// Zero the C matrix
#pragma omp parallel for default(none) shared(C,size)
for (int i = 0; i < size; ++i)
for (int j = 0; j < size; ++j)
C[i][j] =0.f;
// Compute matrix multiplication.
#pragma omp parallel for default(none) shared(A,B,C,size)
for (int i = 0; i < size; ++i)
for (int k = 0; k < size; ++k)
for (int j = 0; j < size; ++j)
C[i][j] += A[i][k] * B[k][j];
}
}
float nrmsdError(int size, float (* restrict M1)[size],
float (* restrict M2)[size])
{
double sum=0.;
double max,min;
max=min=(M1[0][0]- M2[0][0]);
#pragma omp parallel for
for (int i = 0; i < size; ++i)
for (int j = 0; j < size; ++j) {
double diff = (M1[i][j]- M2[i][j]);
#pragma omp critical
{
max = (max>diff)?max:diff;
min = (min<diff)?min:diff;
sum += diff*diff;
}
}
return(sqrt(sum/(size*size))/(max-min));
}
float doCheck(int size, float (* restrict A)[size],
float (* restrict B)[size],
float (* restrict C)[size],
int nIter,
float *error)
{
float (*restrict At)[size] = malloc(sizeof(float)*size*size);
float (*restrict Bt)[size] = malloc(sizeof(float)*size*size);
float (*restrict Ct)[size] = malloc(sizeof(float)*size*size);
float (*restrict Cgemm)[size] = malloc(sizeof(float)*size*size);
// transpose to get best sgemm performance
#pragma omp parallel for
for(int i=0; i < size; i++)
for(int j=0; j < size; j++) {
At[i][j] = A[j][i];
Bt[i][j] = B[j][i];
}
float alpha = 1.0f, beta = 0.0f; /* Scaling factors */
// warm up
sgemm("N", "N", &size, &size, &size, &alpha,
(float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
double mklStartTime=dsecnd();
for(int i=0; i < nIter; i++)
sgemm("N", "N", &size, &size, &size, &alpha,
(float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
double mklEndTime=dsecnd();
// transpose in Cgemm to calculate error
#pragma omp parallel for
for(int i=0; i < size; i++)
for(int j=0; j < size; j++)
Cgemm[i][j] = Ct[j][i];
*error = nrmsdError(size, C,Cgemm);
free(At); free(Bt); free(Ct); free(Cgemm);
return (2e-9*size*size*size/((mklEndTime-mklStartTime)/nIter) );
}
int main(int argc, char *argv[])
{
if(argc != 4) {
fprintf(stderr,"Use: %s size nThreads nIter\n",argv[0]);
return -1;
}
int i,j,k;
int size=atoi(argv[1]);
int nThreads=atoi(argv[2]);
int nIter=atoi(argv[3]);
omp_set_num_threads(nThreads);
float (*restrict A)[size] = malloc(sizeof(float)*size*size);
float (*restrict B)[size] = malloc(sizeof(float)*size*size);
float (*restrict C)[size] = malloc(sizeof(float)*size*size);
// Fill the A and B arrays
#pragma omp parallel for default(none) shared(A,B,size) private(i,j,k)
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
A[i][j] = (float)i + j;
B[i][j] = (float)i - j;
}
}
double aveDoMultTime=0.;
{
// warm up
doMult(size, A,B,C);
double startTime = dsecnd();
for(int i=0; i < nIter; i++) {
doMult(size, A,B,C);
}
double endTime = dsecnd();
aveDoMultTime = (endTime-startTime)/nIter;
}
#pragma omp parallel
#pragma omp master
printf("%s nThreads %d matrix %d %d runtime %g GFlop/s %g",
argv[0], omp_get_num_threads(), size, size,
aveDoMultTime, 2e-9*size*size*size/aveDoMultTime);
#pragma omp barrier
// do check
float error=0.f;
float mklGflop = doCheck(size,A,B,C,nIter,&error);
printf(" mklGflop %g NRMSD_error %g", mklGflop, error);
printf("\n");
free(A); free(B); free(C);
return 0;
}
The Intel compiler commands in Listing Six are used to compile the matrix.c source code for the various modes:
Listing Six: Build script for matrix.c.
# compile for host-based OpenMP icc -mkl -O3 -no-offload -openmp -Wno-unknown-pragmas -std=c99 -vec-report3 \ matrix.c -o matrix.omp # compile for offload mode icc -mkl -O3 -offload-build -Wno-unknown-pragmas -std=c99 -vec-report3 \ matrix.c -o matrix.off # compile to run natively on the Xeon Phi icc -mkl -O3 -mmic -openmp -L /opt/intel/lib/mic -Wno-unknown-pragmas \ -std=c99 -vec-report3 matrix.c -o matrix.mic -liomp5


