Channels ▼
RSS

Programming Intel's Xeon Phi: A Jumpstart Introduction


Scaling Behavior According to Matrix Size

The source code for matrix.c (Listing Five) can be compiled without modification by the Intel compiler to run in the following modes:

  • Native: The entire application runs on the Intel Xeon Phi.
  • Offload: The host processor runs the application and offloads compute intensive code and associated data to the device as specified by the programmer via pragmas in the source code.
  • Host: Run the code as a traditional OpenMP application on the host.

Regression testing and verification of results is an essential part of any programming project, and so to validate the correctness of the tutorial matrix multiplication (Listing One), the doCheck() method has been added to call the highly optimized, known-to-be-correctly working Intel MKL (Math Kernel Library) sgemm() matrix multiplication routine.

Unfortunately, it is not possible to perform a bit-wise comparison between the matrices calculated by doMult() and sgemm(). Floating-point arithmetic is only approximate, which means that even slight variations in the order in which the arithmetic operations are performed (say when compiling with different optimization flags or as a result of running on different devices) can cause applications compiled from the same source code to produce slightly different numerical results. For this reason, the nrmsdError() calculates a Normalized Root–Mean-Squared Deviation (NRMSD) as a measure of similarity. The NRMSD value will be small when the matrices are similar.

The doCheck() method also returns the average runtime of the MKL sgemm(). Just like our example code, the the MKL library can run in three modes: natively on the coprocessor, in offload mode, and on the host processor. The MKL runtime reported by doCheck() provides that ability to observe how fast highly-optimized code can run a coprocessor and compare performance against a similarly highly optimized code running on the host processor.

Listing Five: Complete source code for matrix.c.

/* matrix.c (Rob Farber) */
#ifndef MIC_DEV
#define MIC_DEV 0
#endif

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <mkl.h>
#include <math.h>

// An OpenMP simple matrix multiply
void doMult(int size, float (* restrict A)[size],
	    float (* restrict B)[size], float (* restrict C)[size]) 
{
#pragma offload target(mic:MIC_DEV) \
                in(A:length(size*size)) in( B:length(size*size))	\
                out(C:length(size*size))
  {
    // Zero the C matrix
#pragma omp parallel for default(none) shared(C,size)
    for (int i = 0; i < size; ++i)
      for (int j = 0; j < size; ++j)
        C[i][j] =0.f;
    
    // Compute matrix multiplication.
#pragma omp parallel for default(none) shared(A,B,C,size)
    for (int i = 0; i < size; ++i)
      for (int k = 0; k < size; ++k)
        for (int j = 0; j < size; ++j)
          C[i][j] += A[i][k] * B[k][j];
  }
}

float nrmsdError(int size, float (* restrict M1)[size], 
		float (* restrict M2)[size]) 
{
    double sum=0.;
    double max,min;
    max=min=(M1[0][0]- M2[0][0]);

#pragma omp parallel for
    for (int i = 0; i < size; ++i) 
      for (int j = 0; j < size; ++j) {
	double diff = (M1[i][j]- M2[i][j]);
#pragma omp critical
	{
	  max = (max>diff)?max:diff;
	  min = (min<diff)?min:diff;
	  sum += diff*diff;
	}
      }
    
    return(sqrt(sum/(size*size))/(max-min));
}

float doCheck(int size, float (* restrict A)[size],
	      float (* restrict B)[size],
	      float (* restrict C)[size],
	      int nIter,
	      float *error) 
{
  float (*restrict At)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Bt)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Ct)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Cgemm)[size] = malloc(sizeof(float)*size*size);

  // transpose to get best sgemm performance
#pragma omp parallel for
  for(int i=0; i < size; i++)
    for(int j=0; j < size; j++) {
      At[i][j] = A[j][i];
      Bt[i][j] = B[j][i];
    }

  float alpha = 1.0f, beta = 0.0f; /* Scaling factors */

  // warm up
  sgemm("N", "N", &size, &size, &size, &alpha,
	(float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
  double mklStartTime=dsecnd();
  for(int i=0; i < nIter; i++)
    sgemm("N", "N", &size, &size, &size, &alpha,
	  (float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
  double mklEndTime=dsecnd();

  // transpose in Cgemm to calculate error
  #pragma omp parallel for
  for(int i=0; i < size; i++)
    for(int j=0; j < size; j++)
      Cgemm[i][j] = Ct[j][i];

  *error = nrmsdError(size, C,Cgemm);

  free(At); free(Bt); free(Ct); free(Cgemm);

  return (2e-9*size*size*size/((mklEndTime-mklStartTime)/nIter) );
}

int main(int argc, char *argv[])
{

  if(argc != 4) {
    fprintf(stderr,"Use: %s size nThreads nIter\n",argv[0]);
    return -1;
  }

  int i,j,k;
  int size=atoi(argv[1]);
  int nThreads=atoi(argv[2]);
  int nIter=atoi(argv[3]);
  
  omp_set_num_threads(nThreads);

  float (*restrict A)[size] = malloc(sizeof(float)*size*size);
  float (*restrict B)[size] = malloc(sizeof(float)*size*size);
  float (*restrict C)[size] = malloc(sizeof(float)*size*size);

  // Fill the A and B arrays
#pragma omp parallel for default(none) shared(A,B,size) private(i,j,k)
  for (i = 0; i < size; ++i) {
    for (j = 0; j < size; ++j) {
      A[i][j] = (float)i + j;
      B[i][j] = (float)i - j;
    }
  }
  
  double aveDoMultTime=0.;
  {
    // warm up
    doMult(size, A,B,C);

    double startTime = dsecnd();
    for(int i=0; i < nIter; i++) {
      doMult(size, A,B,C);
    }
    double endTime = dsecnd();
    aveDoMultTime = (endTime-startTime)/nIter;
  }

#pragma omp parallel
#pragma omp master
  printf("%s nThreads %d matrix %d %d runtime %g GFlop/s %g",
	 argv[0], omp_get_num_threads(), size, size, 
	 aveDoMultTime, 2e-9*size*size*size/aveDoMultTime);
#pragma omp barrier

  // do check
  float error=0.f;
  float mklGflop = doCheck(size,A,B,C,nIter,&error);
  printf(" mklGflop %g NRMSD_error %g", mklGflop, error);

  printf("\n");

  free(A); free(B); free(C);
  return 0;
}

The Intel compiler commands in Listing Six are used to compile the matrix.c source code for the various modes:

Listing Six: Build script for matrix.c.

# compile for host-based OpenMP
icc -mkl -O3 -no-offload -openmp -Wno-unknown-pragmas -std=c99 -vec-report3 \
	matrix.c -o matrix.omp
# compile for offload mode
icc -mkl -O3 -offload-build -Wno-unknown-pragmas -std=c99 -vec-report3 \
matrix.c -o matrix.off 
# compile to run natively on the Xeon Phi
icc -mkl -O3 -mmic -openmp -L  /opt/intel/lib/mic -Wno-unknown-pragmas \
-std=c99 -vec-report3 matrix.c -o matrix.mic -liomp5


Related Reading


More Insights






Currently we allow the following HTML tags in comments:

Single tags

These tags can be used alone and don't need an ending tag.

<br> Defines a single line break

<hr> Defines a horizontal line

Matching tags

These require an ending tag - e.g. <i>italic text</i>

<a> Defines an anchor

<b> Defines bold text

<big> Defines big text

<blockquote> Defines a long quotation

<caption> Defines a table caption

<cite> Defines a citation

<code> Defines computer code text

<em> Defines emphasized text

<fieldset> Defines a border around elements in a form

<h1> This is heading 1

<h2> This is heading 2

<h3> This is heading 3

<h4> This is heading 4

<h5> This is heading 5

<h6> This is heading 6

<i> Defines italic text

<p> Defines a paragraph

<pre> Defines preformatted text

<q> Defines a short quotation

<samp> Defines sample computer code text

<small> Defines small text

<span> Defines a section in a document

<s> Defines strikethrough text

<strike> Defines strikethrough text

<strong> Defines strong text

<sub> Defines subscripted text

<sup> Defines superscripted text

<u> Defines underlined text

Dr. Dobb's encourages readers to engage in spirited, healthy debate, including taking us to task. However, Dr. Dobb's moderates all comments posted to our site, and reserves the right to modify or remove any content that it determines to be derogatory, offensive, inflammatory, vulgar, irrelevant/off-topic, racist or obvious marketing or spam. Dr. Dobb's further reserves the right to disable the profile of any commenter participating in said activities.

 
Disqus Tips To upload an avatar photo, first complete your Disqus profile. | View the list of supported HTML tags you can use to style comments. | Please read our commenting policy.
 


Video