CUDA vs. Phi: Phi Programming for CUDA Developers


The following source code for matrix.c (Example 7) can be compiled without modification by the Intel compiler to run in the following modes:

  • Native: The entire application runs on the Phi coprocessor.
  • Offload: The host processor runs the application and offloads compute-intensive code and associated data to the coprocessor, as specified by the programmer via pragmas in the source code.
  • Host: The application runs as a traditional OpenMP program on the host processor.

More information about the syntax of the offload directive can be found in the Intel C++ Compiler XE 13.0 User and Reference Guides.

/* matrix.c (Rob Farber) */
#ifndef MIC_DEV
#define MIC_DEV 0
#endif

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <mkl.h>
#include <math.h>

// An OpenMP simple matrix multiply
void doMult(int size, float (* restrict A)[size],
	    float (* restrict B)[size], float (* restrict C)[size]) 
{
#pragma offload target(mic:MIC_DEV) \
                in(A:length(size*size)) in(B:length(size*size))	\
                out(C:length(size*size))
  {
    // Zero the C matrix
#pragma omp parallel for default(none) shared(C,size)
    for (int i = 0; i < size; ++i)
      for (int j = 0; j < size; ++j)
        C[i][j] = 0.f;
    
    // Compute matrix multiplication.
#pragma omp parallel for default(none) shared(A,B,C,size)
    for (int i = 0; i < size; ++i)
      for (int k = 0; k < size; ++k)
        for (int j = 0; j < size; ++j)
          C[i][j] += A[i][k] * B[k][j];
  }
}

float nrmsdError(int size, float (* restrict M1)[size], 
		float (* restrict M2)[size]) 
{
    double sum = 0.;
    double max, min;
    max = min = (M1[0][0] - M2[0][0]);

#pragma omp parallel for
    for (int i = 0; i < size; ++i) 
      for (int j = 0; j < size; ++j) {
	double diff = (M1[i][j] - M2[i][j]);
#pragma omp critical
	{
	  max = (max>diff)?max:diff;
	  min = (min<diff)?min:diff;
	  sum += diff*diff;
	}
      }
    
    return(sqrt(sum/(size*size))/(max-min));
}

float doCheck(int size, float (* restrict A)[size],
	      float (* restrict B)[size],
	      float (* restrict C)[size],
	      int nIter,
	      float *error) 
{
  float (*restrict At)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Bt)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Ct)[size] = malloc(sizeof(float)*size*size);
  float (*restrict Cgemm)[size] = malloc(sizeof(float)*size*size);

  // transpose to get best sgemm performance
#pragma omp parallel for
  for(int i=0; i < size; i++)
    for(int j=0; j < size; j++) {
      At[i][j] = A[j][i];
      Bt[i][j] = B[j][i];
    }

  float alpha = 1.0f, beta = 0.0f; /* Scaling factors */

  // warm up
  sgemm("N", "N", &size, &size, &size, &alpha,
	(float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
  double mklStartTime=dsecnd();
  for(int i=0; i < nIter; i++)
    sgemm("N", "N", &size, &size, &size, &alpha,
	  (float *)At, &size, (float *)Bt, &size, &beta, (float *) Ct, &size);
  double mklEndTime=dsecnd();

  // transpose in Cgemm to calculate error
#pragma omp parallel for
  for(int i=0; i < size; i++)
    for(int j=0; j < size; j++)
      Cgemm[i][j] = Ct[j][i];

  *error = nrmsdError(size, C,Cgemm);

  free(At); free(Bt); free(Ct); free(Cgemm);

  return (2e-9*size*size*size/((mklEndTime-mklStartTime)/nIter) );
}

int main(int argc, char *argv[])
{

  if(argc != 4) {
    fprintf(stderr,"Use: %s size nThreads nIter\n",argv[0]);
    return -1;
  }

  int i, j;
  int size=atoi(argv[1]);
  int nThreads=atoi(argv[2]);
  int nIter=atoi(argv[3]);
  
  omp_set_num_threads(nThreads);

  float (*restrict A)[size] = malloc(sizeof(float)*size*size);
  float (*restrict B)[size] = malloc(sizeof(float)*size*size);
  float (*restrict C)[size] = malloc(sizeof(float)*size*size);

  // Fill the A and B arrays
#pragma omp parallel for default(none) shared(A,B,size) private(i,j)
  for (i = 0; i < size; ++i) {
    for (j = 0; j < size; ++j) {
      A[i][j] = (float)i + j;
      B[i][j] = (float)i - j;
    }
  }
  
  double aveDoMultTime=0.;
  {
    // warm up
    doMult(size, A,B,C);

    double startTime = dsecnd();
    for(int i=0; i < nIter; i++) {
      doMult(size, A,B,C);
    }
    double endTime = dsecnd();
    aveDoMultTime = (endTime-startTime)/nIter;
  }

#pragma omp parallel
#pragma omp master
  printf("%s nThreads %d matrix %d %d runtime %g GFlop/s %g",
	 argv[0], omp_get_num_threads(), size, size, 
	 aveDoMultTime, 2e-9*size*size*size/aveDoMultTime);
#pragma omp barrier

  // do check
  float error=0.f;
  float mklGflop = doCheck(size,A,B,C,nIter,&error);
  printf(" mklGflop %g NRMSD_error %g", mklGflop, error);

  printf("\n");

  free(A); free(B); free(C);
  return 0;
}

Example 7: Complete source code for matrix.c.

Example 8 presents the Intel C++ compiler commands used to compile the matrix.c source code for the various modes.

# compile for host-based OpenMP
icc -mkl -O3 -no-offload -openmp -Wno-unknown-pragmas -std=c99 -vec-report3 \
	matrix.c -o matrix.omp
# compile for offload mode
icc -mkl -O3 -openmp -Wno-unknown-pragmas -std=c99 -vec-report3 \
	matrix.c -o matrix.off
# compile to run natively on the Xeon Phi
icc -mkl -O3 -mmic -openmp -L /opt/intel/lib/mic -Wno-unknown-pragmas \
	-std=c99 -vec-report3 matrix.c -o matrix.mic -liomp5

Example 8: Build script for matrix.c.

The Intel icc compiler accepts the following command-line arguments to set the operating mode:

  • -no-offload: Ignore any offload directives.
  • -offload-build: Create offload regions according to the directives in the source code.
  • -mmic: Build the executable to run natively on the Phi coprocessor.

In addition, the -mkl command-line option tells the compiler to link against MKL, while the -std=c99 option enables the restrict keyword and C99 variable-length arrays (VLAs). The MKL_MIC_ENABLE environment variable was also set to tell the MKL library to run in thunking (automatic offload) mode on the Phi coprocessor.
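
For reference, the three binaries might be exercised as follows. This is a sketch of a typical session rather than part of the original article: the mic0 hostname, the /tmp staging directory, and the libiomp5.so path are system-specific assumptions.

# host OpenMP run: 1000x1000 matrices, 12 threads, 10 timing iterations
./matrix.omp 1000 12 10
# offload run: MKL_MIC_ENABLE lets MKL automatically offload the sgemm check
export MKL_MIC_ENABLE=1
./matrix.off 1000 240 10
# native run: stage the binary and OpenMP runtime on the card, then execute
scp matrix.mic /opt/intel/lib/mic/libiomp5.so mic0:/tmp
ssh mic0 "export LD_LIBRARY_PATH=/tmp; /tmp/matrix.mic 1000 240 10"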


Figure 3 reports the runtime on a 12-core, 3.33 GHz Xeon X5680 system and a preproduction Phi coprocessor.

Figure 3: SGEMM performance of preproduction Phi coprocessors on square matrices.

CUDA MPI

The Message Passing Interface (MPI) is the standard for portable high-performance distributed computing. Both Phi coprocessors and GPUs can accelerate MPI applications in offload mode, where compute-intensive portions of each rank's work are offloaded to the attached device.

As noted in the introduction, CUDA programmers should remember that they can also run MPI and OpenMP code natively on Phi coprocessors. Owners of legacy code might find that they can simply recompile existing applications to run on the Phi coprocessor without any porting effort. Those who have already ported their MPI code to CUDA will find that the MPI calls themselves require no changes, and there are several paths to investigate for automated translation of CUDA code to Phi coprocessors.
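
To make this concrete, the following is a minimal sketch (not from the original article) of a hybrid MPI application in which each rank offloads its compute region to a locally attached Phi coprocessor. The problem size and the doubling kernel are arbitrary illustrations.

/* Hybrid MPI + offload sketch: each rank ships its slice of the
   work to a local Phi coprocessor. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int n = 1000000;                     /* illustrative problem size */
  float *x = malloc(sizeof(float)*n);
  for (int i = 0; i < n; i++) x[i] = (float)(i + rank);

  /* The MPI calls are unchanged; only the compute region is offloaded. */
#pragma offload target(mic:0) inout(x:length(n))
  {
#pragma omp parallel for
    for (int i = 0; i < n; i++)
      x[i] *= 2.0f;
  }

  double localSum = 0., globalSum = 0.;
  for (int i = 0; i < n; i++) localSum += x[i];
  MPI_Reduce(&localSum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);
  if (rank == 0) printf("sum %g\n", globalSum);

  free(x);
  MPI_Finalize();
  return 0;
}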

The Future

The good news for CUDA programmers is that Phi coprocessors provide a teraflop-capable hardware platform that should run CUDA applications with high performance. It should be relatively straightforward to create OpenMP implementations of CUDA kernels that can run in offload mode on Phi coprocessors, as the sketch below illustrates. As discussed in the introduction, it is also possible that one of the existing tools that currently allows CUDA to run on x86 processors will be extended to generate Phi coprocessor binaries in the future.
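
As an illustration of the mapping, consider how a simple CUDA saxpy kernel might be re-expressed as an OpenMP loop. This is a hand-written sketch, not the output of an automated translator:

/* CUDA original, for comparison:
     __global__ void saxpy(int n, float a, float *x, float *y) {
       int i = blockIdx.x*blockDim.x + threadIdx.x;
       if (i < n) y[i] = a*x[i] + y[i];
     }
   OpenMP version: the thread index becomes a loop index, and the
   offload pragma plays the role of the kernel launch (MIC_DEV as
   defined at the top of matrix.c). */
void saxpy(int n, float a, float * restrict x, float * restrict y)
{
#pragma offload target(mic:MIC_DEV) in(x:length(n)) inout(y:length(n))
#pragma omp parallel for
  for (int i = 0; i < n; i++)
    y[i] = a*x[i] + y[i];
}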

While porting CUDA applications to Phi coprocessors should be relatively straightforward, and the resulting binaries can be expected to run with high performance, this does not necessarily mean that the translated CUDA code will achieve the best possible performance. Developers should look to the benefits of the Phi's MIMD execution of POSIX threads, large memory pages, non-blocking algorithms, and native execution to maximize performance. In addition, care should be exercised to make sure that memory accesses that coalesce nicely on a GPU do not cause false sharing across cores on Phi coprocessors.
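
For example, a common remedy is to pad per-thread data to the coprocessor's 64-byte cache line. The following is a hypothetical sketch of a padded reduction; the structure name and padding size are illustrative assumptions:

#include <omp.h>

/* Each thread accumulates into its own slot, padded to a 64-byte
   cache line so neighboring threads do not invalidate each other's
   lines. */
typedef struct { float sum; char pad[64 - sizeof(float)]; } PaddedSum;

float reduceSum(int n, const float * restrict v)
{
  int nThreads = omp_get_max_threads();
  PaddedSum partial[nThreads];          /* one cache line per thread */

#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    partial[tid].sum = 0.f;
#pragma omp for
    for (int i = 0; i < n; i++)
      partial[tid].sum += v[i];         /* each thread touches only its line */
  }

  float total = 0.f;
  for (int t = 0; t < nThreads; t++)
    total += partial[t].sum;
  return total;
}

In practice an OpenMP reduction(+:...) clause achieves the same effect; the explicit padded array simply makes the layout hazard visible.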

Looking to the future, language platforms such as OpenACC and OpenMP will probably converge (possibly in OpenMP 4.0) so that only a change of compiler switches will be required to run on a GPU or a Phi coprocessor. As a result, we will all benefit from the teraflop performance of both devices. Regardless, the generality of the Phi coprocessor also means that it can act as a support processor for less intensive vector workloads.

CUDA programmers need to remember that the Phi is designed as a coprocessor that runs Linux. Unlike GPUs, which act only as accelerators, the Phi coprocessor can serve as a very capable support processor merely by compiling existing applications to run natively on it. Although this means that Phi coprocessors will probably not be performance stars for non-vector applications, they can still speed up applications via their many-core parallelism and high memory bandwidth.

Related Reading

Programming Intel's Xeon Phi: A Jumpstart Introduction

Update: This article was updated to correct some command-line switches in the compilation scripts.


Rob Farber is a frequent contributor to Dr. Dobb's on CPU and GPGPU-associated programming topics.

