Example 2 implements the 2x1x2 simple autoencoder as a single myFunc()
that can be used in a least squares objective function to perform both a PCA and NLPCA analysis. The type of analysis depends on the definition of the function G()
. The C preprocessor macro IN()
fetches the example values from memory while the code in the DO_PRED
conditional compilation block allows use of this function for prediction and other applications.
Just as with matrix multiplication, this example also makes heavy use of the fused multiply-add instruction and should provide high
floating-point performance. The code for an autoencoder is also useful for benchmarking purposes because all computations occur in vector registers once the in[]
vector is read from main memory. By varying the size of the autoencoder, the programmer can adjust the number of flops per byte fetched from memory from low to very high.
// Example 2: a 2x1x2 autoencoder evaluated for one training example.
//
// Computes the network reconstruction for example vIndex and returns the
// sum of squared differences between the two outputs and the two inputs.
//   vIndex:    index of the example to evaluate.
//   P:         parameter vector (offsets and weights; P[0]..P[6] used here).
//   ex:        example data, accessed through the IN() indexing macro.
//   nExamples: number of examples (the stride used by IN()).
//   pred:      optional prediction buffer, used only when DO_PRED is
//              defined; may be NULL (e.g. _objFunc passes NULL).
inline float myFunc(const int vIndex, const float * restrict P,
                    const float * restrict ex, const int nExamples,
                    float * restrict pred)
{
   float in[2];

   // Load the input vector from the example array.
   in[0] = ex[IN(0, nExamples, vIndex)];
   in[1] = ex[IN(1, nExamples, vIndex)];

   float h1_0 = P[0];     // bottleneck neuron 0 offset
   h1_0 += in[0] * P[1];  // input 0 to the bottleneck neuron
   h1_0 += in[1] * P[2];  // input 1 to the bottleneck neuron
   h1_0 = G(h1_0);        // apply G() (identity for PCA, nonlinear for NLPCA)

   float o, sum = 0.f;

   // Bottleneck neuron to output 0.
   o = P[3];              // output 0 offset
   o += h1_0 * P[4];
#ifdef DO_PRED
   if (pred) pred[0] = o; // guard: callers such as _objFunc pass pred == NULL
#endif
   o -= in[0];            // reconstruction error for input 0
   sum += o * o;

   // Bottleneck neuron to output 1.
   o = P[5];              // output 1 offset (comment previously said "Output 0")
   o += h1_0 * P[6];
#ifdef DO_PRED
   if (pred) pred[1] = o; // guard: callers such as _objFunc pass pred == NULL
#endif
   o -= in[1];            // reconstruction error for input 1
   sum += o * o;

   return sum;
}
Example 2: Definition of a simple 2x1x2 autoencoder.
Example 3 is the complete definition of the objective function, objFunc(),
and an associated fini()
method to print out some timing information about the objective function.
// Rob Farber #include <stdlib.h> #include <string.h> #include <stdint.h> #include <malloc.h> #include <math.h> #include <omp.h> #define MIC_DEV 0 #define ALLOC alloc_if(1) free_if(0) #define FREE alloc_if(0) free_if(1) #define REUSE alloc_if(0) free_if(0) // Use a struct to pass and get data from the objective function typedef struct userData { // Data information int nExamples; __declspec(align(64)) float * restrict example; __declspec(align(64)) float * restrict param; // Timing information int isWarmup; double timeObjFunc; int countObjFunc; double timeDataLoad; double minTime, maxTime; } userData_t; // function to measure wall clock time inline double getTime() { return(omp_get_wtime());} #pragma offload_attribute (push, target (mic)) // helper macros to index into the example array #define IN(i,nExamples,j) (i*nExamples+j) #define OUT(i,nExamples,j) ((i+N_INPUT)*nExamples+j) // Define the Sigmoid #ifdef USE_LINEAR char *desc="generated_PCA_func LINEAR()"; inline float G(float x) { return( x ) ;} #define G_ESTIMATE 0 #elif USE_TANH char *desc="generated_func tanh()"; inline float G(float x) { return( tanhf(x) ) ;} #define G_ESTIMATE 7 // estimate 7 flops for G #elif LOGISTIC char *desc="generated func logistic()"; inline float G(float x) { return( 1.f/(1.f+expf(-x)) ) ;} #define G_ESTIMATE 7 // estimate flops for G #else // Use Elliott function char *desc="generated func Eliott activation: x/(1+fabsf(x))"; inline float G(float x) { return( x/(1.f+fabsf(x)) ) ;} #define G_ESTIMATE 3 // estimate flops for G #endif // This file defines the function to be evaluated #include "fcn.h" // The offload objective function double _objFunc(unsigned int n, const double * restrict x, double * restrict grad, void * restrict my_func_data) { double err; userData_t *uData = (userData_t *) my_func_data; // convert from double to float for speed for(int i=0; i < N_PARAM; i++) uData->param[i]=x[i]; int nExamples = uData->nExamples; // compiler workaround __declspec(align(64)) float 
* restrict example = uData->example; __declspec(align(64)) float * restrict param = uData->param; #pragma offload target(mic:MIC_DEV) in(param:length(N_PARAM) REUSE) out(err) in(example:length(0) REUSE) { err=0.; // initialize error here in case offload selected #pragma omp parallel for reduction(+ : err) for(int i=0; i < nExamples; i++) { float d=myFunc(i, param, example, nExamples, NULL); err += d*d; } } return sqrt(err); } #pragma offload_attribute (pop) // The optizimation library callable objective function that gathers timing information double objFunc(unsigned int n, const double * restrict x, double * restrict grad, void * restrict my_func_data) { if(grad) { fprintf(stderr,"Gradient not implemented!\n"); exit(1); } userData_t *uData = (userData_t *) my_func_data; double runTime=getTime(); double err = _objFunc(n,x,grad,my_func_data); runTime = getTime() - runTime; if(!uData->isWarmup) { // Note a maxTime of zero means this is the first call if(uData->maxTime == 0.) { uData->maxTime = uData->minTime = runTime; } uData->maxTime = (uData->maxTime > runTime)?uData->maxTime:runTime; uData->minTime = (uData->minTime < runTime)?uData->minTime:runTime; uData->timeObjFunc += runTime; uData->countObjFunc++; } return( err ); } // Called to free memory and report timing information void fini(userData_t *uData) { int nThreads=0; // Intel recommended way to get the number of threads in offload mode. 
#pragma offload target(mic:MIC_DEV) out(nThreads) { #pragma omp parallel { #pragma omp single { nThreads = omp_get_num_threads(); } } } // Ouput some information if(!uData->isWarmup) { printf("number OMP threads %d\n", nThreads); printf("DataLoadTime %g\n", uData->timeDataLoad); printf("AveObjTime %g, countObjFunc %d, totalObjTime %g\n", uData->timeObjFunc/uData->countObjFunc, uData->countObjFunc, uData->timeObjFunc); #ifdef FLOP_ESTIMATE printf("Estimated flops in myFunc %d, estimated average GFlop/s %g\n", FLOP_ESTIMATE, (((double)uData->nExamples*FLOP_ESTIMATE)/(uData->timeObjFunc/uData->countObjFunc)/1.e9) ); printf("Estimated maximum GFlop/s %g, minimum GFLop/s %g\n", (((double)uData->nExamples*FLOP_ESTIMATE)/(uData->minTime)/1.e9), (((double)uData->nExamples*FLOP_ESTIMATE)/(uData->maxTime)/1.e9) ); } #endif // free if using offload mode __declspec(align(64)) float * restrict example = uData->example;// compiler workaround __declspec(align(64)) float * restrict param = uData->param;// compiler workaround #pragma offload target(mic:MIC_DEV) in(example: length(0) FREE) in(param : length(0) FREE) {} // free on the host if(uData->example) free(uData->example); uData->example=NULL; if(uData->param) free(uData->param); uData->param=NULL; } void offloadData(userData_t *uData) { #ifdef __INTEL_OFFLOAD int nDevices =_Offload_number_of_devices(); if(nDevices == 0) { fprintf(stderr,"No devices found!\n"); exit -1; } // If necessary, perform offload transfer and allocation double startOffload=getTime(); __declspec(align(64)) float * restrict example = uData->example; // compiler workaround __declspec(align(64)) float * restrict param = uData->param; // compiler workaround int Xsiz = uData->nExamples*EXAMPLE_SIZE; // compiler workaround // Note: the in for param just allocates memory on the device #pragma offload target(mic:MIC_DEV) in(example: length(Xsiz) ALLOC) in(param : length(N_PARAM) ALLOC) {} // set data load time if using offload mode uData->timeDataLoad = 
getTime() - startOffload; #endif }
Example 3: Source listing for myFunc.h.
Using Persistent Data in Offload Mode
Walking through the myFunc.h
code, note that ALLOC
, FREE
, and REUSE
are defined using the C preprocessor as recommended in Ronald Green's "Effective Use of Compiler Features for Offload." It's important to follow Intel's notes when using persistent data in offload mode: "Always remember to specify the card in case of multi-card environments. By default, the cards are accessed in a round-robin fashion. Hence the persistent data will not be available on the other cards." For this reason, the preprocessor MIC_DEV
specifies the example code will run on card 0. This objective function utilizes persistent data when running in offload mode on the Intel Xeon Phi coprocessor. "Persistence" means that the data is loaded onto the device and kept there. This conforms to the three rules required to achieve high-performance on external processors like GPUs and the Intel Xeon Phi family. Succinctly stated, high offload computational performance with external coprocessors requires that the programmer:
- Transfer the data across the PCIe bus to the coprocessor and keep it there
- Give the coprocessor enough work to do
- Focus on data reuse within the coprocessor(s) to avoid memory bandwidth bottlenecks
Unlike the previous offload examples in this series, the code in this article allocates and transfers the data to be fit into one routine (for example, main()
discussed below), uses the data in objFunc()
, and frees it in fini()
. As Ronald Green notes, "Memory allocation is controlled by alloc_if
and free_if
, and data transfer is controlled by in/out/inout/nocopy
. The two are independent, but data can only be transferred in and out of allocated memory." The allocation is controlled by the value passed to alloc_if()
. Similarly, the memory is freed on the device as a result of the flag passed to free_if()
. The C preprocessor define
s make the code easier to read:
ALLOC
: Allocate the data but do not free it (alloc_if(1) free_if(0)
)REUSE
: do not allocate or free the data (alloc_if(0) free_if(0)
)FREE
: Free the data (alloc_if(0) free_if(1)
)
The objFunc()
method will be called many times, potentially hundreds of thousands to millions of times during a complex optimization. For this reason, only the optimization parameters contained in the vector x
are transferred to the device on each call via the in()
clause. (The offload mode assumes the optimization method runs on the host device per the mapping shown in Figure 2.) To avoid even the tiny transfer of the variable err
that holds the sum of the squares error, the out()
clause is used only to transfer the value off the device. Initialization occurs inside the offload pragma, on the device as noted in Example 4.
#pragma offload target(mic:MIC_DEV) in(param:length(N_PARAM) REUSE) out(err) in(example:length(0) REUSE) { err=0.; // initialize error here in case offload selected #pragma omp parallel for reduction(+ : err) for(int i=0; i < nExamples; i++) { float d=myFunc(i, param, example, nExamples, NULL); err += d*d; } }
Example 4: Minimizing offload data movement.
The pointer to the large example
dataset is merely refreshed by specifying length(0)
, which avoids an expensive large data transfer per objective function call. The REUSE
preprocessor define adds the appropriate alloc_if()
and free_if()
clauses, so the persistent data in example
remains intact across multiple calls. More detailed information about refreshing device pointers and uses of the length parameter in the offload pragma clauses can also be found in "Effective Use of Compiler Features for Offload."
The example
data is deallocated on the device with the following offload pragma in fini()
. The in()
clause refreshes the device pointer while the FREE
preprocessor specifies the correct alloc_if()
and free_if()
flags to deallocate the persistent memory on the device (see Example 5).
// free if using offload mode __declspec(align(64)) float * restrict example = uData->example; __declspec(align(64)) float * restrict param = uData->param; #pragma offload target(mic:MIC_DEV) in(example: length(0) FREE) \ in(param : length(0) FREE) {}
Example 5: Freeing persistent offload data.
Versions 13.0.0, 13.0.1, and 13.1.0 Intel compilers have a bug when accessing structure members in offload pragmas. For this reason, the pragma utilizes a local copy of example
and param
.