Offloading using OpenCL¶
This chapter describes a sequence of steps to offload computation from ARM/Linux to the DSPs on TI’s heterogeneous multicore devices such as AM572, AM571 and 66AK2H. For details on offloading from ARM/TI-RTOS, see OpenCL on TI-RTOS.
Note
This matrix multiplication example is only meant to illustrate the steps required to offload using OpenCL. It has not been optimized for performance. Refer to the Optimization Tips section and the sgemm example for details on optimizing matrix multiplication and other OpenCL kernels for the DSP.
Also, this example uses the OpenCL 1.1 C++ Wrapper API for conciseness.
Matrix multiply on the ARM host¶
Listing matmul_arm.cpp performs matrix multiplication, C[N][M] = A[N][K] x B[K][M]
. It can be compiled and run on the host using the command: g++ -std=c++11 matmul_arm.cpp -o matmul
#include <cassert>
#include <cstdlib>
const int DIM = 16;
const int mat_N = DIM;
const int mat_K = DIM;
const int mat_M = DIM;
// Naive host-side matrix multiply: C[N][M] = A[N][K] x B[K][M].
// All matrices are row-major. Not optimized -- illustration only.
void mat_mpy(const float *A, const float *B, float *C, int mat_N,
int mat_K, int mat_M)
{
    for (int row = 0; row < mat_N; ++row)
    {
        for (int col = 0; col < mat_M; ++col)
        {
            // Accumulate the dot product of row `row` of A with
            // column `col` of B, then store it once.
            float acc = 0.0f;
            for (int k = 0; k < mat_K; ++k)
                acc += A[row * mat_K + k] * B[k * mat_M + col];
            C[row * mat_M + col] = acc;
        }
    }
}
/**
 * Host-only reference driver: allocates three DIM x DIM matrices,
 * fills A and B with pseudo-random values in [1, 5] (fixed seed so
 * runs are reproducible), computes C = A x B on the ARM host, and
 * releases the buffers.
 */
int main(int argc, char *argv[])
{
    size_t mat_size = DIM * DIM * sizeof(float);

    // Allocate matrices
    float *A = (float *) malloc(mat_size);
    float *B = (float *) malloc(mat_size);
    float *C = (float *) malloc(mat_size);
    // Fixed: the original asserted `C != nullptr` twice; each
    // allocation is now checked exactly once.
    assert(A != nullptr && B != nullptr && C != nullptr);

    // Initialize matrices
    srand(42);
    for (int i = 0; i < mat_N * mat_K; ++i) A[i] = rand() % 5 + 1;
    for (int i = 0; i < mat_K * mat_M; ++i) B[i] = rand() % 5 + 1;
    for (int i = 0; i < mat_N * mat_M; ++i) C[i] = 0.0;

    // Multiply matrices: C = A x B
    mat_mpy(A, B, C, mat_N, mat_K, mat_M);

    free(A);
    free(B);
    free(C);
    return 0;
}
Offloading matrix multiplication to the DSPs¶
Steps¶
Represent matrix multiplication as an OpenCL-C kernel¶
Listing OpenCL-C kernel for matrix multiplication illustrates a way of representing matrix multiplication as an OpenCL-C kernel. In this approach, each workgroup computes one column of the output C matrix (each work-item iterates over all rows of its column). The number of workgroups is equal to the number of columns in the C matrix, M
. For details on how workgroups are mapped to the DSPs, refer to Understanding Kernels, Work-groups and Work-items.
// OpenCL-C source for the matrix-multiply kernel, embedded as a raw
// string and compiled online by the OpenCL runtime. Each work-item
// (col = get_global_id(0)) produces one full column of C: it walks all
// mat_N rows for its column. mat_M is recovered from the global work
// size instead of being passed as a kernel argument.
// NOTE: comments cannot be placed inside the raw string without
// changing the kernel source text, so the body is documented here.
const std::string kernelSrc = R"(
kernel void ocl_matmpy(const global float *a,
const global float *b,
global float *c,
int mat_K,
int mat_N)
{
int col = get_global_id(0);
int mat_M = get_global_size(0);
for (int row = 0; row < mat_N; ++row)
{
c[row * mat_M + col] = 0;
for (int i = 0; i < mat_K; ++i)
c[row * mat_M + col] += a[row*mat_K+i] * b[i*mat_M+col];
}
}
)";
Allocate matrices using __malloc_<memory> functions¶
The DSPs operate out of contiguous memory. In order to avoid copies from non-contiguous host memory to contiguous DSP memory, allocate the matrices using a special contiguous allocator. Refer to Alternate Host malloc/free Extension for Zero Copy OpenCL Kernels for details.
// Allocate matrices
// __malloc_ddr returns contiguous, DSP-accessible memory, which lets
// the OpenCL buffers wrap these pointers without host<->device copies.
float *A = (float *) __malloc_ddr(mat_size);
float *B = (float *) __malloc_ddr(mat_size);
float *C = (float *) __malloc_ddr(mat_size);
Initialize the OpenCL runtime¶
Boiler-plate code to create an OpenCL context. The DSPs are modeled as a single OpenCL device of type CL_DEVICE_TYPE_ACCELERATOR. TI’s OpenCL runtime on AM57x and 66AK2x SoCs supports a single device. Refer Execution Model for details.
// Initialize context and command queue
// TI's runtime models all DSP cores as a single ACCELERATOR device,
// so devices[0] is the only device the command queue can target.
Context context(CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
CommandQueue Q (context, devices[0]);
Compile the OpenCL-C kernel¶
Use OpenCL APIs to compile the OpenCL-C kernel for the DSP. This example uses online compilation. For an overview of the various compilation modes available, refer Compilation.
// Build the OpenCL program
// Wrap the embedded kernel source (pointer + length) and compile it
// online for the DSP device.
Program::Sources source(1, std::make_pair(kernelSrc.c_str(),
kernelSrc.length()));
Program P = Program(context, source);
P.build(devices);
Create a kernel object, set up arguments¶
Listing Kernel object creates an OpenCL Kernel object and sets up the kernel arguments.
// Create kernel and set up arguments
// bufA/bufB/bufC are cl::Buffer objects created over the host
// allocations. Arguments 3 and 4 pass the K and N dimensions by value;
// M is derived inside the kernel from the global work size.
Kernel K (P, "ocl_matmpy");
K.setArg(0, bufA);
K.setArg(1, bufB);
K.setArg(2, bufC);
K.setArg(3, mat_K);
K.setArg(4, mat_N);
Call the kernel and wait for completion¶
The execution of the kernel is asynchronous. The host can perform computations that do not depend on the output matrix, C
, between the enqueueNDRangeKernel
and wait
.
// Run the kernel and wait for completion
// Enqueue mat_M work-groups of one work-item each (global size =
// mat_M, local size = 1). E.wait() blocks until the kernel's
// completion event fires.
Event E;
Q.enqueueNDRangeKernel(K, NullRange, NDRange(mat_M), NDRange(1), NULL, &E);
E.wait();
Putting it all together¶
Compile the host program with OpenCL offload using the following command: g++ -O3 -std=c++11 matmul_ocl.cpp -lOpenCL -locl_util -o matmpy
. libOpenCL.so
is TI’s OpenCL Runtime library. libocl_util.so
provides utility functions such as ocl_decode_error
.
#include <iostream>
#include <cstdlib>
#include <assert.h>
#include <utility>
#include "ocl_util.h"
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
/******************************************************************************
* C[N][M] = A[N][K] * B[K][M];
******************************************************************************/
using namespace cl;
using std::cout;
using std::cerr;
using std::endl;
// All three matrices are square DIM x DIM in this example. N, K and M
// are kept as separate named constants to mirror the general
// C[N][M] = A[N][K] x B[K][M] formulation.
const int DIM = 16;
const int mat_N = DIM;
const int mat_K = DIM;
const int mat_M = DIM;
// OpenCL-C kernel source, compiled online at runtime. Each work-item
// (col = get_global_id(0)) computes one full column of C by iterating
// over all mat_N rows; mat_M comes from get_global_size(0) rather than
// a kernel argument. Comments cannot go inside the raw string without
// altering the kernel source text, so the body is documented here.
const std::string kernelSrc = R"(
kernel void ocl_matmpy(const global float *a,
const global float *b,
global float *c,
int mat_K,
int mat_N)
{
int col = get_global_id(0);
int mat_M = get_global_size(0);
for (int row = 0; row < mat_N; ++row)
{
c[row * mat_M + col] = 0;
for (int i = 0; i < mat_K; ++i)
c[row * mat_M + col] += a[row*mat_K+i] * b[i*mat_M+col];
}
}
)";
void mat_mpy_ocl(float *A, float *B, float *C, int mat_N,
int mat_K, int mat_M, std::size_t mat_size)
{
try
{
// Initialize context and command queue
Context context(CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
CommandQueue Q (context, devices[0]);
// Build the OpenCL program
Program::Sources source(1, std::make_pair(kernelSrc.c_str(),
kernelSrc.length()));
Program P = Program(context, source);
P.build(devices);
// Create buffers from memory allocated via __malloc_ddr
Buffer bufA(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, mat_size, A);
Buffer bufB(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, mat_size, B);
Buffer bufC(context, CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR, mat_size, C);
// Create kernel and set up arguments
Kernel K (P, "ocl_matmpy");
K.setArg(0, bufA);
K.setArg(1, bufB);
K.setArg(2, bufC);
K.setArg(3, mat_K);
K.setArg(4, mat_N);
// Run the kernel and wait for completion
Event E;
Q.enqueueNDRangeKernel(K, NullRange, NDRange(mat_M), NDRange(1), NULL, &E);
E.wait();
}
catch (Error err)
{
cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
<< ocl_decode_error(err.err()) << ")" << endl;
exit(-1);
}
}
/**
 * Driver: allocates the matrices in contiguous DDR, fills A and B with
 * pseudo-random values in [1, 5] (fixed seed so runs are
 * reproducible), offloads C = A x B to the DSP, then frees the
 * contiguous buffers.
 */
int main(int argc, char *argv[])
{
    std::size_t mat_size = DIM * DIM * sizeof(float);

    // Allocate matrices (contiguous, DSP-accessible memory)
    float *A = (float *) __malloc_ddr(mat_size);
    float *B = (float *) __malloc_ddr(mat_size);
    float *C = (float *) __malloc_ddr(mat_size);
    // Fixed: the original asserted `C != nullptr` twice; each
    // allocation is now checked exactly once.
    assert(A != nullptr && B != nullptr && C != nullptr);

    // Initialize matrices
    srand(42);
    for (int i = 0; i < mat_N * mat_K; ++i) A[i] = rand() % 5 + 1;
    for (int i = 0; i < mat_K * mat_M; ++i) B[i] = rand() % 5 + 1;
    for (int i = 0; i < mat_N * mat_M; ++i) C[i] = 0.0;

    // Multiply matrices: C = A x B (computed on the DSP)
    mat_mpy_ocl(A, B, C, mat_N, mat_K, mat_M, mat_size);

    // Free the matrices (must pair with __malloc_ddr)
    __free_ddr(A);
    __free_ddr(B);
    __free_ddr(C);
    return 0;
}