Example: Optimizing 3x3 Gaussian smoothing filter¶
This section describes a step-by-step approach to optimizing the 3x3 Gaussian smoothing filter kernel for the C66x DSP.
Overview of Gaussian Filter¶
The Gaussian Filter is used as a smoothing filter. The filter is applied by convolving a nxn image window with a nxn Gaussian kernel and obtaining a weighted sum. More on the filter is available here: https://homepages.inf.ed.ac.uk/rbf/HIPR2/gsmooth.htm
The kernel size that we are using here is a 3x3 kernel. Let A be a 3x3 image window and B be the 3x3 Gaussian kernel. The filter is applied by convolving A and B and A is obtained in a sliding window fashion.
Natural C Code¶
The first listing is a snippet of C code for convolution:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | float image[image_size];
float gaussian_kernel[9];
float weight;
float filtered_image[(image_width - 2) * (image_height - 2)];
for (i = 1; i < img_height - 1; i++)
{
for (j = 1; j < img_width - 1; j++)
{
sum = 0;
for (p = 0; p < 3; p++)
{
for (q = 0; q < 3; q++)
{
sum += image[(i + p) * img_width + j + q] *
gaussian_kernel[p * 3 + q];
}
}
sum /= weight;
filtered_image[(i - 1) * img_width + j] = sum;
}
}
|
Optimizing for DSP¶
An OpenCL C kernel for convolution. Note that the types are float.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | // Serves as bounds check
bool OutsideImage(int2 pos, int width, int height)
{
if (pos.x < 1 || pos.y < 1)
return true;
if (pos.x >= width - 1 || pos.y >= height - 1)
return true;
return false;
}
kernel void gaussian_filter (global float* image,
global float* filtered_image,
global float* gaussian_kernel,
global int* image_dims
float weight)
{
const int image_height = image_dims[0];
const int image_width = image_dims[1];
const int global_x = get_global_id(0);
const int global_y = get_global_id(1);
const int2 pixel_pos = { global_x, global_y };
if (OutsideImage(pixel_pos, image_width, image_height))
return;
float sum = 0;
int index = 0;
int2 pos;
/* 3x3 Convolution */
for(int y= -1; y<=1; y++)
for(int x=-1; x<=1; x++)
{
pos = pixel_pos + (int2)(x,y);
sum += gaussian_kernel[index++] * image[pos.y * image_width + pos.x];
}
sum /= weight;
filtered_image[global_y * img_width + global_x] = sum;
}
|
- Step 1: Initial optimization for DSP:
- Convert the float type to uchar
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | // Serves as bounds check
bool OutsideImage(int2 pos, int width, int height)
{
if (pos.x < 1 || pos.y < 1)
return true;
if (pos.x >= width - 1 || pos.y >= height - 1)
return true;
return false;
}
kernel void gaussian_filter (global uchar* image,
global uchar* filtered_image,
global char* gaussian_kernel,
global int* image_dims,
short weight)
{
const int image_height = image_dims[0];
const int image_width = image_dims[1];
const int global_x = get_global_id(0);
const int global_y = get_global_id(1);
const int2 pixel_pos = { global_x, global_y };
if (OutsideImage(pixel_pos, image_width, image_height))
return;
short sum = 0;
int index = 0;
int2 pos;
/* 3x3 Convolution */
for(int y= -1; y<=1; y++)
for(int x=-1; x<=1; x++)
{
pos = pixel_pos + (int2)(x,y);
sum += gaussian_kernel[index++] * image[pos.y * image_width + pos.x];
}
sum /= weight;
filtered_image[global_y * img_width + global_x] = (uchar) sum;
}
|
- Step 2:
- Switch to using vector types to take advantage of vector instructions available on the DSP
- Annotate the kernel with a work-group size attribute
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | inline int
dot_product (uchar4 mask, uchar4 data)
{
int sum = 0;
sum = (int) (mask.s0 * data.s0 +
mask.s1 * data.s1 + mask.s2 * data.s2 + mask.s3 * data.s3);
return sum;
}
kernel __attribute__ ((reqd_work_group_size (1, 1, 1)))
void gaussian_filter (global const uchar4* restrict imgin_ptr,
global uchar4* restrict imgout_ptr,
short width,
short pitch,
global const uchar* kernel_coefficient,
short shift)
{
int i;
int sum0, sum1, sum2;
int sum3;
uchar4 mask1_0, mask2_0, mask3_0;
uchar4 mask1_1, mask2_1, mask3_1;
uchar4 r1_3210;
uchar4 r2_3210;
uchar4 r3_3210;
uchar4 r1_5432;
uchar4 r2_5432;
uchar4 r3_5432;
uchar8 r1_76543210, r2_76543210, r3_76543210;
mask1_0 =
(uchar4) (kernel_coefficient[0], kernel_coefficient[1],
kernel_coefficient[2], 0);
mask2_0 =
(uchar4) (kernel_coefficient[3], kernel_coefficient[4],
kernel_coefficient[5], 0);
mask3_0 =
(uchar4) (kernel_coefficient[6], kernel_coefficient[7],
kernel_coefficient[8], 0);
mask1_1 =
(uchar4) (0, kernel_coefficient[0], kernel_coefficient[1],
kernel_coefficient[2]);
mask2_1 =
(uchar4) (0, kernel_coefficient[3], kernel_coefficient[4],
kernel_coefficient[5]);
mask3_1 =
(uchar4) (0, kernel_coefficient[6], kernel_coefficient[7],
kernel_coefficient[8]);
for (i = 0; i < width; i += 1)
{
r1_76543210 = vload8 (i, imgin_ptr);
r1_76543210 = vload8 (pitch + i, imgin_ptr);
r1_76543210 = vload8 (2 * pitch + i, imgin_ptr);
r1_3210 = (uchar4) (r1_76543210.s0123);
r2_3210 = (uchar4) (r2_76543210.s0123);
r3_3210 = (uchar4) (r3_76543210.s0123);
sum0 = (dot_product (mask1_0, r1_3210) +
dot_product (mask2_0, r2_3210) +
dot_product (mask3_0, r3_3210)) >> shift;
sum1 = (dot_product (mask1_1, r1_3210) +
dot_product (mask2_1, r2_3210) +
dot_product (mask3_1, r3_3210)) >> shift;
r1_5432 = (uchar4) (r1_76543210.s2345);
r2_5432 = (uchar4) (r2_76543210.s2345);
r3_5432 = (uchar4) (r3_76543210.s2345);
sum2 = (dot_product (mask1_0, r1_5432) +
dot_product (mask2_0, r2_5432) +
dot_product (mask3_0, r3_5432)) >> shift;
sum3 = (dot_product (mask1_1, r1_5432) +
dot_product (mask2_1, r2_5432) +
dot_product (mask3_1, r3_5432)) >> shift;
imgout_ptr[i] = (uchar4) (sum0, sum1, sum2, sum3);
}
}
|
Step 3: Use double buffering to overlap data movement with computation
Pseudo-code for a double-buffered version of the OpenCL C kernel:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
#define ARRAY_SIZE n
#define NUM_BATCHES n_b
#define BATCH_SIZE (ARRAY_SIZE / NUM_BATCHES) // This is the size of a single buffer
#define LOCAL_SIZE (BATCH_SIZE * 2) // This is the size of the double buffer
kernel __attribute__ ((reqd_work_group_size (1, 1, 1)))
void gaussian_filter (global const uchar4 * restrict imgin_ptr,
global uchar4 * restrict imgout_ptr,
short width,
short pitch,
global const uchar * kernel_coefficient,
short shift)
{
//Initialize the required variables
//Copy content in the double buffer
//Compute for the buffer in batch 1
for (batch = 0; batch < NUM_BATCHES - 2; batch++)
{
if (batch % 2 == 0)
{
Copy content in buffer batch 1 || Compute for buffer in batch 2
}
else
{
Copy content in buffer batch 2 || Compute for buffer in batch 1
}
}
if (batch % 2 == 0)
{
Copy content in buffer batch 1 || Compute for buffer in batch 2
}
else
{
Copy content in buffer batch 2 || Compute for buffer in batch 1
}
}
|
Now, we have an optimized OpenCL C kernel for the DSP. Note that the kernel is a generic OpenCL C kernel and can be compiled/run on any OpenCL device.
Performance Improvement¶
Description | Performance in cycles per pixel |
---|---|
Generic OpenCL C kernel | 12.0 |
OpenCL C kernel optimized for DSP | 5.0 |