Example: Optimizing 3x3 Gaussian smoothing filter

This section describes a step-by-step approach to optimizing the 3x3 Gaussian smoothing filter kernel for the C66x DSP.

Overview of Gaussian Filter

The Gaussian Filter is used as a smoothing filter. The filter is applied by convolving a nxn image window with a nxn Gaussian kernel and obtaining a weighted sum. More on the filter is available here: https://homepages.inf.ed.ac.uk/rbf/HIPR2/gsmooth.htm

The kernel size that we are using here is a 3x3 kernel. Let A be a 3x3 image window and B be the 3x3 Gaussian kernel. The filter is applied by convolving A and B and A is obtained in a sliding window fashion.

Natural C Code

The first listing is a snippet of C code for convolution:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
float image[image_size];
float gaussian_kernel[9];
float weight;
float filtered_image[(image_width - 2) * (image_height - 2)];

for (i = 1; i < img_height - 1; i++)
{
    for (j = 1; j < img_width - 1; j++)
    {
	sum = 0;
	for (p = 0; p < 3; p++)
        {
	    for (q = 0; q < 3; q++)
            {
		sum += image[(i + p) * img_width + j + q] * 
                       gaussian_kernel[p * 3 + q];
            }
	}
	sum /= weight;
	filtered_image[(i - 1) * img_width + j] = sum;
    }
}

Optimizing for DSP

An OpenCL C kernel for convolution. Note that the types are float.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// Serves as bounds check
bool OutsideImage(int2 pos, int width, int height)
{
 if (pos.x < 1 || pos.y < 1)
    return true;

 if (pos.x >= width - 1 || pos.y >= height - 1)
    return true;

 return false;
}

kernel void gaussian_filter (global float* image,
                             global float* filtered_image,
                             global float* gaussian_kernel,
                             global int*   image_dims
                                    float  weight)
{
   const int image_height = image_dims[0];
   const int image_width = image_dims[1];
    
   const int global_x = get_global_id(0);
   const int global_y = get_global_id(1);
   const int2 pixel_pos = { global_x, global_y };

   if (OutsideImage(pixel_pos, image_width, image_height))
      return;

   float sum = 0;
   int index = 0;
   int2 pos;

   /* 3x3 Convolution */

   for(int y= -1; y<=1; y++)
      for(int x=-1; x<=1; x++)
      {
         pos = pixel_pos + (int2)(x,y);
         sum += gaussian_kernel[index++] * image[pos.y * image_width + pos.x];
      }

   sum /= weight;

   filtered_image[global_y * img_width + global_x] = sum; 
}
Step 1: Initial optimization for DSP:
  • Convert the float type to uchar
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
// Serves as bounds check
bool OutsideImage(int2 pos, int width, int height)
{
    if (pos.x < 1 || pos.y < 1)
        return true;

    if (pos.x >= width - 1 || pos.y >= height - 1)
        return true;

    return false;
}

kernel void gaussian_filter (global uchar* image,
                             global uchar* filtered_image,
                             global char*  gaussian_kernel,
                             global int*   image_dims,
                                    short   weight)
{
   const int image_height = image_dims[0];
   const int image_width = image_dims[1];
    
   const int global_x = get_global_id(0);
   const int global_y = get_global_id(1);
   const int2 pixel_pos = { global_x, global_y };

   if (OutsideImage(pixel_pos, image_width, image_height))
      return;

   short sum = 0;
   int index = 0;
   int2 pos;

   /* 3x3 Convolution */
   for(int y= -1; y<=1; y++)
      for(int x=-1; x<=1; x++)
      {
         pos = pixel_pos + (int2)(x,y);
         sum += gaussian_kernel[index++] * image[pos.y * image_width + pos.x];
      }

   sum /= weight;

   filtered_image[global_y * img_width + global_x] = (uchar) sum; 
}
Step 2:
  • Switch to using vector types to take advantage of vector instructions available on the DSP
  • Annotate the kernel with a work-group size attribute
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
inline int
dot_product (uchar4 mask, uchar4 data)
{
    int sum = 0;
    sum = (int) (mask.s0 * data.s0 +
		 mask.s1 * data.s1 + mask.s2 * data.s2 + mask.s3 * data.s3);
    return sum;
}

kernel __attribute__ ((reqd_work_group_size (1, 1, 1)))
void gaussian_filter (global const uchar4* restrict imgin_ptr,
		      global       uchar4* restrict imgout_ptr,
		      short                         width,
		      short                         pitch,
		      global const uchar*           kernel_coefficient,
		      short                         shift)
{

    int i;
    int sum0, sum1, sum2;
    int sum3;

    uchar4 mask1_0, mask2_0, mask3_0;
    uchar4 mask1_1, mask2_1, mask3_1;

    uchar4 r1_3210;
    uchar4 r2_3210;
    uchar4 r3_3210;
    uchar4 r1_5432;
    uchar4 r2_5432;
    uchar4 r3_5432;

    uchar8 r1_76543210, r2_76543210, r3_76543210;

    mask1_0 =
	(uchar4) (kernel_coefficient[0], kernel_coefficient[1],
		  kernel_coefficient[2], 0);
    mask2_0 =
	(uchar4) (kernel_coefficient[3], kernel_coefficient[4],
		  kernel_coefficient[5], 0);
    mask3_0 =
	(uchar4) (kernel_coefficient[6], kernel_coefficient[7],
		  kernel_coefficient[8], 0);

    mask1_1 =
	(uchar4) (0, kernel_coefficient[0], kernel_coefficient[1],
		  kernel_coefficient[2]);
    mask2_1 =
	(uchar4) (0, kernel_coefficient[3], kernel_coefficient[4],
		  kernel_coefficient[5]);
    mask3_1 =
	(uchar4) (0, kernel_coefficient[6], kernel_coefficient[7],
		  kernel_coefficient[8]);

    for (i = 0; i < width; i += 1)
    {
	  r1_76543210 = vload8 (i, imgin_ptr);
	  r1_76543210 = vload8 (pitch + i, imgin_ptr);
	  r1_76543210 = vload8 (2 * pitch + i, imgin_ptr);

	  r1_3210 = (uchar4) (r1_76543210.s0123);
	  r2_3210 = (uchar4) (r2_76543210.s0123);
	  r3_3210 = (uchar4) (r3_76543210.s0123);

	  sum0 = (dot_product (mask1_0, r1_3210) +
		  dot_product (mask2_0, r2_3210) +
		  dot_product (mask3_0, r3_3210)) >> shift;

	  sum1 = (dot_product (mask1_1, r1_3210) +
		  dot_product (mask2_1, r2_3210) +
		  dot_product (mask3_1, r3_3210)) >> shift;

	  r1_5432 = (uchar4) (r1_76543210.s2345);
	  r2_5432 = (uchar4) (r2_76543210.s2345);
	  r3_5432 = (uchar4) (r3_76543210.s2345);

	  sum2 = (dot_product (mask1_0, r1_5432) +
		  dot_product (mask2_0, r2_5432) +
		  dot_product (mask3_0, r3_5432)) >> shift;

	  sum3 = (dot_product (mask1_1, r1_5432) +
		  dot_product (mask2_1, r2_5432) +
		  dot_product (mask3_1, r3_5432)) >> shift;

	  imgout_ptr[i] = (uchar4) (sum0, sum1, sum2, sum3);
    }
}

Step 3: Use double buffering to overlap data movement with computation

Pseudo-code for a double-buffered version of the OpenCL C kernel:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

#define ARRAY_SIZE n
#define NUM_BATCHES n_b
#define BATCH_SIZE (ARRAY_SIZE / NUM_BATCHES)	// This is the size of a single buffer
#define LOCAL_SIZE (BATCH_SIZE * 2)	// This is the size of the double buffer

kernel __attribute__ ((reqd_work_group_size (1, 1, 1)))
void gaussian_filter (global const uchar4 * restrict imgin_ptr,
	              global uchar4 * restrict imgout_ptr,
	              short width,
	              short pitch, 
                      global const uchar * kernel_coefficient, 
                      short shift)
{
    //Initialize the required variables

    //Copy content in the double buffer

    //Compute for the buffer in batch 1

    for (batch = 0; batch < NUM_BATCHES - 2; batch++)
    {
      if (batch % 2 == 0)
      {
	  Copy content in buffer batch 1 || Compute for buffer in batch 2
      }
      else
      {
	Copy content in buffer batch 2 || Compute for buffer in batch 1
      }
    }

    if (batch % 2 == 0)
    {
        Copy content in buffer batch 1 || Compute for buffer in batch 2
    }
    else
    {
        Copy content in buffer batch 2 || Compute for buffer in batch 1
    }
}

Now, we have an optimized OpenCL C kernel for the DSP. Note that the kernel is a generic OpenCL C kernel and can be compiled/run on any OpenCL device.

Performance Improvement

Description Performance in cycles per pixel
Generic OpenCL C kernel 12.0
OpenCL C kernel optimized for DSP 5.0