Hi, I am accelerating a image processing algorithm with OpenCL on the cellphone, however I met a case which had a very poor performance.
The case is to calculate the prefix sum of each row on the image (a 2D buffer). For example, a 3x3 image:
50 32 10
48 45 100
30 80 10
the result should be:
50 82 92
48 93 193
30 110 120
That is for each row store it's prefix sum. My opencl kernel is somewhat as follows:
__kernel void scan_h( __global int *dst, __global int *src, int width, int height, int stride) { int x = get_global_id(0); int data; data = 0; int i; for(i=0; i<width; i++) { data += src[i + x*stride]; dst[i+ x*stride] = data; } }
it needed about 100ms to finish a 3000*3000 image.
Another experiment show interesting result. If I calculate the each column prefix sum with the kernel as follows, the run-time is only 4ms.
__kernel void scan_v( __global int *dst, __global int *src, int width, int height, int stride) { int x = get_global_id(0); int data = 0; int i; for(i=0; i<height; i++) { data += src[x+i*stride]; dst[x+i*stride] = data; } }
I do know why is the difference. So a work around for the row prefix sum is to tranpose the image first and then perform column prefix sum followed with another transpose.
But I strongly wonder why the native row prefix sum is so slow?
Thanks.
I tried the vector operations, it really increased the performance a lot. sigh~~
The tunning results of the following three kernels were 47ms, 19ms, 19ms, respectively. (half or quarter time consumption of the origin kernel).
__kernel void scan_h_8(
__global int *dst,
__global int *src,
int width, int height, int stride)
{
int x = get_global_id(0);
int data;
data = 0;
int2 value;
int i;
for(i=0; i<width; i+=2)
value = vload2(0, src+x*stride+i);
value.s0 += data;
value.s1 += value.s0;
data = value.s1;
vstore2(value, 0, dst+x*stride+i);
}
__kernel void scan_h_16(
int4 value;
for(i=0; i<width; i+=4)
value = vload4(0, src+x*stride+i);
value.s2 += value.s1;
value.s3 += value.s2;
data = value.s3;
vstore4(value, 0, dst+x*stride+i);
__kernel void scan_h_32(
int8 value;
for(i=0; i<width; i+=8)
value = vload8(0, src+x*stride+i);
value.s4 += value.s3;
value.s5 += value.s4;
value.s6 += value.s5;
value.s7 += value.s6;
data = value.s7;
vstore8(value, 0, dst+x*stride+i);