This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

prefix sum with Opencl

Hi, I am accelerating a image processing algorithm with OpenCL on the cellphone, however I met a case which had a very poor performance.

The case is to calculate the prefix sum of each row on the image (a 2D buffer). For example, a 3x3 image:

50   32   10

48   45   100

30   80    10

the result should be:

50  82  92

48   93 193

30  110  120

That is for each row store it's prefix sum. My opencl kernel is somewhat as follows:

__kernel void scan_h( __global int *dst,
    __global int *src,
        int width, int height, int stride)
{
    int x = get_global_id(0);

    int data;
    data = 0;

    int i;
    for(i=0; i<width; i++)
    {
        data += src[i + x*stride];
        dst[i+ x*stride] = data;
    }
}

 

it needed about 100ms to finish a 3000*3000 image.

Another experiment show interesting result. If I calculate the each column prefix sum with the kernel as follows, the run-time is only 4ms.

__kernel void scan_v(
    __global int *dst,
    __global int *src,
        int width, int height, int stride)
{
    int x = get_global_id(0);


    int data = 0;


    int i;
    for(i=0; i<height; i++)
    {
        data += src[x+i*stride];
        dst[x+i*stride] = data;
    }
}
 

I do know why is the difference. So a work around for the row prefix sum is to tranpose the image first and then perform column prefix sum followed with another transpose.

But I strongly wonder why the native row prefix sum is so slow?

Parents
  • Thanks.

    I tried the vector operations, it really increased the performance a lot.  sigh~~

    The tunning results of the following three kernels were 47ms, 19ms, 19ms, respectively. (half or quarter time consumption of the origin kernel).

    __kernel void scan_h_8(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int2 value;

        int i;

        for(i=0; i<width; i+=2)

        {

            value = vload2(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            data = value.s1;

            vstore2(value, 0, dst+x*stride+i);

        }

    }

    __kernel void scan_h_16(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int4 value;

        int i;

        for(i=0; i<width; i+=4)

        {

            value = vload4(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            value.s2 += value.s1;

            value.s3 += value.s2;

            data = value.s3;

            vstore4(value, 0, dst+x*stride+i);

        }

    }

    __kernel void scan_h_32(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int8 value;

        int i;

        for(i=0; i<width; i+=8)

        {

            value = vload8(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            value.s2 += value.s1;

            value.s3 += value.s2;

            value.s4 += value.s3;

            value.s5 += value.s4;

            value.s6 += value.s5;

            value.s7 += value.s6;

            data = value.s7;

            vstore8(value, 0, dst+x*stride+i);

        }

    }

Reply
  • Thanks.

    I tried the vector operations, it really increased the performance a lot.  sigh~~

    The tunning results of the following three kernels were 47ms, 19ms, 19ms, respectively. (half or quarter time consumption of the origin kernel).

    __kernel void scan_h_8(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int2 value;

        int i;

        for(i=0; i<width; i+=2)

        {

            value = vload2(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            data = value.s1;

            vstore2(value, 0, dst+x*stride+i);

        }

    }

    __kernel void scan_h_16(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int4 value;

        int i;

        for(i=0; i<width; i+=4)

        {

            value = vload4(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            value.s2 += value.s1;

            value.s3 += value.s2;

            data = value.s3;

            vstore4(value, 0, dst+x*stride+i);

        }

    }

    __kernel void scan_h_32(

        __global int *dst,

        __global int *src,

            int width, int height, int stride)

    {

        int x = get_global_id(0);

        int data;

        data = 0;

        int8 value;

        int i;

        for(i=0; i<width; i+=8)

        {

            value = vload8(0, src+x*stride+i);

            value.s0 += data;

            value.s1 += value.s0;

            value.s2 += value.s1;

            value.s3 += value.s2;

            value.s4 += value.s3;

            value.s5 += value.s4;

            value.s6 += value.s5;

            value.s7 += value.s6;

            data = value.s7;

            vstore8(value, 0, dst+x*stride+i);

        }

    }

Children
No data