This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Kernel CL_OUT_OF_RESOURCES problem

Hi,

We are developing a program based on RK3288 (Mali T764) with Opencl. We met a very strange problem.

Our kernel has 4 workgroups with {2,64} workitems (128 x 4 in total). The input includes two 512x768 global float arrays, one 384 global float array, and the output is one 512x768 array. We initialize one 768 local float4 array, one 384 float array. At the beginning, two line of the 512x768 global float array is loaded into the 768 local float4 array, and the whole 384 global float is loaded into the local 384 array. We do some calculations, and then save the output. Until these process, everything fine.

We need to synchronize the local memory for further coding, so we add a barrier(CLK_LOCAL_MEM_FENCE), however, the trouble comes. clEnqueueNDRangeKernel() returns an error number -5, which seems meaning CL_OUT_OF_RESOURCES. We did further tests to check the error.

1.  Add the barrier but delete the code of calculation. The kernel works fine.

2. Add the barrier, keep the calculation but delete the code of loading two line from the global 512x768 arrays (the data in the local memory is 0). The kernel also works fine.

We are totally confused what is wrong with our code, or it is the problem of the driver or Opencl itself. Could anyone help us figure it out? Many thanks!

The code is here:

original code with barrier (if deleting the barrier in the middle, it works fine)

const char* f1_ap2xy[] = {

" _kernel void f1_ap2xy (_global float * in_amp,__global *in_phase, __global float * in_lut, __global float * out)\n"
" {\n"

"local float4 temp[768];\n"
"local float lut[384];\n"
"uint4 gid;\n"
"float4 r0_x,r1_x,r0_y,r1_y,t;\n"
"gid.x=(get_global_id(0)>>1)*98304+((get_global_id(0)&0x0001)*768);\n""gid.y=(get_global_id(1)<<2)+gid.x;\n"
"gid.z=get_local_id(1);\n"
"gid.w=get_local_id(0)*192+gid.z;\n"
"__global float4 trans=(global float4)(in_amp+gid.y);\n"

//read data from global
"r0_x=trans[0];\n"
"r1_x=trans[64];\n"
"t=trans[128];\n"
"trans=(global float4*)(in_phase+gid.y);\n"
"r0_y=trans[0];\n"
"r1_y=trans[64];\n"
"temp[gid.w+64]=trans[128];\n"
"trans=(global float4*)(out+gid.y);\n"

"lut[(get_local_id(0)<<6)+gid.z]=in_lut[(get_local_id(0)<<6)+gid.z];\n"
"lut[(get_local_id(0)<<6)+gid.z+128]=in_lut[(get_local_id(0)<<6)+gid.z+128];\n"
"lut[(get_local_id(0)<<6)+gid.z+256]=in_lut[(get_local_id(0)<<6)+gid.z+256];\n"

"barrier(CLK_LOCAL_MEM_FENCE);\n"

//calculate
"temp[gid.w]=r0_x+r1_x+t;\n"
"temp[gid.w+384]=r0_y+r1_y+temp[gid.w+64];\n" "temp[gid.w+128]=r0_x-(0.5*r1_x-(0.8660254*r1_y))-((temp[gid.w+64]*0.8660254)+(t*0.5));\n"
"temp[gid.w+512]=r0_y-(0.5*r1_y+(0.8660254*r1_x))+((0.8660254*t)-(0.5*temp[gid.w+64]));\n" "temp[gid.w+448]=r0_y-(0.5*r1_y-(0.8660254*r1_x))-((0.8660254*t)+(0.5*temp[gid.w+64]));\n"
"temp[gid.w+64]=r0_x-(0.5*r1_x+(0.8660254*r1_y))+((0.8660254*temp[gid.w+64])-(0.5*t));\n"

//output
"trans[0]=temp[gid.w+384];\n"
"trans[64]=temp[gid.w+512];\n"
"trans[128]=temp[gid.w+448];\n"
"}\n"
};


1. Add the barrier but delete the code of calculation

const char* f1_ap2xy[] = {

" _kernel void f1_ap2xy (_global float * in_amp,__global *in_phase, __global float * in_lut, __global float * out)\n"
" {\n"

"local float4 temp[768];\n"
"local float lut[384];\n"
"uint4 gid;\n"
"float4 r0_x,r1_x,r0_y,r1_y,t;\n"
"gid.x=(get_global_id(0)>>1)*98304+((get_global_id(0)&0x0001)*768);\n""gid.y=(get_global_id(1)<<2)+gid.x;\n"
"gid.z=get_local_id(1);\n"
"gid.w=get_local_id(0)*192+gid.z;\n"
"__global float4 trans=(global float4)(in_amp+gid.y);\n"

//read data from global
"r0_x=trans[0];\n"
"r1_x=trans[64];\n"
"t=trans[128];\n"
"trans=(global float4*)(in_phase+gid.y);\n"
"r0_y=trans[0];\n"
"r1_y=trans[64];\n"
"temp[gid.w+64]=trans[128];\n"
"trans=(global float4*)(out+gid.y);\n"

"lut[(get_local_id(0)<<6)+gid.z]=in_lut[(get_local_id(0)<<6)+gid.z];\n"
"lut[(get_local_id(0)<<6)+gid.z+128]=in_lut[(get_local_id(0)<<6)+gid.z+128];\n"
"lut[(get_local_id(0)<<6)+gid.z+256]=in_lut[(get_local_id(0)<<6)+gid.z+256];\n"

"barrier(CLK_LOCAL_MEM_FENCE);\n"

//output
"trans[0]=temp[gid.w+384];\n"
"trans[64]=temp[gid.w+512];\n"
"trans[128]=temp[gid.w+448];\n"
"}\n"
};


2. Add the barrier, keep the calculation but delete the code of loading two line from the global 512x768 arrays.

const char* f1_ap2xy[] = {

" _kernel void f1_ap2xy (_global float * in_amp,__global *in_phase, __global float * in_lut, __global float * out)\n"
" {\n"

"local float4 temp[768];\n"
"local float lut[384];\n"
"uint4 gid;\n"
"float4 r0_x,r1_x,r0_y,r1_y,t;\n"
"gid.x=(get_global_id(0)>>1)*98304+((get_global_id(0)&0x0001)*768);\n""gid.y=(get_global_id(1)<<2)+gid.x;\n"
"gid.z=get_local_id(1);\n"
"gid.w=get_local_id(0)*192+gid.z;\n"
"__global float4 trans=(global float4)(in_amp+gid.y);\n"


"trans=(global float4*)(out+gid.y);\n"

"lut[(get_local_id(0)<<6)+gid.z]=in_lut[(get_local_id(0)<<6)+gid.z];\n"
"lut[(get_local_id(0)<<6)+gid.z+128]=in_lut[(get_local_id(0)<<6)+gid.z+128];\n"
"lut[(get_local_id(0)<<6)+gid.z+256]=in_lut[(get_local_id(0)<<6)+gid.z+256];\n"

"barrier(CLK_LOCAL_MEM_FENCE);\n"

//calculate
"temp[gid.w]=r0_x+r1_x+t;\n"
"temp[gid.w+384]=r0_y+r1_y+temp[gid.w+64];\n" "temp[gid.w+128]=r0_x-(0.5*r1_x-(0.8660254*r1_y))-((temp[gid.w+64]*0.8660254)+(t*0.5));\n"
"temp[gid.w+512]=r0_y-(0.5*r1_y+(0.8660254*r1_x))+((0.8660254*t)-(0.5*temp[gid.w+64]));\n" "temp[gid.w+448]=r0_y-(0.5*r1_y-(0.8660254*r1_x))-((0.8660254*t)+(0.5*temp[gid.w+64]));\n"
"temp[gid.w+64]=r0_x-(0.5*r1_x+(0.8660254*r1_y))+((0.8660254*temp[gid.w+64])-(0.5*t));\n"

//output
"trans[0]=temp[gid.w+384];\n"
"trans[64]=temp[gid.w+512];\n"
"trans[128]=temp[gid.w+448];\n"
"}\n"
};


Parents
  • Not really, but you can try to start more threads even if they don’t do anything. Howeverremember there is no guarantee about the order in which they will get spawned.

    Also if you want the best performance, make sure you disable DVFS:

    echo "off" > /sys/class/misc/mali0/device/dvfs

    And set the GPU at the maximum frequency:

    echo "533" > /sys/class/misc/mali0/device/clock

Reply
  • Not really, but you can try to start more threads even if they don’t do anything. Howeverremember there is no guarantee about the order in which they will get spawned.

    Also if you want the best performance, make sure you disable DVFS:

    echo "off" > /sys/class/misc/mali0/device/dvfs

    And set the GPU at the maximum frequency:

    echo "533" > /sys/class/misc/mali0/device/clock

Children
No data