Please note: We are aware of an issue affecting replies on the Arm Community forums, which may not be loading as expected.

We apologize for any inconvenience and appreciate your patience while we investigate and work to resolve the issue.

Thank you for your understanding.


This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Why is "compute queue active" not 100% "gpu active"?

Hi guys:
I have a compute shader for landscape patch culling that runs before the base pass in UE4. In Streamline I found that the compute shader runs for 40 kilo-cycles, but "GPU active" is 181 kilo-cycles. Where are the remaining cycles (181 - 40 - 28 = 113 kilo-cycles) being spent?

Below is my compute shader. The offline compiler says the shader is bound by LD/ST, but "Mali Load/Store Usage" is 24.3 kilo-cycles.

// Compiled by HLSLCC 0.73
// @Inputs: u3;-1:gl_GlobalInvocationID,u3;-1:gl_LocalInvocationID
// @PackedGlobals: HZBSize(h:0,4),MaxPatchCount(u:0,1)
// @Samplers: FinalNodesSRV(0:1)
// @UAVs: FinalPatchByteAddressBufferUAV(0:1),IndirectDrawBufferUAV(1:1)
// @SRVs: TreeNodeBufferSRV(2:1),TreeMaterialBuffer(3:1),TreeBuffer(4:1)
#version 310 es

// Descriptor binding slots (all in set 0). The numeric suffix of each macro
// is the generator's resource id, NOT the slot; the slot is the macro value.
// Resources using each macro, as declared further down in this shader:
//   BINDING_6 -> HLSLCC_CBh, BINDING_7 -> HLSLCC_CBu
//   BINDING_3 -> FinalNodesSRV
//   BINDING_0 -> FinalPatchByteAddressBufferUAV, BINDING_1 -> IndirectDrawBufferUAV
//   BINDING_2 -> TreeNodeBufferSRV, BINDING_4 -> TreeMaterialBuffer, BINDING_5 -> TreeBuffer
// Packed UB
#define BINDING_6	0
#define BINDING_7	1
// Uniform Texel Buffer
#define BINDING_3	2
// Storage TexelBuffer
#define BINDING_0	3
#define BINDING_1	4
// Storage Buffer
#define BINDING_2	5
#define BINDING_4	6
#define BINDING_5	7

// GL_EXT_texture_buffer supplies the usamplerBuffer / uimageBuffer types and
// GL_OES_shader_image_atomic supplies imageAtomicAdd, both used below.
// NOTE(review): nothing in the visible shader appears to need gpu_shader5,
// texture_cube_map_array or shader_io_blocks - presumably emitted
// unconditionally by the generator.
#extension GL_EXT_gpu_shader5 : enable
#extension GL_EXT_texture_buffer : enable
#extension GL_EXT_texture_cube_map_array : enable
#extension GL_EXT_shader_io_blocks : enable
#extension GL_OES_shader_image_atomic : enable
// 32 x 2 x 2 = 128 invocations per workgroup. The shader body uses
// gl_GlobalInvocationID.x to pick an entry of FinalNodesSRV and
// gl_LocalInvocationID.y / .z (each 0 or 1) as a 2x2 selector, so four
// invocations in a workgroup share each x value.
layout( local_size_x = 32, local_size_y = 2, local_size_z = 2 ) in;
// Element type of the TreeNodeBufferSRV storage buffer: four 32-bit words.
// The notes below describe only how main_00000000_00000000 consumes each word.
struct FQuadTreeNodePack
{
	// Bit-interleaved 2D coordinate: the shader extracts the even bits into a
	// 16-bit x and the odd bits into a 16-bit y.
	highp uint XY;
	// Not read by the shader body in this file.
	highp uint MinMaxHeight;
	// Only the top byte (LodFlag >> 24) is read; it is used as a shift count
	// and its low 4 bits are written to the output record.
	highp uint LodFlag;
	// Index into TreeBuffer; TreeIndex*3 and TreeIndex*3+1 index TreeMaterialBuffer.
	highp uint TreeIndex;
};

// Element type of the TreeBuffer storage buffer, indexed by
// FQuadTreeNodePack.TreeIndex. Only XY and LayerIndexes are read by
// main_00000000_00000000; the other fields are unused in this shader.
struct FQuadTreeGPU
{
	highp uint Offset;
	// Two 16-bit values: bits 31..16 and bits 15..0. The shader keeps only the
	// low 8 bits of each when packing its output.
	highp uint XY;
	highp float Padding0;
	highp float Padding1;
	highp vec2 MinPos;
	highp vec2 MaxPos;
	// Four values stored as floats; the shader converts each to uint and keeps
	// 5 bits per component.
	highp vec4 LayerIndexes;
};

// Packed float uniforms. Per the "@PackedGlobals: HZBSize(h:0,4)" header line,
// cu_h[0] carries HZBSize. The shader body reads only cu_h[0].w, converted to
// uint, as the exclusive upper bound for gl_GlobalInvocationID.x.
// NOTE(review): presumably the CPU side stores the node count in HZBSize.w -
// confirm against the original HLSL.
layout(set=0, binding=BINDING_6, std140) uniform HLSLCC_CBh
{
	highp vec4 cu_h[1];
};

// Packed uint uniforms. Per the "@PackedGlobals: MaxPatchCount(u:0,1)" header
// line, cu_u[0].x carries MaxPatchCount. The shader body multiplies it by the
// per-material index to find the start of that material's output region.
layout(set=0, binding=BINDING_7, std140) uniform HLSLCC_CBu
{
	highp uvec4 cu_u[1];
};

// Default precisions. From here on, any float/vec declaration without an
// explicit qualifier is mediump - this includes the vec4 member of
// TreeMaterialBuffer_BUFFER below and the unqualified locals in the shader body.
precision mediump float;
precision highp int;
precision mediump sampler;
precision mediump sampler2D;
precision mediump samplerCube;
// Output: write-only buffer of 32-bit words. The shader body writes words
// +2, +3 and +4 of a 6-word record per emitted patch.
uniform writeonly layout(set=0,r32ui,binding=BINDING_0) highp uimageBuffer FinalPatchByteAddressBufferUAV;
// Counters: the shader atomically increments word (index*5 + 1).
// NOTE(review): a 5-word stride with word 1 incremented looks like the
// instanceCount field of an indexed indirect draw command - confirm.
uniform layout(set=0,r32ui,binding=BINDING_1) highp uimageBuffer IndirectDrawBufferUAV;
// Packed quadtree nodes, indexed by the values fetched from FinalNodesSRV.
layout(set=0,binding=BINDING_2) buffer  TreeNodeBufferSRV_BUFFER { FQuadTreeNodePack TreeNodeBufferSRV[]; };
// Input list: element gl_GlobalInvocationID.x holds an index into TreeNodeBufferSRV.
layout(set=0, binding=BINDING_3) uniform highp usamplerBuffer FinalNodesSRV;
// Read at TreeIndex*3 and TreeIndex*3+1 by the shader body.
// NOTE(review): the member has no precision qualifier, so it is mediump; the
// shader converts some of these floats to integer indices - verify that
// reduced precision cannot corrupt large values on the target GPU.
layout(set=0,binding=BINDING_4) buffer  TreeMaterialBuffer_BUFFER { vec4 TreeMaterialBuffer[]; };
// Per-tree data, indexed by FQuadTreeNodePack.TreeIndex.
layout(set=0,binding=BINDING_5) buffer  TreeBuffer_BUFFER { FQuadTreeGPU TreeBuffer[]; };
// Per-invocation body of the compute shader.
//
// For each invocation whose gl_GlobalInvocationID.x is below uint(cu_h[0].w):
//   1. fetch a node index from FinalNodesSRV and load that node,
//   2. decode the node's bit-interleaved XY and its LOD byte,
//   3. load per-tree data from TreeBuffer / TreeMaterialBuffer,
//   4. reserve an output slot by atomically incrementing a counter in
//      IndirectDrawBufferUAV,
//   5. write three packed 32-bit words to FinalPatchByteAddressBufferUAV.
// Invocations outside the bound do nothing. There are no barriers or shared
// memory; the only cross-invocation interaction is the atomic add.
void main_00000000_00000000()
{
	// u0: cu_u[0].x (MaxPatchCount per the @PackedGlobals header).
	highp uint u0 = 0u;
	u0 = cu_u[0].x;
	// u1: number of valid x indices, taken from a float uniform.
	highp uint u1 = 0u;
	u1 = uint(cu_h[0].w);
	if ((gl_GlobalInvocationID.x<u1))
	{
		// Indirection: FinalNodesSRV[x] -> index into TreeNodeBufferSRV.
		 FQuadTreeNodePack t2;
		t2 = TreeNodeBufferSRV[int(uint(texelFetch(FinalNodesSRV,int(gl_GlobalInvocationID.x)).x))];
		highp uint u3 = 0u;
		highp int i4 = 0;
		highp uint u5 = 0u;
		u5 = t2.XY;
		// Compact the even bits of XY into a 16-bit value (u6..u9, finished in
		// v16.x below). Masks: 0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0xFFFF.
		highp uint u6 = 0u;
		u6 = (u5&1431655765u);
		highp uint u7 = 0u;
		u7 = ((u6^(u6>>1u))&858993459u);
		highp uint u8 = 0u;
		u8 = ((u7^(u7>>2u))&252645135u);
		highp uint u9 = 0u;
		u9 = ((u8^(u8>>4u))&16711935u);
		// Same compaction on XY>>1, i.e. the odd bits -> u15.
		highp uint u10 = 0u;
		u10 = (u5>>1u);
		highp uint u11 = 0u;
		u11 = (u10&1431655765u);
		highp uint u12 = 0u;
		u12 = ((u11^(u11>>1u))&858993459u);
		highp uint u13 = 0u;
		u13 = ((u12^(u12>>2u))&252645135u);
		highp uint u14 = 0u;
		u14 = ((u13^(u13>>4u))&16711935u);
		highp uint u15 = 0u;
		u15 = ((u14^(u14>>8u))&65535u);
		// u10 is not read again after this assignment.
		u10 = u15;
		// v16 = decoded (x, y) node coordinate, 16 bits each.
		highp uvec2 v16 = uvec2(0u,0u);
		v16.x = ((u9^(u9>>8u))&65535u);
		v16.y = u15;
		i4 = int(t2.TreeIndex);
		// u3 = top byte of LodFlag.
		u3 = (t2.LodFlag>>24u);
		highp uint u17 = 0u;
		u17 = uint(i4);
		// v19 = (high 16 bits, low 16 bits) of the tree's XY word (mask 0xFFFF0000).
		highp uint u18 = 0u;
		u18 = TreeBuffer[int(u17)].XY;
		highp uvec2 v19 = uvec2(0u,0u);
		v19.x = ((u18&4294901760u)>>16u);
		v19.y = (u18&65535u);
		// u20 selects both the counter (u20*5+1) and the output region (u20*u0).
		highp uint u20 = 0u;
		u20 = uint(TreeMaterialBuffer[int(((u17*3u)+1u))].z);
		highp uint u21 = 0u;
		u21 = uint(i4);
		highp vec4 v22 = vec4(0.000000,0.000000,0.000000,0.000000);
		v22.xyzw = TreeMaterialBuffer[int((u21*3u))];
		// v23 and h24..h27 have no precision qualifier, so they are mediump under
		// the file's default float precision. v23 = (v22.x, v22.z, v22.w, next.x).
		// Only v23.x and v23.y are read later; v23.z and v23.w are never used.
		 vec4 v23 = vec4(0.000000,0.000000,0.000000,0.000000);
		 float h24 = 0.000000;
		h24 = v22.x;
		v23.x = h24;
		 float h25 = 0.000000;
		h25 = v22.z;
		v23.y = h25;
		 float h26 = 0.000000;
		h26 = v22.w;
		v23.z = h26;
		 float h27 = 0.000000;
		h27 = TreeMaterialBuffer[int(((u21*3u)+1u))].x;
		v23.w = h27;
		// v28 = (node coord * 2 + localID.yz * 2^u3) * 4, per component.
		// NOTE(review): u3 can be up to 255; a shift count of 32 or more is
		// undefined in GLSL - presumably the LOD byte is always small, confirm.
		highp uvec2 v28 = uvec2(0u,0u);
		v28.xy = (((v16*uvec2(2u,2u))+(gl_LocalInvocationID.yz*uvec2(uint((1<<int(u3))))))*uvec2(4u,4u));
		highp uvec4 v29 = uvec4(0u,0u,0u,0u);
		v29.xyzw = uvec4(TreeBuffer[i4].LayerIndexes);
		highp uint u30 = 0u;
		u30 = 1u;
		// Reserve a slot: imageAtomicAdd returns the counter value before the
		// increment, so u31 is unique among invocations hitting the same counter.
		highp uint u31 = 0u;
		u31 = imageAtomicAdd(IndirectDrawBufferUAV, int(int(((u20*5u)+1u))), u30);
		// i32 = first word of this patch's 6-word record: (u20*u0 + u31) * 6.
		// NOTE(review): u31 is not compared against u0, so if more than u0
		// patches land on one counter the writes run into the next region -
		// verify the CPU side guarantees this cannot happen.
		highp int i32 = 0;
		i32 = ((int((u20*u0))+int(u31))*6);
		// Word +2: bits 0..3 = u3 & 15, bits 12..19 = low 8 bits of v19.x,
		// bits 20..27 = low 8 bits of v19.y, bits 28..29 = localID.y,
		// bits 30..31 = localID.z.
		imageStore( FinalPatchByteAddressBufferUAV, int((i32+2)), uvec4((((((((u3&15u)|0u)|0u)|((v19.x<<12u)&1044480u))|((v19.y<<20u)&267386880u))|((gl_LocalInvocationID.y<<28u)&805306368u))|((gl_LocalInvocationID.z<<30u)&3221225472u))));
		// Word +3: v28.y in the high half, v28.x OR-ed into the low half.
		// NOTE(review): v28.x is not masked to 16 bits; a value above 0xFFFF
		// would spill into the y half - confirm the coordinate range.
		imageStore( FinalPatchByteAddressBufferUAV, int((i32+3)), uvec4(((v28.y<<16u)|v28.x)));
		highp float f33 = 0.000000;
		f33 = v23.x;
		highp float f34 = 0.000000;
		f34 = v23.y;
		// Word +4: four 5-bit layer indexes in bits 0..19 (x, y, z, w),
		// uint(v23.x) in bits 20..30 (mask 0x7FF00000), uint(v23.y) in bit 31.
		imageStore( FinalPatchByteAddressBufferUAV, int((i32+4)), uvec4(((((((v29.x&31u)|((v29.y<<5u)&992u))|((v29.z<<10u)&31744u))|((v29.w<<15u)&1015808u))|((uint(f33)<<20u)&2146435072u))|((uint(f34)<<31u)&2147483648u))));
		// Words +0, +1 and +5 of the record are not written by this function.
	}
}

Any help would be great,

Cheers

Parents
  • Where is the remain cycle(181 - 40  - 28 = 113kilo-cycles) running?

    Any difference will be time spent in the GPU front-end handling scheduling. Scheduling overhead tends to be caused by:

    * High density of small workloads (e.g. many small compute kernels).

    * High densities of pipeline barriers (e.g. many separate pipeline barriers between workloads).

    * High numbers of workloads that trigger L2 cache flushes.

    Below is my compute shader, I use offline compiler it say shader is bound by LD/ST, But "Mali Load/Store Usage" is 24.3kilo-cycles.

    The offline compiler can only analyze the shader code, and your last graph does indeed show that load/store has the highest usage. The offline compiler obviously knows nothing about the surrounding use of the API, which can cause other inefficiencies.

Reply
  • Where is the remain cycle(181 - 40  - 28 = 113kilo-cycles) running?

    Any difference will be time spent in the GPU front-end handling scheduling. Scheduling overhead tends to be caused by:

    * High density of small workloads (e.g. many small compute kernels).

    * High densities of pipeline barriers (e.g. many separate pipeline barriers between workloads).

    * High numbers of workloads that trigger L2 cache flushes.

    Below is my compute shader, I use offline compiler it say shader is bound by LD/ST, But "Mali Load/Store Usage" is 24.3kilo-cycles.

    The offline compiler can only analyze the shader code, and your last graph does indeed show that load/store has the highest usage. The offline compiler obviously knows nothing about the surrounding use of the API, which can cause other inefficiencies.

Children
No data