This discussion has been locked.

You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Why is "compute queue active" not 100% "gpu active"?

chashao 6 months ago

Hi guys:
I have a compute shader for landscape patch culling that runs before the base pass in UE4. In Streamline I found that the compute shader runs for 40 kilo-cycles, but "GPU active" is 181 kilo-cycles. Where are the remaining cycles (181 - 40 - 28 = 113 kilo-cycles) being spent?

Below is my compute shader. The offline compiler says the shader is bound by LD/ST, but "Mali Load/Store Usage" is 24.3 kilo-cycles.

// Compiled by HLSLCC 0.73
// @Inputs: u3;-1:gl_GlobalInvocationID,u3;-1:gl_LocalInvocationID
// @PackedGlobals: HZBSize(h:0,4),MaxPatchCount(u:0,1)
// @Samplers: FinalNodesSRV(0:1)
// @UAVs: FinalPatchByteAddressBufferUAV(0:1),IndirectDrawBufferUAV(1:1)
// @SRVs: TreeNodeBufferSRV(2:1),TreeMaterialBuffer(3:1),TreeBuffer(4:1)
#version 310 es

// Binding-slot macros. Each BINDING_n is used in a `layout(set=0, binding=...)`
// qualifier further down in this shader; the value is the binding index.
// Packed UB
// BINDING_6 -> HLSLCC_CBh (packed float globals), BINDING_7 -> HLSLCC_CBu (packed uint globals)
#define BINDING_6	0
#define BINDING_7	1
// Uniform Texel Buffer
// BINDING_3 -> FinalNodesSRV
#define BINDING_3	2
// Storage TexelBuffer
// BINDING_0 -> FinalPatchByteAddressBufferUAV, BINDING_1 -> IndirectDrawBufferUAV
#define BINDING_0	3
#define BINDING_1	4
// Storage Buffer
// BINDING_2 -> TreeNodeBufferSRV, BINDING_4 -> TreeMaterialBuffer, BINDING_5 -> TreeBuffer
#define BINDING_2	5
#define BINDING_4	6
#define BINDING_5	7

// Extensions requested by the generated shader. The buffer-texture types
// (usamplerBuffer / uimageBuffer) and imageAtomicAdd used below rely on
// GL_EXT_texture_buffer and GL_OES_shader_image_atomic respectively.
// NOTE(review): the remaining extensions do not appear to be exercised by the
// visible shader body; they are likely emitted unconditionally by the generator.
#extension GL_EXT_gpu_shader5 : enable
#extension GL_EXT_texture_buffer : enable
#extension GL_EXT_texture_cube_map_array : enable
#extension GL_EXT_shader_io_blocks : enable
#extension GL_OES_shader_image_atomic : enable
// Work group of 32 x 2 x 2 = 128 invocations. The entry point uses
// gl_GlobalInvocationID.x to select a node and gl_LocalInvocationID.yz
// (each 0..1) as a 2x2 sub-patch offset within that node.
layout( local_size_x = 32, local_size_y = 2, local_size_z = 2 ) in;
// One packed quad-tree node as stored in TreeNodeBufferSRV (four 32-bit words).
struct FQuadTreeNodePack
{
	// Bit-interleaved 2D coordinate: the entry point gathers the even bits into
	// a 16-bit x and the odd bits into a 16-bit y.
	highp uint XY;
	// Not read by the entry point in this shader.
	highp uint MinMaxHeight;
	// The entry point uses only the top 8 bits (LodFlag >> 24) as a LOD shift amount.
	highp uint LodFlag;
	// Index used to address TreeBuffer, and (times 3) TreeMaterialBuffer.
	highp uint TreeIndex;
};

// Per-tree record as stored in TreeBuffer. Only XY and LayerIndexes are read
// by the entry point in this shader; the other members are untouched here.
struct FQuadTreeGPU
{
	highp uint Offset;
	// Two 16-bit values: the entry point splits this into the high half
	// (bits 16..31) and the low half (bits 0..15).
	highp uint XY;
	highp float Padding0;
	highp float Padding1;
	highp vec2 MinPos;
	highp vec2 MaxPos;
	// Converted to uvec4 by the entry point and packed at 5 bits per component.
	highp vec4 LayerIndexes;
};

// Packed float globals. Per the @PackedGlobals header comment, cu_h[0] holds
// HZBSize (4 components); the entry point reads only cu_h[0].w, converts it to
// uint and uses it as the upper bound on gl_GlobalInvocationID.x.
layout(set=0, binding=BINDING_6, std140) uniform HLSLCC_CBh
{
	highp vec4 cu_h[1];
};

// Packed uint globals. Per the @PackedGlobals header comment, cu_u[0].x holds
// MaxPatchCount; the entry point multiplies it by the per-node draw slot to
// find the start of that slot's region in the output patch buffer.
layout(set=0, binding=BINDING_7, std140) uniform HLSLCC_CBu
{
	highp uvec4 cu_u[1];
};

// Default precisions for everything declared after this point without an
// explicit qualifier: floats and samplers are mediump, ints are highp.
precision mediump float;
precision highp int;
precision mediump sampler;
precision mediump sampler2D;
precision mediump samplerCube;
// Output patch records; written with imageStore only (writeonly), one uint per texel.
uniform writeonly layout(set=0,r32ui,binding=BINDING_0) highp uimageBuffer FinalPatchByteAddressBufferUAV;
// Counter buffer; the entry point performs imageAtomicAdd on it to reserve an output slot.
uniform layout(set=0,r32ui,binding=BINDING_1) highp uimageBuffer IndirectDrawBufferUAV;
// Packed quad-tree nodes, indexed by the value fetched from FinalNodesSRV.
layout(set=0,binding=BINDING_2) buffer  TreeNodeBufferSRV_BUFFER { FQuadTreeNodePack TreeNodeBufferSRV[]; };
// Per-invocation node index list, read with texelFetch at gl_GlobalInvocationID.x.
layout(set=0, binding=BINDING_3) uniform highp usamplerBuffer FinalNodesSRV;
// Material data, addressed as TreeIndex*3 and TreeIndex*3+1 by the entry point.
// Declared without an explicit precision, so the mediump float default above applies.
layout(set=0,binding=BINDING_4) buffer  TreeMaterialBuffer_BUFFER { vec4 TreeMaterialBuffer[]; };
// Per-tree records, indexed by the node's TreeIndex.
layout(set=0,binding=BINDING_5) buffer  TreeBuffer_BUFFER { FQuadTreeGPU TreeBuffer[]; };
// Gathers the even-numbered bits of `code` into the low 16 bits of the result
// (one half of a 2D bit-interleaved coordinate). Pass `code >> 1u` to gather
// the odd-numbered bits instead.
highp uint CompactEvenBits(highp uint code)
{
	highp uint bits = code & 0x55555555u;
	bits = (bits ^ (bits >> 1u)) & 0x33333333u;
	bits = (bits ^ (bits >> 2u)) & 0x0F0F0F0Fu;
	bits = (bits ^ (bits >> 4u)) & 0x00FF00FFu;
	bits = (bits ^ (bits >> 8u)) & 0x0000FFFFu;
	return bits;
}

// Compute entry point: for each node listed in FinalNodesSRV, every (y, z)
// local invocation reserves one output slot via an atomic counter and writes
// three packed 32-bit words describing its sub-patch to
// FinalPatchByteAddressBufferUAV.
//
// Compared with the generated original, this version drops values that were
// loaded or computed but never consumed (a second read of
// TreeMaterialBuffer[TreeIndex*3+1], two unused vec4 lanes, a dead write-back
// and `|0u` no-ops) and shares the bit de-interleave between x and y. The
// values written to the output buffers are unchanged.
void main_00000000_00000000()
{
	highp uint maxPatchCount = cu_u[0].x;
	// Upper bound on the invocations that do work; stored in HZBSize.w per the
	// @PackedGlobals header comment.
	highp uint nodeCount = uint(cu_h[0].w);
	if (gl_GlobalInvocationID.x >= nodeCount)
	{
		return;
	}

	// Indirect lookup: invocation index -> node index -> packed node.
	FQuadTreeNodePack node = TreeNodeBufferSRV[int(uint(texelFetch(FinalNodesSRV, int(gl_GlobalInvocationID.x)).x))];

	// Even bits of node.XY form x, odd bits form y.
	highp uvec2 nodeXY = uvec2(CompactEvenBits(node.XY), CompactEvenBits(node.XY >> 1u));
	highp uint lod = node.LodFlag >> 24u;
	highp int treeIndex = int(node.TreeIndex);
	highp uint materialBase = uint(treeIndex) * 3u;

	// High and low 16-bit halves of the per-tree XY word.
	highp uint treeXY = TreeBuffer[treeIndex].XY;
	highp uvec2 treeCoord = uvec2((treeXY & 0xFFFF0000u) >> 16u, treeXY & 0x0000FFFFu);
	highp uvec4 layerIndexes = uvec4(TreeBuffer[treeIndex].LayerIndexes);

	// Selects both the counter (stride of 5 uints, offset 1) and the output region.
	highp uint drawSlot = uint(TreeMaterialBuffer[int(materialBase + 1u)].z);

	// The two material fields are narrowed through mediump before the uint
	// conversion, exactly as in the generated code, so rounding is unchanged.
	highp vec4 material = TreeMaterialBuffer[int(materialBase)];
	mediump float materialX = material.x;
	mediump float materialZ = material.z;

	// Sub-patch origin: node coordinate doubled, plus the local (y, z) offset
	// scaled by 2^lod, all multiplied by 4.
	highp uvec2 patchOrigin = ((nodeXY * uvec2(2u, 2u)) + (gl_LocalInvocationID.yz * uvec2(uint(1 << int(lod))))) * uvec2(4u, 4u);

	// Reserve the next free record in this draw slot's region.
	// NOTE(review): the returned slot is not compared against maxPatchCount, so a
	// full region would spill into the next one - confirm the dispatch
	// guarantees this cannot happen.
	highp uint patchSlot = imageAtomicAdd(IndirectDrawBufferUAV, int((drawSlot * 5u) + 1u), 1u);
	// Records are 6 uints apart; only words 2, 3 and 4 are written here.
	highp int recordBase = (int(drawSlot * maxPatchCount) + int(patchSlot)) * 6;

	// Word 2: lod[0..3] | treeCoord.x[12..19] | treeCoord.y[20..27] | local y[28..29] | local z[30..31]
	highp uint packedLodAndTree =
		(lod & 15u)
		| ((treeCoord.x << 12u) & 0x000FF000u)
		| ((treeCoord.y << 20u) & 0x0FF00000u)
		| ((gl_LocalInvocationID.y << 28u) & 0x30000000u)
		| ((gl_LocalInvocationID.z << 30u) & 0xC0000000u);
	imageStore(FinalPatchByteAddressBufferUAV, int(recordBase + 2), uvec4(packedLodAndTree));

	// Word 3: patchOrigin.x in the low half, patchOrigin.y shifted into the high half.
	imageStore(FinalPatchByteAddressBufferUAV, int(recordBase + 3), uvec4((patchOrigin.y << 16u) | patchOrigin.x));

	// Word 4: four 5-bit layer indexes [0..19] | materialX[20..30] | materialZ[31]
	highp uint packedLayers =
		(layerIndexes.x & 31u)
		| ((layerIndexes.y << 5u) & 0x000003E0u)
		| ((layerIndexes.z << 10u) & 0x00007C00u)
		| ((layerIndexes.w << 15u) & 0x000F8000u)
		| ((uint(materialX) << 20u) & 0x7FF00000u)
		| ((uint(materialZ) << 31u) & 0x80000000u);
	imageStore(FinalPatchByteAddressBufferUAV, int(recordBase + 4), uvec4(packedLayers));
}

Any help would be great,

Cheers

Top replies

Peter Harris 6 months ago +2 verified

chashao said: Where is the remain cycle(181 - 40 - 28 = 113kilo-cycles) running? Any difference will be time spent in the GPU front-end handling scheduling. Scheduling overhead tends to be caused by...

Parents

+1 Peter Harris 6 months ago

chashao said:
Where is the remain cycle(181 - 40 - 28 = 113kilo-cycles) running?

Any difference will be time spent in the GPU front-end handling scheduling. Scheduling overhead tends to be caused by:

* High density of small workloads (e.g. many small compute kernels).

* High densities of pipeline barriers (e.g. many separate pipeline barriers between workloads).

* High numbers of workloads that trigger L2 cache flushes.

chashao said:
Below is my compute shader, I use offline compiler it say shader is bound by LD/ST， But "Mali Load/Store Usage" is 24.3kilo-cycles.

The offline compiler can only analyze the shader code, and your last graph does indeed show that load/store has the highest usage. The offline compiler knows nothing about the surrounding use of the API, which can cause other inefficiencies.
Cancel
Vote up +2 Vote down

Cancel

Reply

+1 Peter Harris 6 months ago

chashao said:
Where is the remain cycle(181 - 40 - 28 = 113kilo-cycles) running?

Any difference will be time spent in the GPU front-end handling scheduling. Scheduling overhead tends to be caused by:

* High density of small workloads (e.g. many small compute kernels).

* High densities of pipeline barriers (e.g. many separate pipeline barriers between workloads).

* High numbers of workloads that trigger L2 cache flushes.

chashao said:
Below is my compute shader, I use offline compiler it say shader is bound by LD/ST， But "Mali Load/Store Usage" is 24.3kilo-cycles.

The offline compiler can only analyze the shader code, and your last graph does indeed show that load/store has the highest usage. The offline compiler knows nothing about the surrounding use of the API, which can cause other inefficiencies.
Cancel
Vote up +2 Vote down

Cancel

Children

No data