Hi guys: I have a compute shader for landscape patch culling that runs before the base pass in UE4. In Streamline I found that the compute shader takes about 40 kilo-cycles, but "GPU active" is 181 kilo-cycles. Where are the remaining cycles (181 - 40 - 28 = 113 kilo-cycles) being spent?
Below is my compute shader. I ran it through the offline compiler, which says the shader is bound by LD/ST, but "Mali Load/Store Usage" is 24.3 kilo-cycles.
// Compiled by HLSLCC 0.73
// @Inputs: u3;-1:gl_GlobalInvocationID,u3;-1:gl_LocalInvocationID
// @PackedGlobals: HZBSize(h:0,4),MaxPatchCount(u:0,1)
// @Samplers: FinalNodesSRV(0:1)
// @UAVs: FinalPatchByteAddressBufferUAV(0:1),IndirectDrawBufferUAV(1:1)
// @SRVs: TreeNodeBufferSRV(2:1),TreeMaterialBuffer(3:1),TreeBuffer(4:1)
#version 310 es
// Packed UB
#define BINDING_6 0
#define BINDING_7 1
// Uniform Texel Buffer
#define BINDING_3 2
// Storage TexelBuffer
#define BINDING_0 3
#define BINDING_1 4
// Storage Buffer
#define BINDING_2 5
#define BINDING_4 6
#define BINDING_5 7
#extension GL_EXT_gpu_shader5 : enable
#extension GL_EXT_texture_buffer : enable
#extension GL_EXT_texture_cube_map_array : enable
#extension GL_EXT_shader_io_blocks : enable
#extension GL_OES_shader_image_atomic : enable

// One work group is 32 x 2 x 2 invocations. In the entry point below, x selects
// an entry of FinalNodesSRV and (y, z) are packed as 1-bit-wide sub-coordinates
// into the output record.
layout( local_size_x = 32, local_size_y = 2, local_size_z = 2 ) in;

// Element type of TreeNodeBufferSRV. MinMaxHeight is not read by this shader.
struct FQuadTreeNodePack
{
    highp uint XY;            // bit-interleaved pair, split by CompactEvenBits16() below
    highp uint MinMaxHeight;
    highp uint LodFlag;       // only the top 8 bits (>> 24) are used here
    highp uint TreeIndex;     // index into TreeBuffer; also scaled by 3 to index TreeMaterialBuffer
};

// Element type of TreeBuffer. Only XY and LayerIndexes are read by this shader.
struct FQuadTreeGPU
{
    highp uint Offset;
    highp uint XY;            // high 16 bits and low 16 bits are unpacked separately
    highp float Padding0;
    highp float Padding1;
    highp vec2 MinPos;
    highp vec2 MaxPos;
    highp vec4 LayerIndexes;  // converted to uvec4; 5 bits of each component are kept
};

// Packed globals; per the @PackedGlobals header, cu_h[0] is HZBSize and
// cu_u[0].x is MaxPatchCount.
layout(set=0, binding=BINDING_6, std140) uniform HLSLCC_CBh { highp vec4 cu_h[1]; };
layout(set=0, binding=BINDING_7, std140) uniform HLSLCC_CBu { highp uvec4 cu_u[1]; };

precision mediump float;
precision highp int;
precision mediump sampler;
precision mediump sampler2D;
precision mediump samplerCube;

uniform writeonly layout(set=0,r32ui,binding=BINDING_0) highp uimageBuffer FinalPatchByteAddressBufferUAV;
uniform layout(set=0,r32ui,binding=BINDING_1) highp uimageBuffer IndirectDrawBufferUAV;

// NOTE(review): the three storage blocks below are listed under @SRVs and are
// only read in this shader; a `readonly` qualifier may be appropriate - confirm
// against the code generator before changing the declarations.
layout(set=0,binding=BINDING_2) buffer TreeNodeBufferSRV_BUFFER { FQuadTreeNodePack TreeNodeBufferSRV[]; };
layout(set=0, binding=BINDING_3) uniform highp usamplerBuffer FinalNodesSRV;
layout(set=0,binding=BINDING_4) buffer TreeMaterialBuffer_BUFFER { vec4 TreeMaterialBuffer[]; };
layout(set=0,binding=BINDING_5) buffer TreeBuffer_BUFFER { FQuadTreeGPU TreeBuffer[]; };

// Keeps the bits at even positions (0, 2, 4, ...) of Value and packs them,
// in order, into the low 16 bits of the result. Call with (Value >> 1u) to
// extract the odd-position bits instead.
highp uint CompactEvenBits16(highp uint Value)
{
    highp uint Bits = Value & 0x55555555u;
    Bits = (Bits ^ (Bits >> 1u)) & 0x33333333u;
    Bits = (Bits ^ (Bits >> 2u)) & 0x0F0F0F0Fu;
    Bits = (Bits ^ (Bits >> 4u)) & 0x00FF00FFu;
    return (Bits ^ (Bits >> 8u)) & 0x0000FFFFu;
}

// Entry point body. Each invocation whose global x index is below
// uint(cu_h[0].w) fetches one node, reserves a slot with an atomic add on
// IndirectDrawBufferUAV, and writes three packed 32-bit words (offsets +2, +3
// and +4 of a 6-word record) into FinalPatchByteAddressBufferUAV.
void main_00000000_00000000()
{
    highp uint MaxPatchCount = cu_u[0].x;
    highp uint InvocationLimit = uint(cu_h[0].w);

    // Guard: invocations past the limit do nothing.
    if (gl_GlobalInvocationID.x >= InvocationLimit)
    {
        return;
    }

    // FinalNodesSRV maps the invocation's x index to an index into TreeNodeBufferSRV.
    FQuadTreeNodePack Node = TreeNodeBufferSRV[int(texelFetch(FinalNodesSRV, int(gl_GlobalInvocationID.x)).x)];

    // Split the interleaved XY word: even bits -> .x, odd bits -> .y.
    highp uvec2 NodeXY = uvec2(CompactEvenBits16(Node.XY), CompactEvenBits16(Node.XY >> 1u));

    highp int TreeIndex = int(Node.TreeIndex);
    highp uint TreeIndexU = uint(TreeIndex);
    highp uint LodFlagHi = Node.LodFlag >> 24u;

    // Unpack the two 16-bit halves of the tree's XY word.
    highp uint TreeXYPacked = TreeBuffer[TreeIndex].XY;
    highp uvec2 TreeXY = uvec2((TreeXYPacked & 0xFFFF0000u) >> 16u, TreeXYPacked & 0x0000FFFFu);

    // TreeMaterialBuffer holds 3 vec4 rows per tree. Row 1's .z selects the
    // counter and the output region; row 0's .x and .z are packed into word +4.
    highp uint RegionIndex = uint(TreeMaterialBuffer[int((TreeIndexU * 3u) + 1u)].z);
    highp vec4 MaterialRow0 = TreeMaterialBuffer[int(TreeIndexU * 3u)];
    // Default-precision (mediump) temporaries, matching the generated code.
    float Row0X = MaterialRow0.x;
    float Row0Z = MaterialRow0.z;

    // ((NodeXY * 2) + LocalID.yz * (1 << LodFlagHi)) * 4
    highp uvec2 PatchXY = ((NodeXY * uvec2(2u, 2u)) + (gl_LocalInvocationID.yz * uvec2(uint(1 << int(LodFlagHi))))) * uvec2(4u, 4u);

    highp uvec4 LayerIndexes = uvec4(TreeBuffer[TreeIndex].LayerIndexes);

    // Reserve one slot: increments element (RegionIndex * 5 + 1) of the
    // indirect buffer and returns its previous value.
    highp uint SlotInRegion = imageAtomicAdd(IndirectDrawBufferUAV, int((RegionIndex * 5u) + 1u), 1u);

    // Each record is 6 words; each region holds MaxPatchCount records.
    highp int RecordBase = (int(RegionIndex * MaxPatchCount) + int(SlotInRegion)) * 6;

    // Word +2: LodFlagHi[3:0] | TreeXY.x -> bits 12..19 | TreeXY.y -> bits 20..27
    //          | LocalID.y -> bits 28..29 | LocalID.z -> bits 30..31
    highp uint Word2 =
          (LodFlagHi & 15u)
        | ((TreeXY.x << 12u) & 0x000FF000u)
        | ((TreeXY.y << 20u) & 0x0FF00000u)
        | ((gl_LocalInvocationID.y << 28u) & 0x30000000u)
        | ((gl_LocalInvocationID.z << 30u) & 0xC0000000u);
    imageStore(FinalPatchByteAddressBufferUAV, int(RecordBase + 2), uvec4(Word2));

    // Word +3: PatchXY.y in the high half, PatchXY.x OR-ed into the low half.
    imageStore(FinalPatchByteAddressBufferUAV, int(RecordBase + 3), uvec4((PatchXY.y << 16u) | PatchXY.x));

    // Word +4: four 5-bit layer indexes in bits 0..19, uint(Row0X) in bits
    //          20..30, and the low bit of uint(Row0Z) in bit 31.
    highp uint Word4 =
          (LayerIndexes.x & 31u)
        | ((LayerIndexes.y << 5u) & 0x000003E0u)
        | ((LayerIndexes.z << 10u) & 0x00007C00u)
        | ((LayerIndexes.w << 15u) & 0x000F8000u)
        | ((uint(Row0X) << 20u) & 0x7FF00000u)
        | ((uint(Row0Z) << 31u) & 0x80000000u);
    imageStore(FinalPatchByteAddressBufferUAV, int(RecordBase + 4), uvec4(Word4));
}
Any help would be great,
Cheers