Using the following (example) shader, I encountered a behaviour I cannot make sense of:
#version 310 es
// Fragment shader from the question: accumulates weighted sums of three
// fetched source texels per "line" into the four channels of fragmentColor0.
precision mediump float;
precision lowp int;
precision mediump sampler2D;

// Source textures; input1 is only referenced inside the commented-out block in main().
uniform sampler2D input0;
uniform sampler2D input1;
// Weight textures; weights1 is only referenced inside the commented-out block in main().
uniform mediump sampler2D weights0;
uniform mediump sampler2D weights1;

in highp vec2 texCoord;
// Starting texel coordinate into the weight texture (copied into wc in main()).
flat in lowp ivec2 wgtCoord;

layout(location=0) out vec4 fragmentColor0;

// dot(v, ones) yields the sum of the four components of v.
const lowp vec4 ones=vec4(1,1,1,1);
const lowp float screenScale = 1.0;
const lowp vec2 center = vec2(0.5,0.5);

// Reads six texels from `weights` at LOD 0, at x-offsets 0..5 from `coord`.
// Offsets 0..2 are stored in coeffs1[0..2], offsets 3..5 in coeffs2[0..2].
void fetchWeights(in sampler2D weights,in ivec2 coord,inout vec4 coeffs1[3],inout vec4 coeffs2[3]) {
    coeffs1[0] = texelFetch(weights,coord,0);
    coeffs1[1] = texelFetch(weights,coord+ivec2(1,0),0);
    coeffs1[2] = texelFetch(weights,coord+ivec2(2,0),0);
    coeffs2[0] = texelFetch(weights,coord+ivec2(3,0),0);
    coeffs2[1] = texelFetch(weights,coord+ivec2(4,0),0);
    coeffs2[2] = texelFetch(weights,coord+ivec2(5,0),0);
}

// Fetches three texels from `source` at LOD 0 around an integer coordinate tc
// derived from gl_FragCoord.xy, center, screenScale and texCoord:
//   result[0] at tc-ivec2(1,y), result[1] at tc, result[2] at tc+ivec2(1,y).
// Each fetched value is clamped component-wise to be >= 0 via max(vec4(0), ...).
void fetch(in sampler2D source,in int y,inout vec4 result[3]) {
    ivec2 tc = ivec2(screenScale*(gl_FragCoord.xy-center)+texCoord);
    result[0] = max(vec4(0),texelFetch(source,tc-ivec2(1,y),0));
    result[1] = max(vec4(0),texelFetch(source,tc,0));
    result[2] = max(vec4(0),texelFetch(source,tc+ivec2(1,y),0));
}

// Fetches six weights from `weights` at wc, advances wc.y by one (wc is inout,
// so the caller sees the increment), and accumulates into fragmentColor0:
//   .r += sum of all components of (line[i]*coeffs1[i]) over i = 0..2
//   .g += sum of dot(line[i], coeffs2[i]) over i = 0..2
void procLine1(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
    vec4 coeffs1[3],coeffs2[3];
    fetchWeights(weights,wc,coeffs1,coeffs2);
    wc.y+=1;
    fragmentColor0.r += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
    fragmentColor0.g += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}

// Same computation as procLine1, but accumulates into .b and .a.
// NOTE(review): the sampler parameter is unnamed and the body always reads the
// global weights0, so a call such as procLine2(weights1, ...) in the
// commented-out block would still fetch from weights0 - confirm whether that
// is intended.
void procLine2(in sampler2D,inout mediump ivec2 wc,in vec4 line[3]) {
    vec4 coeffs1[3],coeffs2[3];
    fetchWeights(weights0,wc,coeffs1,coeffs2);
    wc.y+=1;
    fragmentColor0.b += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
    fragmentColor0.a += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}

// Entry point: clears fragmentColor0, then for y = -1, 0, 1 fetches a line from
// input0 and runs procLine1 followed by procLine2 on it. Every procLine call
// increments wc.y, so the six active calls read weights at six successive
// wc.y values starting from wgtCoord.
void main(void) {
    vec4 line[3];
    mediump ivec2 wc = wgtCoord;
    fragmentColor0 = vec4(0);
    fetch(input0,-1,line);
    procLine1(weights0,wc,line);
    procLine2(weights0,wc,line);
    fetch(input0,0,line);
    procLine1(weights0,wc,line);
    procLine2(weights0,wc,line);
    fetch(input0,1,line);
    procLine1(weights0,wc,line);
    procLine2(weights0,wc,line);
    // Disabled second pass over input1/weights1; this is the block the
    // question refers to re-enabling.
    /* fetch(input1,-1,line); procLine1(weights1,wc,line); procLine2(weights1,wc,line); fetch(input1,0,line); procLine1(weights1,wc,line); procLine2(weights1,wc,line); fetch(input1,1,line); procLine1(weights1,wc,line); procLine2(weights1,wc,line); */
}
When using the offline compiler, I get the following statistics (using -c Mali-T880):
8 work registers used, 1 uniform registers used, spilling used.

                        A     L/S   T     Bound
Instructions Emitted:   68    5     43    A
Shortest Path Cycles:   23    5     43    T
Longest Path Cycles:    23    5     43    T
Now, when I re-enable the commented-out block in the code above, I would expect the number of texture fetches to double and L/S to remain as it is.
Instead I get this:
8 work registers used, 1 uniform registers used, spilling used.

                        A     L/S   T     Bound
Instructions Emitted:   130   79    86    A
Shortest Path Cycles:   44    79    86    T
Longest Path Cycles:    44    79    86    T
Is this a case of bad register allocation by the compiler? The re-enabled code block has no data dependency (at least not in a RAW, i.e. read-after-write, sense) on the preceding block of code.
I don't know what this shader is used for so it's kind of difficult to test it.
Still, if you modify the code like this:
// Modified fragment proposed in the answer. It relies on fetchWeights, fetch,
// ones, wgtCoord, fragmentColor0 and the uniforms, none of which are defined
// in this fragment.

// Fetches six weights from `weights` at wc, advances wc.y by one, and returns
//   .x = sum of all components of (line[i]*coeffs1[i]) over i = 0..2
//   .y = sum of dot(line[i], coeffs2[i]) over i = 0..2
// Unlike the question's procLine1/procLine2, it does not write fragmentColor0.
vec2 procLine(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
    vec4 coeffs1[3],coeffs2[3];
    fetchWeights(weights,wc,coeffs1,coeffs2);
    wc.y+=1;
    return vec2(
        dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones),
        dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2])
    );
}

// Calls procLine once and adds the resulting pair to both .rg and .ba of
// fgtcolor (vec4(proced, proced) duplicates the vec2).
// NOTE(review): the question's shader calls procLine1 and procLine2 per line,
// each with its own weight fetch and its own wc.y increment; here there is one
// fetch and one increment per line, so the output presumably differs from the
// original - confirm before treating the statistics as comparable.
void procLines(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3], inout vec4 fgtcolor) {
    vec2 proced = procLine(weights, wc, line);
    fgtcolor += vec4(proced, proced);
}

// Entry point: clears fragmentColor0, then processes three lines (y = -1, 0, 1)
// from input0 with weights0 followed by three lines from input1 with weights1,
// accumulating every result into fragmentColor0.
void main(void) {
    vec4 line[3];
    mediump ivec2 wc = wgtCoord;
    fragmentColor0 = vec4(0);
    fetch(input0,-1,line);
    procLines(weights0,wc,line, fragmentColor0);
    fetch(input0,0,line);
    procLines(weights0,wc,line, fragmentColor0);
    fetch(input0,1,line);
    procLines(weights0,wc,line, fragmentColor0);
    fetch(input1,-1,line);
    procLines(weights1,wc,line, fragmentColor0);
    fetch(input1,0,line);
    procLines(weights1,wc,line, fragmentColor0);
    fetch(input1,1,line);
    procLines(weights1,wc,line, fragmentColor0);
}
malisc (the Mali Offline Compiler) returns this:
ARM Mali Offline Compiler v5.3.0
(C) Copyright 2007-2016 ARM Limited.
All rights reserved.

No driver specified, using "Mali-T600_r10p0-00rel0" as default.
No core specified, using "Mali-T880" as default.
No core revision specified, using "r2p0" as default.

8 work registers used, 1 uniform registers used, spilling used.

                        A     L/S   T     Bound
Instructions Emitted:   73    18    50    A
Shortest Path Cycles:   24    18    50    T
Longest Path Cycles:    24    18    50    T

A = Arithmetic, L/S = Load/Store, T = Texture
Note: The cycles counts do not include possible stalls due to cache misses.
Note: Shaders with loops may return "N/A" for cycle counts if the number of cycles cannot be statically determined.

Compilation succeeded.
That said, I'm not sure if that gives the exact same result.