This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Strange compiler results

Using the following (example) shader, I encountered a behaviour I cannot make sense of:

 

#version 310 es

precision mediump float;
precision lowp int;
precision mediump sampler2D;

uniform sampler2D input0;
uniform sampler2D input1;

uniform mediump sampler2D weights0;
uniform mediump sampler2D weights1;

in highp vec2 texCoord;
flat in lowp ivec2 wgtCoord;

layout(location=0) out vec4 fragmentColor0;

const lowp vec4 ones=vec4(1,1,1,1);

const lowp float screenScale = 1.0;
const lowp vec2 center = vec2(0.5,0.5);

void fetchWeights(in sampler2D weights,in ivec2 coord,inout vec4 coeffs1[3],inout vec4 coeffs2[3]) {
  coeffs1[0] = texelFetch(weights,coord,0);
  coeffs1[1] = texelFetch(weights,coord+ivec2(1,0),0);
  coeffs1[2] = texelFetch(weights,coord+ivec2(2,0),0);
  coeffs2[0] = texelFetch(weights,coord+ivec2(3,0),0);
  coeffs2[1] = texelFetch(weights,coord+ivec2(4,0),0);
  coeffs2[2] = texelFetch(weights,coord+ivec2(5,0),0);
}


void fetch(in sampler2D source,in int y,inout vec4 result[3]) {
  ivec2 tc = ivec2(screenScale*(gl_FragCoord.xy-center)+texCoord);
  result[0] = max(vec4(0),texelFetch(source,tc-ivec2(1,y),0));
  result[1] = max(vec4(0),texelFetch(source,tc,0));
  result[2] = max(vec4(0),texelFetch(source,tc+ivec2(1,y),0));
}

void procLine1(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
  vec4 coeffs1[3],coeffs2[3];
  fetchWeights(weights,wc,coeffs1,coeffs2);
  wc.y+=1;
  fragmentColor0.r += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
  fragmentColor0.g += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}

void procLine2(in sampler2D,inout mediump ivec2 wc,in vec4 line[3]) {
  vec4 coeffs1[3],coeffs2[3];
  fetchWeights(weights0,wc,coeffs1,coeffs2);
  wc.y+=1;
  fragmentColor0.b += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
  fragmentColor0.a += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}


void main(void) {
  vec4 line[3];
  mediump ivec2 wc = wgtCoord;
  fragmentColor0 = vec4(0);
  fetch(input0,-1,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  fetch(input0,0,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  fetch(input0,1,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  /*  
  fetch(input1,-1,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  fetch(input1,0,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  fetch(input1,1,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  */ 
}

When using the offline compiler, I get the following statistics (using -c Mali-T880):

8 work registers used, 1 uniform registers used, spilling used.

			A	L/S	T	Bound
Instructions Emitted:	68	5	43	A
Shortest Path Cycles:	23	5	43	T
Longest Path Cycles:	23	5	43	T

Now, when I re-enable the comment-block in the code above, I would expect the number of texture fetches to double and L/S to remain as it is.

Instead I get this:

8 work registers used, 1 uniform registers used, spilling used.

			A	L/S	T	Bound
Instructions Emitted:	130	79	86	A
Shortest Path Cycles:	44	79	86	T
Longest Path Cycles:	44	79	86	T

Is this a case of bad register allocation by the compiler ? The activated code-block has no data-dependency (at least not in a RAW sense) with the preceding block of code.

 

 

  • It seems that these numbers stay the same if you replace weights1 by weights0 in the commented block, and uncomment the block.

    So it might be due to the driver inlining everything at once and abusing some sort of "stack" ?

  • The second case is generating a lot more spills and loads from the stack. No specifically good reason - just "bad luck" in the compiler heuristics.
  • I don't know what this shader is used for so it's kind of difficult to test it.

    Still, if you modify the code like this :

    vec2 procLine(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
      vec4 coeffs1[3],coeffs2[3];
      fetchWeights(weights,wc,coeffs1,coeffs2);
      wc.y+=1;
      return vec2(
        dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones),
        dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2])
      );
    }
    
    
    void procLines(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3], inout vec4 fgtcolor) {
      vec2 proced = procLine(weights, wc, line);
      fgtcolor += vec4(proced, proced);
    }
    
    
    void main(void) {
      vec4 line[3];
      mediump ivec2 wc = wgtCoord;
      fragmentColor0 = vec4(0);
      fetch(input0,-1,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,0,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,1,line);
      procLines(weights0,wc,line, fragmentColor0);
    
      fetch(input1,-1,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,0,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,1,line);
      procLines(weights1,wc,line, fragmentColor0);
    
    }

    Malisc returns this :
    ARM Mali Offline Compiler v5.3.0
    (C) Copyright 2007-2016 ARM Limited.
    All rights reserved.

    No driver specified, using "Mali-T600_r10p0-00rel0" as default.

    No core specified, using "Mali-T880" as default.

    No core revision specified, using "r2p0" as default.


    8 work registers used, 1 uniform registers used, spilling used.

                A    L/S    T    Bound
    Instructions Emitted:    73    18    50    A
    Shortest Path Cycles:    24    18    50    T
    Longest Path Cycles:    24    18    50    T

    A = Arithmetic, L/S = Load/Store, T = Texture
    Note: The cycles counts do not include possible stalls due to cache misses.
    Note: Shaders with loops may return "N/A" for cycle counts if the number of cycles cannot be statically determined.

    Compilation succeeded.



    That said, I'm not sure if that gives the exact same result.