This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Strange compiler results

Using the following (example) shader, I encountered a behaviour I cannot make sense of:

 

#version 310 es

precision mediump float;
precision lowp int;
precision mediump sampler2D;

uniform sampler2D input0;
uniform sampler2D input1;

uniform mediump sampler2D weights0;
uniform mediump sampler2D weights1;

in highp vec2 texCoord;
flat in lowp ivec2 wgtCoord;

layout(location=0) out vec4 fragmentColor0;

const lowp vec4 ones=vec4(1,1,1,1);

const lowp float screenScale = 1.0;
const lowp vec2 center = vec2(0.5,0.5);

void fetchWeights(in sampler2D weights,in ivec2 coord,inout vec4 coeffs1[3],inout vec4 coeffs2[3]) {
  coeffs1[0] = texelFetch(weights,coord,0);
  coeffs1[1] = texelFetch(weights,coord+ivec2(1,0),0);
  coeffs1[2] = texelFetch(weights,coord+ivec2(2,0),0);
  coeffs2[0] = texelFetch(weights,coord+ivec2(3,0),0);
  coeffs2[1] = texelFetch(weights,coord+ivec2(4,0),0);
  coeffs2[2] = texelFetch(weights,coord+ivec2(5,0),0);
}


void fetch(in sampler2D source,in int y,inout vec4 result[3]) {
  ivec2 tc = ivec2(screenScale*(gl_FragCoord.xy-center)+texCoord);
  result[0] = max(vec4(0),texelFetch(source,tc-ivec2(1,y),0));
  result[1] = max(vec4(0),texelFetch(source,tc,0));
  result[2] = max(vec4(0),texelFetch(source,tc+ivec2(1,y),0));
}

void procLine1(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
  vec4 coeffs1[3],coeffs2[3];
  fetchWeights(weights,wc,coeffs1,coeffs2);
  wc.y+=1;
  fragmentColor0.r += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
  fragmentColor0.g += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}

void procLine2(in sampler2D,inout mediump ivec2 wc,in vec4 line[3]) {
  vec4 coeffs1[3],coeffs2[3];
  fetchWeights(weights0,wc,coeffs1,coeffs2);
  wc.y+=1;
  fragmentColor0.b += dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones);
  fragmentColor0.a += dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2]);
}


void main(void) {
  vec4 line[3];
  mediump ivec2 wc = wgtCoord;
  fragmentColor0 = vec4(0);
  fetch(input0,-1,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  fetch(input0,0,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  fetch(input0,1,line);
  procLine1(weights0,wc,line);
  procLine2(weights0,wc,line);
  /*  
  fetch(input1,-1,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  fetch(input1,0,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  fetch(input1,1,line);
  procLine1(weights1,wc,line);
  procLine2(weights1,wc,line);
  */ 
}

When using the offline compiler, I get the following statistics (using -c Mali-T880):

8 work registers used, 1 uniform registers used, spilling used.

			A	L/S	T	Bound
Instructions Emitted:	68	5	43	A
Shortest Path Cycles:	23	5	43	T
Longest Path Cycles:	23	5	43	T

Now, when I re-enable the comment-block in the code above, I would expect the number of texture fetches to double and L/S to remain as it is.

Instead I get this:

8 work registers used, 1 uniform registers used, spilling used.

			A	L/S	T	Bound
Instructions Emitted:	130	79	86	A
Shortest Path Cycles:	44	79	86	T
Longest Path Cycles:	44	79	86	T

Is this a case of bad register allocation by the compiler ? The activated code-block has no data-dependency (at least not in a RAW sense) with the preceding block of code.

 

 

Parents
  • I don't know what this shader is used for so it's kind of difficult to test it.

    Still, if you modify the code like this :

    vec2 procLine(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
      vec4 coeffs1[3],coeffs2[3];
      fetchWeights(weights,wc,coeffs1,coeffs2);
      wc.y+=1;
      return vec2(
        dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones),
        dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2])
      );
    }
    
    
    void procLines(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3], inout vec4 fgtcolor) {
      vec2 proced = procLine(weights, wc, line);
      fgtcolor += vec4(proced, proced);
    }
    
    
    void main(void) {
      vec4 line[3];
      mediump ivec2 wc = wgtCoord;
      fragmentColor0 = vec4(0);
      fetch(input0,-1,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,0,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,1,line);
      procLines(weights0,wc,line, fragmentColor0);
    
      fetch(input1,-1,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,0,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,1,line);
      procLines(weights1,wc,line, fragmentColor0);
    
    }

    Malisc returns this :
    ARM Mali Offline Compiler v5.3.0
    (C) Copyright 2007-2016 ARM Limited.
    All rights reserved.

    No driver specified, using "Mali-T600_r10p0-00rel0" as default.

    No core specified, using "Mali-T880" as default.

    No core revision specified, using "r2p0" as default.


    8 work registers used, 1 uniform registers used, spilling used.

                A    L/S    T    Bound
    Instructions Emitted:    73    18    50    A
    Shortest Path Cycles:    24    18    50    T
    Longest Path Cycles:    24    18    50    T

    A = Arithmetic, L/S = Load/Store, T = Texture
    Note: The cycles counts do not include possible stalls due to cache misses.
    Note: Shaders with loops may return "N/A" for cycle counts if the number of cycles cannot be statically determined.

    Compilation succeeded.



    That said, I'm not sure if that gives the exact same result.

Reply
  • I don't know what this shader is used for so it's kind of difficult to test it.

    Still, if you modify the code like this :

    vec2 procLine(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3]) {
      vec4 coeffs1[3],coeffs2[3];
      fetchWeights(weights,wc,coeffs1,coeffs2);
      wc.y+=1;
      return vec2(
        dot(line[0]*coeffs1[0]+line[1]*coeffs1[1]+line[2]*coeffs1[2],ones),
        dot(line[0],coeffs2[0])+dot(line[1],coeffs2[1])+dot(line[2],coeffs2[2])
      );
    }
    
    
    void procLines(in sampler2D weights,inout mediump ivec2 wc,in vec4 line[3], inout vec4 fgtcolor) {
      vec2 proced = procLine(weights, wc, line);
      fgtcolor += vec4(proced, proced);
    }
    
    
    void main(void) {
      vec4 line[3];
      mediump ivec2 wc = wgtCoord;
      fragmentColor0 = vec4(0);
      fetch(input0,-1,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,0,line);
      procLines(weights0,wc,line, fragmentColor0);
      fetch(input0,1,line);
      procLines(weights0,wc,line, fragmentColor0);
    
      fetch(input1,-1,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,0,line);
      procLines(weights1,wc,line, fragmentColor0);
      fetch(input1,1,line);
      procLines(weights1,wc,line, fragmentColor0);
    
    }

    Malisc returns this :
    ARM Mali Offline Compiler v5.3.0
    (C) Copyright 2007-2016 ARM Limited.
    All rights reserved.

    No driver specified, using "Mali-T600_r10p0-00rel0" as default.

    No core specified, using "Mali-T880" as default.

    No core revision specified, using "r2p0" as default.


    8 work registers used, 1 uniform registers used, spilling used.

                A    L/S    T    Bound
    Instructions Emitted:    73    18    50    A
    Shortest Path Cycles:    24    18    50    T
    Longest Path Cycles:    24    18    50    T

    A = Arithmetic, L/S = Load/Store, T = Texture
    Note: The cycles counts do not include possible stalls due to cache misses.
    Note: Shaders with loops may return "N/A" for cycle counts if the number of cycles cannot be statically determined.

    Compilation succeeded.



    That said, I'm not sure if that gives the exact same result.

Children
No data