Serious problems with handling of mat4x3


My shader codes:

Vertex Shader:

layout(std140)uniform _Global
layout(row_major)mat4 ProjMatrix;
vec3 CamAngVel;
layout(row_major)mat4x3 CamMatrix;
vec4 ClipPlane;
mediump float AllowBackFlip;
float TesselationDensity;
vec2 GrassRangeMulAdd;
mediump vec4 BendFactor;
layout(std140)uniform _ObjMatrix
layout(row_major)mat4x3 ViewMatrix[256];
layout(std140)uniform _ObjVel
vec3 ObjVel[256];
layout(std140)uniform _Mesh
float VtxHeightmap;
float VtxSkinning;
mediump vec4 Highlight;
mediump vec3 ObjAngVel;
layout(location=0)in vec4 ATTR0;
layout(location=1)in mediump vec3 ATTR1;
layout(location=2)in mediump vec3 ATTR2;
layout(location=3)in mediump vec4 ATTR3;
layout(location=4)in vec2 ATTR4;
layout(location=5)in vec2 ATTR5;
layout(location=6)in vec2 ATTR6;
layout(location=7)in mediump float ATTR7;
layout(location=8)in vec4 ATTR8;
layout(location=9)in mediump vec4 ATTR9;
layout(location=10)in mediump vec4 ATTR10;
layout(location=11)in mediump vec4 ATTR11;
out vec2 IO0;
out vec3 IO1;
out vec3 IO2;
out mediump vec3 IO3;
vec2 _82;
void main()
vec3 _103=(ViewMatrix[uint(gl_InstanceID)][0]*ATTR0.x)+((ViewMatrix[uint(gl_InstanceID)][1]*ATTR0.y)+((ViewMatrix[uint(gl_InstanceID)][2]*ATTR0.z)+ViewMatrix[uint(gl_InstanceID)][3]));
mediump vec3 _36=cross(,ObjAngVel);
float _120=_103.x;
float _124=_103.z;

Pixel Shader:

precision mediump float;
precision highp int;
struct MaterialClass
vec4 _color;
vec4 _ambient_specular;
vec4 _sss_glow_rough_bump;
highp vec4 _texscale_detscale_detpower_reflect;
struct DeferredSolidOutput
vec4 out0;
vec4 out1;
vec4 out2;
layout(std140)uniform _Global
layout(row_major)highp mat4 ProjMatrix;
highp vec3 CamAngVel;
layout(row_major)highp mat4x3 CamMatrix;
highp vec4 ClipPlane;
float AllowBackFlip;
highp float TesselationDensity;
highp vec2 GrassRangeMulAdd;
vec4 BendFactor;
layout(std140)uniform _Mesh
highp float VtxHeightmap;
highp float VtxSkinning;
vec4 Highlight;
vec3 ObjAngVel;
layout(std140)uniform _Material
MaterialClass Material;
uniform highp sampler2D Nrm;
uniform highp sampler2D Col;
in highp vec2 IO0;
in highp vec3 IO1;
in highp vec3 IO2;
in vec3 IO3;
layout(location=0)out vec4 RT0;
layout(location=1)out vec4 RT1;
layout(location=2)out vec4 RT2;
void main()
vec4 _31=texture(Nrm,IO0);
vec3 _36=normalize(IO3);
vec4 _38=texture(Col,IO0);
vec3 _43=(*;
vec3 _60;
DeferredSolidOutput param_var_output;
vec3 _53=(_60*0.5)+vec3(0.5);
vec3 _59=((IO2/vec3(IO1.z))*0.5)+vec3(0.5);

I'm facing serious problems with the handling of mat4x3 on ARM Mali GPUs.

The above code works fine on desktop OpenGL (GeForce) and on Apple iOS GL ES (iPad mini 2), but when running on these two Android devices:
Samsung Galaxy Note 4 (ARM Mali-T760)
Huawei Mate 20 X (ARM Mali-G76)
It doesn't work.

layout(std140)uniform _ObjMatrix
layout(row_major)mat4x3 ViewMatrix[256];
For this UBO, I query the block size using the following code:
GLint size=0; glGetActiveUniformBlockiv(prog, i, GL_UNIFORM_BLOCK_DATA_SIZE, &size);
On desktop and Apple I'm getting the expected size of 3*Vec4 per matrix * 256 elements (total size 12288 bytes) = OK
On ARM Mali I'm getting 4*Vec4 per matrix * 256 elements (total size 16384 bytes) = Not OK

This suggests that Mali is using mat4x4 instead of mat4x3.

Then I check
GLint offset =-1; glGetActiveUniformsiv(prog, 1, &uni, GL_UNIFORM_OFFSET , &offset );
GLint array_stride=-1; glGetActiveUniformsiv(prog, 1, &uni, GL_UNIFORM_ARRAY_STRIDE , & array_stride);
GLint matrix_stride=-1; glGetActiveUniformsiv(prog, 1, &uni, GL_UNIFORM_MATRIX_STRIDE, &matrix_stride);

On all platforms I'm getting the correct GL_UNIFORM_ARRAY_STRIDE of 48 and GL_UNIFORM_MATRIX_STRIDE of 16.

However, this doesn't work on Mali!

When I set the UBO data:

glBindBuffer (GL_UNIFORM_BUFFER, buffer.buffer);
glBufferSubData(GL_UNIFORM_BUFFER, 0, buffer.size, data);

for just 2 matrices, laid out as mat4x3:
first matrix: byte offset = 0, size = 3 * Vec4
second matrix: byte offset = 3 * Vec4 (48), size = 3 * Vec4
then rendering doesn't work correctly on ARM Mali.

If I treat them as mat4x4 instead:
first matrix: byte offset = 0, size = 4 * Vec4
second matrix: byte offset = 4 * Vec4 (64), size = 4 * Vec4
then rendering starts to work OK.

So even though I'm requesting mat4x3 (3 x Vec4), I'm actually getting a bigger UBO that uses mat4x4 (4 x Vec4) = fail,
and the driver reports an array stride of 48 for mat4x3 instead of the 64 it actually allocated for mat4x4 = fail.

Is there an easy workaround for this problem without having to wait for a driver update?

More questions in this forum