SIMD NEON matrix multiplication performs worse than the simple scalar version

Hi, here is my case: I am making a game and have started optimizing its mathematical calculations. I spent a long time (with interruptions) writing the math routines in NEON assembly (files with the .s extension). In the end I got them working, but when I measured how much they improve performance, it turned out that they do not improve it at all; on the contrary, they make it worse. That is, multiplying 60 matrices the usual way takes 0.000039 seconds, while the NEON version takes 0.001612 seconds =))

  Can you tell me how this is even possible, and where I might have made a mistake?

My simple matrix multiplication:

 // where myMatrix holds its data as float elements[4][4]

// Multiplies two 4x4 matrices and returns the product by value.
//
// For every index pair (c, r) the result is
//     out.elements[c][r] = sum over k of mat1.elements[k][r] * mat2.elements[c][k]
// with k running from 0 to 3.
myMatrix operator*(myMatrix mat1, myMatrix mat2)
{
    myMatrix product;

    // r is the second index of the output element, c the first; all
    // entries sharing the same r are filled before moving to the next r.
    for (int r = 0; r < 4; ++r)
    {
        for (int c = 0; c < 4; ++c)
        {
            // The four terms stay in one expression so they are summed
            // left to right (k = 0, 1, 2, 3) without a separate zero-initialised
            // accumulator, which keeps the floating-point result unchanged.
            product.elements[c][r] =
                  mat1.elements[0][r] * mat2.elements[c][0]
                + mat1.elements[1][r] * mat2.elements[c][1]
                + mat1.elements[2][r] * mat2.elements[c][2]
                + mat1.elements[3][r] * mat2.elements[c][3];
        }
    }

    return product;
}


And my NEON matrix multiplication:
// NEON matrix routine; each matrix is passed as a flat array of 16 floats.
// The asm("myMatrixFunction") label ties this declaration to the symbol
// "myMatrixFunction", which the .s file exports with .global.
// Parameters:
//   losFloat - non-const float pointer; presumably the 16-float output buffer
//              (the assembly stores 64 bytes through its first argument register)
//   er, sef  - the two const float[16] input matrices
// NOTE(review): no return type or terminating ';' is visible in this excerpt;
// confirm both against the real header before relying on this declaration.
 myMatrixFunctionCode (float* losFloat, const float er[16], const float sef[16]) asm("myMatrixFunction")

//*** .s file

.text
.syntax unified

.balign 4
.global myMatrixFunction
.thumb
.thumb_func

@ myMatrixFunction: 4x4 single-precision matrix product using NEON.
@
@ Register use on entry (assumes the standard ARM procedure-call register
@ assignment for a three-pointer prototype - confirm against the C declaration):
@   r0 - destination, 16 floats are stored here
@   r1 - first input matrix, 16 floats, loaded into q8-q11
@   r2 - second input matrix, 16 floats, loaded into q0-q3
@
@ Each 4-float group of the output is
@   out[j] = q8*s[j][0] + q9*s[j][1] + q10*s[j][2] + q11*s[j][3]
@ where s[j][k] is float number 4*j+k of the second input.
@
@ Touches d0-d7 and d16-d31 only; d8-d15 are not used. r0 is advanced by
@ 64 bytes and r1/r2 by 64 bytes each because of the post-increment
@ addressing on every load and store.
myMatrixFunction:

vld1.32 {d16-d19}, [r1]!    @ q8, q9   <- floats 0..7  of the first matrix
vld1.32 {d20-d23}, [r1]!    @ q10, q11 <- floats 8..15 of the first matrix
vld1.32 {d0-d3}, [r2]!      @ q0, q1   <- floats 0..7  of the second matrix
vld1.32 {d4-d7}, [r2]!      @ q2, q3   <- floats 8..15 of the second matrix

@ mul_los_matrix store_q, column0_d, column1_d
@   store_q   - q register that receives one 4-float group of the result
@   column0_d - d register holding scalars 0 and 1 of the current group
@   column1_d - d register holding scalars 2 and 3 of the current group
@ Computes store_q = q8*c0[0] + q9*c0[1] + q10*c1[0] + q11*c1[1].
.macro mul_los_matrix store_q, column0_d, column1_d
vmul.f32 \store_q, q8, \column0_d[0] @ multiply matrix col 0 by scalar 0 (starts the sum)
vmla.f32 \store_q, q9, \column0_d[1] @ multiply-accumulate matrix col 1 by scalar 1
vmla.f32 \store_q, q10, \column1_d[0] @ multiply-accumulate matrix col 2 by scalar 2
vmla.f32 \store_q, q11, \column1_d[1] @ multiply-accumulate matrix col 3 by scalar 3
.endm

mul_los_matrix q12, d0, d1 @ matrix 0 * matrix 1 col 0
mul_los_matrix q13, d2, d3 @ matrix 0 * matrix 1 col 1
mul_los_matrix q14, d4, d5 @ matrix 0 * matrix 1 col 2
mul_los_matrix q15, d6, d7 @ matrix 0 * matrix 1 col 3

vst1.32 {d24-d27}, [r0]!    @ store q12, q13 (result floats 0..7)
vst1.32 {d28-d31}, [r0]!    @ store q14, q15 (result floats 8..15)

@ Fall straight through to the return: a branch to a label placed on the
@ very next instruction only costs a taken branch and does no work.
bx lr

.end


Everything works for me: it computes everything correctly and so on. But it does it very slowly, and that is my problem. I will be glad and grateful for any help.