=======================================
For 4x4 matrix multiplication, my hand-written NEON code is slower than plain C code compiled
with the auto-vectorization option. (Xilinx Zynq 702 EVM board - Cortex-A9, with GCC compiler options
-mfloat-abi=softfp -mfpu=neon-fp16 -ftree-vectorize)
=========================================
Could you tell me what's wrong?
Here are the 3 versions:
1 - natural code
2 - neon code (intrinsic)
3 - neon inline asm code
The results of all three versions are identical (they match exactly).
My problem is with the speed results:
=> the natural code is faster than the NEON intrinsic code.
=> the intrinsic code is faster than the inline asm code.
The natural code takes 119 lines in the asm file.
The NEON intrinsic code takes 46 lines in the asm file.
The inline asm code takes 23 lines in the asm file.
For two 1280x720 source arrays, the results (in time ticks) on the Zynq are as follows.
natural - 7160144 tick (100.0%)
intrinsic - 7847183 tick (9.6% ↑)
inline asm - 8720648 tick (21.8%↑)
Here is the C code for each version.
SRC_T and RES_T are both float.
FP32X4 and FP32X2 are float32x4_t and float32x2_t, respectively.
========================================================
/*
 * Plain-C 4x4 matrix product: res4x4 = src0_4x4 * src1_4x4.
 *
 * Each matrix is 16 consecutive elements addressed as [row * 4 + col].
 * Output elements are written in ascending index order, and every element
 * is computed as one expression summing its four products left to right.
 *
 * Always returns 0.
 */
static int nat_matrix_mul( const SRC_T * RESTRICT src0_4x4, const SRC_T * RESTRICT src1_4x4,
                           RES_T * RESTRICT res4x4)
{
    int row;
    int col;

    for (row = 0; row < 4; row++) {
        /* The four elements of the current row of the left operand. */
        const SRC_T *lhs = &src0_4x4[row * 4];

        for (col = 0; col < 4; col++) {
            /* Top of the current column of the right operand; stride is 4. */
            const SRC_T *rhs = &src1_4x4[col];

            res4x4[row * 4 + col] = (lhs[0] * rhs[0]) + (lhs[1] * rhs[4]) +
                                    (lhs[2] * rhs[8]) + (lhs[3] * rhs[12]);
        }
    }

    return 0;
}
/*
 * NEON-intrinsic 4x4 matrix product: res4x4 = src0_4x4 * src1_4x4.
 *
 * Each matrix is 16 consecutive floats addressed as [row * 4 + col].
 * Every output row is built as a sum of the four rows of src1_4x4, each
 * scaled by one element (lane) of the matching row of src0_4x4:
 *
 *     res.row[i] = sum over k of  src1.row[k] * src0[i * 4 + k]
 *
 * The suffix of each local name lists, in hexadecimal and from highest to
 * lowest, the element indices held in that vector (e.g. _7654 = elements 4..7).
 *
 * Always returns 0.
 */
int opt_matrix_mul( const SRC_T * RESTRICT src0_4x4, const SRC_T * RESTRICT src1_4x4,
                    RES_T * RESTRICT res4x4)
{
    FP32X4 F32X4_IN_0_3210, F32X4_IN_0_7654, F32X4_IN_0_BA98, F32X4_IN_0_FEDC;
    FP32X2 F32X2_IN_0_10, F32X2_IN_0_32, F32X2_IN_0_54, F32X2_IN_0_76, F32X2_IN_0_98,
           F32X2_IN_0_BA, F32X2_IN_0_DC, F32X2_IN_0_FE;
    FP32X4 F32X4_IN_1_3210, F32X4_IN_1_7654, F32X4_IN_1_BA98, F32X4_IN_1_FEDC;
    FP32X4 F32X4_OUT_0_3210, F32X4_OUT_0_7654, F32X4_OUT_0_BA98, F32X4_OUT_0_FEDC;

    /* Load the four rows of the right operand. */
    F32X4_IN_1_3210 = vld1q_f32(&src1_4x4[ 0]);
    F32X4_IN_1_7654 = vld1q_f32(&src1_4x4[ 4]);
    F32X4_IN_1_BA98 = vld1q_f32(&src1_4x4[ 8]);
    F32X4_IN_1_FEDC = vld1q_f32(&src1_4x4[12]);

    /* Load the four rows of the left operand. */
    F32X4_IN_0_3210 = vld1q_f32(&src0_4x4[ 0]);
    F32X4_IN_0_7654 = vld1q_f32(&src0_4x4[ 4]);
    F32X4_IN_0_BA98 = vld1q_f32(&src0_4x4[ 8]);
    F32X4_IN_0_FEDC = vld1q_f32(&src0_4x4[12]);

    /* The *_lane_* intrinsics take their scalar from a 2-element vector,
     * so split every left-operand row into its low and high halves. */
    F32X2_IN_0_10 = vget_low_f32 (F32X4_IN_0_3210);
    F32X2_IN_0_32 = vget_high_f32(F32X4_IN_0_3210);
    F32X2_IN_0_54 = vget_low_f32 (F32X4_IN_0_7654);
    F32X2_IN_0_76 = vget_high_f32(F32X4_IN_0_7654);
    F32X2_IN_0_98 = vget_low_f32 (F32X4_IN_0_BA98);
    F32X2_IN_0_BA = vget_high_f32(F32X4_IN_0_BA98);
    F32X2_IN_0_DC = vget_low_f32 (F32X4_IN_0_FEDC);
    F32X2_IN_0_FE = vget_high_f32(F32X4_IN_0_FEDC);

    /* Result row 0 (elements 0..3). */
    F32X4_OUT_0_3210 = vmulq_lane_f32(                  F32X4_IN_1_3210, F32X2_IN_0_10, 0);
    F32X4_OUT_0_3210 = vmlaq_lane_f32(F32X4_OUT_0_3210, F32X4_IN_1_7654, F32X2_IN_0_10, 1);
    F32X4_OUT_0_3210 = vmlaq_lane_f32(F32X4_OUT_0_3210, F32X4_IN_1_BA98, F32X2_IN_0_32, 0);
    F32X4_OUT_0_3210 = vmlaq_lane_f32(F32X4_OUT_0_3210, F32X4_IN_1_FEDC, F32X2_IN_0_32, 1);

    /* Result row 1 (elements 4..7). */
    F32X4_OUT_0_7654 = vmulq_lane_f32(                  F32X4_IN_1_3210, F32X2_IN_0_54, 0);
    F32X4_OUT_0_7654 = vmlaq_lane_f32(F32X4_OUT_0_7654, F32X4_IN_1_7654, F32X2_IN_0_54, 1);
    F32X4_OUT_0_7654 = vmlaq_lane_f32(F32X4_OUT_0_7654, F32X4_IN_1_BA98, F32X2_IN_0_76, 0);
    F32X4_OUT_0_7654 = vmlaq_lane_f32(F32X4_OUT_0_7654, F32X4_IN_1_FEDC, F32X2_IN_0_76, 1);

    /* Result row 2 (elements 8..11). */
    F32X4_OUT_0_BA98 = vmulq_lane_f32(                  F32X4_IN_1_3210, F32X2_IN_0_98, 0);
    F32X4_OUT_0_BA98 = vmlaq_lane_f32(F32X4_OUT_0_BA98, F32X4_IN_1_7654, F32X2_IN_0_98, 1);
    F32X4_OUT_0_BA98 = vmlaq_lane_f32(F32X4_OUT_0_BA98, F32X4_IN_1_BA98, F32X2_IN_0_BA, 0);
    F32X4_OUT_0_BA98 = vmlaq_lane_f32(F32X4_OUT_0_BA98, F32X4_IN_1_FEDC, F32X2_IN_0_BA, 1);

    /* Result row 3 (elements 12..15). */
    F32X4_OUT_0_FEDC = vmulq_lane_f32(                  F32X4_IN_1_3210, F32X2_IN_0_DC, 0);
    F32X4_OUT_0_FEDC = vmlaq_lane_f32(F32X4_OUT_0_FEDC, F32X4_IN_1_7654, F32X2_IN_0_DC, 1);
    F32X4_OUT_0_FEDC = vmlaq_lane_f32(F32X4_OUT_0_FEDC, F32X4_IN_1_BA98, F32X2_IN_0_FE, 0);
    F32X4_OUT_0_FEDC = vmlaq_lane_f32(F32X4_OUT_0_FEDC, F32X4_IN_1_FEDC, F32X2_IN_0_FE, 1);

    /* Store the four result rows. */
    vst1q_f32(&res4x4[ 0], F32X4_OUT_0_3210);
    vst1q_f32(&res4x4[ 4], F32X4_OUT_0_7654);
    vst1q_f32(&res4x4[ 8], F32X4_OUT_0_BA98);
    vst1q_f32(&res4x4[12], F32X4_OUT_0_FEDC);

    return 0;
}
/*
 * Inline-asm 4x4 float matrix product, written as a run of basic asm
 * statements that address r0, r1 and r2 directly.
 *
 * NOTE(review): the enclosing function header is not shown here. The register
 * roles below assume the same parameter order as the other versions
 * (r0 = src0_4x4, r1 = src1_4x4, r2 = res4x4) - confirm against the header.
 *
 * NOTE(review): none of these asm statements declares operands or clobbers,
 * so the compiler is not told that r0-r2, q0-q3, q8-q15 or memory are
 * read or modified here.
 */
/* load 16 floats through r1 into q8-q11 (presumably src1_4x4; the original
 * label said src0_4x4); "!" advances r1 past the data that was read */
asm ("vld1.32 { q8-q9 }, [r1]! @,");
asm ("vld1.32 { q10-q11 }, [r1]! @,");
/* load 16 floats through r0 into q0-q3 (presumably src0_4x4; the original
 * label said src1_4x4); "!" advances r0 past the data that was read */
asm ("vld1.32 { q0-q1 }, [r0]! @,");
asm ("vld1.32 { q2-q3 }, [r0]! @,");
/* start the four result rows in q12-q15: q8 scaled by lane 0 of q0..q3
 * (d0, d2, d4, d6 are the low halves of q0, q1, q2, q3) */
asm ("vmul.f32 q12, q8, d0[0] @,");
asm ("vmul.f32 q13, q8, d2[0] @,");
asm ("vmul.f32 q14, q8, d4[0] @,");
asm ("vmul.f32 q15, q8, d6[0] @,");
/* accumulate q9 scaled by lane 1 of q0..q3 */
asm ("vmla.f32 q12, q9, d0[1] @,");
asm ("vmla.f32 q13, q9, d2[1] @,");
asm ("vmla.f32 q14, q9, d4[1] @,");
asm ("vmla.f32 q15, q9, d6[1] @,");
/* accumulate q10 scaled by lane 2 of q0..q3
 * (d1, d3, d5, d7 are the high halves of q0, q1, q2, q3) */
asm ("vmla.f32 q12, q10, d1[0] @,");
asm ("vmla.f32 q13, q10, d3[0] @,");
asm ("vmla.f32 q14, q10, d5[0] @,");
asm ("vmla.f32 q15, q10, d7[0] @,");
/* accumulate q11 scaled by lane 3 of q0..q3 */
asm ("vmla.f32 q12, q11, d1[1] @,");
asm ("vmla.f32 q13, q11, d3[1] @,");
asm ("vmla.f32 q14, q11, d5[1] @,");
asm ("vmla.f32 q15, q11, d7[1] @,");
/* store the four result rows q12-q15 through r2 (presumably res4x4);
 * "!" advances r2 past the data that was written */
asm ("vst1.32 { q12-q13 }, [r2]! @,");
asm ("vst1.32 { q14-q15 }, [r2]! @,");
/* put 0 in r0 - intended as the function's return value */
asm ("mov r0, #0 @,");
Here is the generated assembly for each version, in the same order (natural, intrinsic, inline asm).
add r3, r1, ip @ D.14387, p_src_c1, ivtmp.11
add r2, r0, ip @ D.14387, p_src_c0, ivtmp.11
flds s28, [r3, #16] @ D.14385, MEM[base: _575, offset: 16B]
flds s10, [r2, #4] @ D.14385, MEM[base: _576, offset: 4B]
fmuls s7, s10, s28 @ D.14385, D.14385, D.14385
flds s23, [r3] @ D.14385, MEM[base: _575, offset: 0B]
flds s31, [r2] @ D.14385, MEM[base: _576, offset: 0B]
flds s27, [r3, #20] @ D.14385, MEM[base: _575, offset: 20B]
flds s25, [r3, #24] @ D.14385, MEM[base: _575, offset: 24B]
flds s24, [r3, #28] @ D.14385, MEM[base: _575, offset: 28B]
flds s14, [r2, #20] @ D.14385, MEM[base: _576, offset: 20B]
flds s26, [r2, #36] @ D.14385, MEM[base: _576, offset: 36B]
flds s22, [r3, #4] @ D.14385, MEM[base: _575, offset: 4B]
flds s20, [r3, #8] @ D.14385, MEM[base: _575, offset: 8B]
flds s19, [r3, #12] @ D.14385, MEM[base: _575, offset: 12B]
flds s29, [r2, #8] @ D.14385, MEM[base: _576, offset: 8B]
flds s18, [r3, #32] @ D.14385, MEM[base: _575, offset: 32B]
flds s30, [r2, #16] @ D.14385, MEM[base: _576, offset: 16B]
flds s21, [r2, #32] @ D.14385, MEM[base: _576, offset: 32B]
flds s17, [r3, #36] @ D.14385, MEM[base: _575, offset: 36B]
flds s0, [r3, #40] @ D.14385, MEM[base: _575, offset: 40B]
flds s1, [r3, #44] @ D.14385, MEM[base: _575, offset: 44B]
flds s2, [r3, #48] @ D.14385, MEM[base: _575, offset: 48B]
flds s16, [r2, #40] @ D.14385, MEM[base: _576, offset: 40B]
flds s3, [r3, #52] @ D.14385, MEM[base: _575, offset: 52B]
flds s5, [r3, #56] @ D.14385, MEM[base: _575, offset: 56B]
flds s6, [r3, #60] @ D.14385, MEM[base: _575, offset: 60B]
flds s4, [r2, #44] @ D.14385, MEM[base: _576, offset: 44B]
fmacs s7, s31, s23 @ D.14385, D.14385, D.14385
add r3, r5, ip @ D.14386, p_res, ivtmp.11
add r4, r4, #16 @ idx, idx,
add ip, ip, #64 @ ivtmp.11, ivtmp.11,
cmp r4, r6 @ idx, D.14384
fmuls s8, s10, s27 @ D.14385, D.14385, D.14385
fmuls s9, s10, s25 @ D.14385, D.14385, D.14385
fmuls s11, s28, s14 @ D.14385, D.14385, D.14385
fmuls s12, s27, s14 @ D.14385, D.14385, D.14385
fmuls s13, s25, s14 @ D.14385, D.14385, D.14385
fmuls s15, s28, s26 @ D.14385, D.14385, D.14385
fmuls s10, s10, s24 @ D.14385, D.14385, D.14385
fmuls s14, s24, s14 @ D.14385, D.14385, D.14385
fmacs s7, s29, s18 @ D.14385, D.14385, D.14385
fmacs s8, s31, s22 @ D.14385, D.14385, D.14385
fmacs s9, s31, s20 @ D.14385, D.14385, D.14385
fmacs s10, s31, s19 @ D.14385, D.14385, D.14385
fmacs s11, s23, s30 @ D.14385, D.14385, D.14385
fmacs s12, s22, s30 @ D.14385, D.14385, D.14385
fmacs s13, s20, s30 @ D.14385, D.14385, D.14385
fmacs s14, s19, s30 @ D.14385, D.14385, D.14385
fmacs s15, s23, s21 @ D.14385, D.14385, D.14385
fmacs s8, s29, s17 @ D.14385, D.14385, D.14385
fmacs s9, s29, s0 @ D.14385, D.14385, D.14385
fmacs s10, s29, s1 @ D.14385, D.14385, D.14385
flds s29, [r2, #24] @, MEM[base: _576, offset: 24B]
fmacs s15, s18, s16 @ D.14385, D.14385, D.14385
fmacs s11, s18, s29 @ D.14385, D.14385,
fmacs s12, s17, s29 @ D.14385, D.14385,
fmacs s13, s0, s29 @ D.14385, D.14385,
fmacs s14, s1, s29 @ D.14385, D.14385,
flds s29, [r2, #12] @, MEM[base: _576, offset: 12B]
fmacs s7, s29, s2 @ D.14385,, D.14385
fmacs s10, s29, s6 @ D.14385,, D.14385
fsts s7, [r3] @ D.14385, MEM[base: _568, offset: 0B]
flds s7, [r2, #28] @, MEM[base: _576, offset: 28B]
fmacs s15, s2, s4 @ D.14385, D.14385, D.14385
fsts s10, [r3, #12] @ D.14385, MEM[base: _568, offset: 12B]
fmacs s11, s2, s7 @ D.14385, D.14385,
fmacs s12, s3, s7 @ D.14385, D.14385,
fsts s11, [r3, #16] @ D.14385, MEM[base: _568, offset: 16B]
fmacs s14, s6, s7 @ D.14385, D.14385,
fmacs s8, s29, s3 @ D.14385,, D.14385
fmacs s9, s29, s5 @ D.14385,, D.14385
fsts s8, [r3, #4] @ D.14385, MEM[base: _568, offset: 4B]
fsts s9, [r3, #8] @ D.14385, MEM[base: _568, offset: 8B]
fsts s12, [r3, #20] @ D.14385, MEM[base: _568, offset: 20B]
flds s10, [r2, #52] @ D.14385, MEM[base: _576, offset: 52B]
fsts s14, [r3, #28] @ D.14385, MEM[base: _568, offset: 28B]
fsts s15, [r3, #32] @ D.14385, MEM[base: _568, offset: 32B]
fmuls s14, s27, s26 @ D.14385, D.14385, D.14385
flds s11, [r2, #48] @ D.14385, MEM[base: _576, offset: 48B]
flds s12, [r2, #56] @ D.14385, MEM[base: _576, offset: 56B]
fmuls s15, s25, s26 @ D.14385, D.14385, D.14385
fmuls s28, s28, s10 @ D.14385, D.14385, D.14385
fmuls s26, s24, s26 @ D.14385, D.14385, D.14385
fmuls s27, s27, s10 @ D.14385, D.14385, D.14385
fmuls s25, s25, s10 @ D.14385, D.14385, D.14385
fmuls s24, s24, s10 @ D.14385, D.14385, D.14385
fmacs s28, s23, s11 @ D.14385, D.14385, D.14385
fmacs s14, s22, s21 @ D.14385, D.14385, D.14385
fmacs s27, s22, s11 @ D.14385, D.14385, D.14385
fmacs s15, s20, s21 @ D.14385, D.14385, D.14385
fmacs s26, s19, s21 @ D.14385, D.14385, D.14385
fmacs s25, s20, s11 @ D.14385, D.14385, D.14385
fmacs s24, s19, s11 @ D.14385, D.14385, D.14385
fmacs s13, s5, s7 @ D.14385, D.14385,
fmacs s28, s18, s12 @ D.14385, D.14385, D.14385
fsts s13, [r3, #24] @ D.14385, MEM[base: _568, offset: 24B]
flds s13, [r2, #60] @ D.14385, MEM[base: _576, offset: 60B]
fmacs s14, s17, s16 @ D.14385, D.14385, D.14385
fmacs s27, s17, s12 @ D.14385, D.14385, D.14385
fmacs s15, s0, s16 @ D.14385, D.14385, D.14385
fmacs s26, s1, s16 @ D.14385, D.14385, D.14385
fmacs s25, s0, s12 @ D.14385, D.14385, D.14385
fmacs s24, s1, s12 @ D.14385, D.14385, D.14385
fmacs s28, s2, s13 @ D.14385, D.14385, D.14385
fmacs s14, s3, s4 @ D.14385, D.14385, D.14385
fsts s28, [r3, #48] @ D.14385, MEM[base: _568, offset: 48B]
fmacs s27, s3, s13 @ D.14385, D.14385, D.14385
fsts s14, [r3, #36] @ D.14385, MEM[base: _568, offset: 36B]
fmacs s15, s5, s4 @ D.14385, D.14385, D.14385
fsts s27, [r3, #52] @ D.14385, MEM[base: _568, offset: 52B]
fmacs s26, s6, s4 @ D.14385, D.14385, D.14385
fsts s15, [r3, #40] @ D.14385, MEM[base: _568, offset: 40B]
fmacs s25, s5, s13 @ D.14385, D.14385, D.14385
fsts s26, [r3, #44] @ D.14385, MEM[base: _568, offset: 44B]
fmacs s24, s6, s13 @ D.14385, D.14385, D.14385
fsts s25, [r3, #56] @ D.14385, MEM[base: _568, offset: 56B]
fsts s24, [r3, #60] @ D.14385, MEM[base: _568, offset: 60B]
add r4, r0, #32 @ tmp159, src0_4x4,
vld1.32 {d16-d17}, [r3]! @ D.14261, MEM[(const __builtin_neon_sf[4] *)src0_4x4_10(D)]
add r0, r0, #48 @ tmp160, src0_4x4,
vmov d1, d16 @ v2sf @ D.14262, D.14261
add r5, r2, #32 @ tmp162, res4x4,
vld1.32 {d18-d19}, [r3] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_12]
vmov d5, d17 @ v2sf @ D.14262, D.14261
vmov d0, d18 @ v2sf @ D.14262, D.14261
add r3, r1, #32 @ tmp156, src1_4x4,
vld1.32 {d20-d21}, [r4] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_14]
vmov d4, d19 @ v2sf @ D.14262, D.14261
vmov d2, d20 @ v2sf @ D.14262, D.14261
add r1, r1, #48 @ tmp157, src1_4x4,
vld1.32 {d18-d19}, [r0] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_16]
vmov d6, d21 @ v2sf @ D.14262, D.14261
vmov d3, d18 @ v2sf @ D.14262, D.14261
add r4, r2, #48 @ tmp163, res4x4,
vld1.32 {d16-d17}, [ip]! @ D.14261, MEM[(const __builtin_neon_sf[4] *)src1_4x4_2(D)]
vmov d7, d19 @ v2sf @ D.14262, D.14261
mov r0, #0 @,
vmul.f32 q10, q8, d0[0] @ D.14261, D.14261, D.14262,
vld1.32 {d28-d29}, [ip] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_4]
vmul.f32 q11, q8, d1[0] @ D.14261, D.14261, D.14262,
vld1.32 {d26-d27}, [r3] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_6]
vmul.f32 q9, q8, d2[0] @ D.14261, D.14261, D.14262,
vld1.32 {d24-d25}, [r1] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_8]
vmul.f32 q8, q8, d3[0] @ D.14261, D.14261, D.14262,
vmla.f32 q10, q14, d0[1] @ D.14261, D.14261, D.14262,
vmla.f32 q11, q14, d1[1] @ D.14261, D.14261, D.14262,
vmla.f32 q9, q14, d2[1] @ D.14261, D.14261, D.14262,
vmla.f32 q8, q14, d3[1] @ D.14261, D.14261, D.14262,
vmla.f32 q10, q13, d4[0] @ D.14261, D.14261, D.14262,
vmla.f32 q11, q13, d5[0] @ D.14261, D.14261, D.14262,
vmla.f32 q9, q13, d6[0] @ D.14261, D.14261, D.14262,
vmla.f32 q8, q13, d7[0] @ D.14261, D.14261, D.14262,
vmla.f32 q10, q12, d4[1] @ D.14261, D.14261, D.14262,
vmla.f32 q11, q12, d5[1] @ D.14261, D.14261, D.14262,
vmla.f32 q9, q12, d6[1] @ D.14261, D.14261, D.14262,
vmla.f32 q8, q12, d7[1] @ D.14261, D.14261, D.14262,
vst1.32 {d22-d23}, [r2]! @ D.14261, MEM[(__builtin_neon_sf[4] *)res4x4_42(D)]
vst1.32 {d20-d21}, [r2] @ D.14261, MEM[(__builtin_neon_sf[4] *)_44]
vst1.32 {d18-d19}, [r5] @ D.14261, MEM[(__builtin_neon_sf[4] *)_46]
vst1.32 {d16-d17}, [r4] @ D.14261, MEM[(__builtin_neon_sf[4] *)_48]
@ 68 "../src/mat.c" 1
vld1.32 { q8-q9 }, [r1]! @,
@ 0 "" 2
@ 69 "../src/mat.c" 1
vld1.32 { q10-q11 }, [r1]! @,
@ 72 "../src/mat.c" 1
vld1.32 { q0-q1 }, [r0]! @,
@ 73 "../src/mat.c" 1
vld1.32 { q2-q3 }, [r0]! @,
@ 76 "../src/mat.c" 1
vmul.f32 q12, q8, d0[0] @,
@ 77 "../src/mat.c" 1
vmul.f32 q13, q8, d2[0] @,
@ 78 "../src/mat.c" 1
vmul.f32 q14, q8, d4[0] @,
@ 79 "../src/mat.c" 1
vmul.f32 q15, q8, d6[0] @,
@ 81 "../src/mat.c" 1
vmla.f32 q12, q9, d0[1] @,
@ 82 "../src/mat.c" 1
vmla.f32 q13, q9, d2[1] @,
@ 83 "../src/mat.c" 1
vmla.f32 q14, q9, d4[1] @,
@ 84 "../src/mat.c" 1
vmla.f32 q15, q9, d6[1] @,
@ 86 "../src/mat.c" 1
vmla.f32 q12, q10, d1[0] @,
@ 87 "../src/mat.c" 1
vmla.f32 q13, q10, d3[0] @,
@ 88 "../src/mat.c" 1
vmla.f32 q14, q10, d5[0] @,
@ 89 "../src/mat.c" 1
vmla.f32 q15, q10, d7[0] @,
@ 91 "../src/mat.c" 1
vmla.f32 q12, q11, d1[1] @,
@ 92 "../src/mat.c" 1
vmla.f32 q13, q11, d3[1] @,
@ 93 "../src/mat.c" 1
vmla.f32 q14, q11, d5[1] @,
@ 94 "../src/mat.c" 1
vmla.f32 q15, q11, d7[1] @,
@ 97 "../src/mat.c" 1
vst1.32 { q12-q13 }, [r2]! @,
@ 98 "../src/mat.c" 1
vst1.32 { q14-q15 }, [r2]! @,
@ 100 "../src/mat.c" 1