Please note: We are aware of an issue affecting replies on the Arm Community forums, which may not be loading as expected.

We apologize for any inconvenience and appreciate your patience while we investigate and work to resolve the issue.

Thank you for your understanding.


This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

question about arm cortex-a9 neon optimization(4x4 matrix mul)

=======================================

for matrix 4 by 4 multiplication, neon programming is slower than natural code with

auto-vectorization option. (Xilinx Zynq 702 EVM board - Cortex-A9 with gcc compiler options

-mfloat-abi=softfp -mfpu=neon-fp16 -ftree-vectorize)

=========================================

could you tell me what's wrong ?

=========================================

here are 3 versions.

1 - natural code

2 - neon code (intrinsic)

3 - neon inline asm code

=========================================

The computed results of all three versions are identical (they match exactly).

=========================================

I have a problem in speed results

  => natural is faster than neon intrinsic code.

  => intrinsic is faster than inline asm code.

=========================================

natural code needs 119 lines in asm file.

neon code needs 46 lines in asm file.

inline asm code needs 23 lines in asm file.

=========================================

For two 1280x720 source arrays, the results (in timer ticks) are as follows (on the Zynq).

natural - 7160144 tick (100.0%)

intrinsic - 7847183 tick (9.6% ↑)

inline asm - 8720648 tick (21.8%↑)

=========================================

here are c codes for each version.

SRC_T and RES_T are both float.

FP32X4 and FP32X2 are float32x4_t and float32x2_t, respectively.

========================================================

1 - natural code

/*
 * Plain-C 4x4 multiply used as the scalar baseline.
 *
 * For every i, j in 0..3:
 *   res4x4[4*i + j] = src0_4x4[4*i + 0] * src1_4x4[ 0 + j]
 *                   + src0_4x4[4*i + 1] * src1_4x4[ 4 + j]
 *                   + src0_4x4[4*i + 2] * src1_4x4[ 8 + j]
 *                   + src0_4x4[4*i + 3] * src1_4x4[12 + j]
 *
 * The four products are summed left to right and the 16 results are stored
 * in ascending index order.  Always returns 0.
 */
static int nat_matrix_mul( const SRC_T * RESTRICT src0_4x4, const SRC_T * RESTRICT src1_4x4,
                           RES_T * RESTRICT res4x4)
{
    int row;
    int col;

    for (row = 0; row < 4; ++row) {
        /* The four src0 elements shared by every result in this row. */
        const SRC_T *lhs = &src0_4x4[row * 4];

        for (col = 0; col < 4; ++col) {
            /* Walks down one column of src1 with a stride of 4 elements. */
            const SRC_T *rhs = &src1_4x4[col];

            res4x4[row * 4 + col] = (lhs[0] * rhs[0]) + (lhs[1] * rhs[4]) +
                                    (lhs[2] * rhs[8]) + (lhs[3] * rhs[12]);
        }
    }

    return 0;
}

========================================================

2 - neon code (intrinsic)

/*
 * NEON-intrinsic 4x4 multiply.
 *
 * Each output row is built as
 *   src1_row0 * a[0] + src1_row1 * a[1] + src1_row2 * a[2] + src1_row3 * a[3]
 * where a[0..3] are the four elements of the matching src0 row, taken one
 * lane at a time from the low/high halves of that row.
 *
 * Always returns 0.
 */
int opt_matrix_mul( const SRC_T * RESTRICT src0_4x4, const SRC_T * RESTRICT src1_4x4,
                    RES_T * RESTRICT res4x4)
{
    /* The four rows of src1, one 4-lane vector each. */
    const FP32X4 rhs_row0 = vld1q_f32(&src1_4x4[ 0]);
    const FP32X4 rhs_row1 = vld1q_f32(&src1_4x4[ 4]);
    const FP32X4 rhs_row2 = vld1q_f32(&src1_4x4[ 8]);
    const FP32X4 rhs_row3 = vld1q_f32(&src1_4x4[12]);

    /* The four rows of src0. */
    const FP32X4 lhs_row0 = vld1q_f32(&src0_4x4[ 0]);
    const FP32X4 lhs_row1 = vld1q_f32(&src0_4x4[ 4]);
    const FP32X4 lhs_row2 = vld1q_f32(&src0_4x4[ 8]);
    const FP32X4 lhs_row3 = vld1q_f32(&src0_4x4[12]);

    /*
     * The *_lane_f32 intrinsics take their scalar from a 2-lane vector, so
     * each src0 row is split into its low (elements 0,1) and high
     * (elements 2,3) halves.
     */
    const FP32X2 lhs0_lo = vget_low_f32 (lhs_row0);
    const FP32X2 lhs0_hi = vget_high_f32(lhs_row0);
    const FP32X2 lhs1_lo = vget_low_f32 (lhs_row1);
    const FP32X2 lhs1_hi = vget_high_f32(lhs_row1);
    const FP32X2 lhs2_lo = vget_low_f32 (lhs_row2);
    const FP32X2 lhs2_hi = vget_high_f32(lhs_row2);
    const FP32X2 lhs3_lo = vget_low_f32 (lhs_row3);
    const FP32X2 lhs3_hi = vget_high_f32(lhs_row3);

    FP32X4 acc0;
    FP32X4 acc1;
    FP32X4 acc2;
    FP32X4 acc3;

    /* Output row 0. */
    acc0 = vmulq_lane_f32(      rhs_row0, lhs0_lo, 0);
    acc0 = vmlaq_lane_f32(acc0, rhs_row1, lhs0_lo, 1);
    acc0 = vmlaq_lane_f32(acc0, rhs_row2, lhs0_hi, 0);
    acc0 = vmlaq_lane_f32(acc0, rhs_row3, lhs0_hi, 1);

    /* Output row 1. */
    acc1 = vmulq_lane_f32(      rhs_row0, lhs1_lo, 0);
    acc1 = vmlaq_lane_f32(acc1, rhs_row1, lhs1_lo, 1);
    acc1 = vmlaq_lane_f32(acc1, rhs_row2, lhs1_hi, 0);
    acc1 = vmlaq_lane_f32(acc1, rhs_row3, lhs1_hi, 1);

    /* Output row 2. */
    acc2 = vmulq_lane_f32(      rhs_row0, lhs2_lo, 0);
    acc2 = vmlaq_lane_f32(acc2, rhs_row1, lhs2_lo, 1);
    acc2 = vmlaq_lane_f32(acc2, rhs_row2, lhs2_hi, 0);
    acc2 = vmlaq_lane_f32(acc2, rhs_row3, lhs2_hi, 1);

    /* Output row 3. */
    acc3 = vmulq_lane_f32(      rhs_row0, lhs3_lo, 0);
    acc3 = vmlaq_lane_f32(acc3, rhs_row1, lhs3_lo, 1);
    acc3 = vmlaq_lane_f32(acc3, rhs_row2, lhs3_hi, 0);
    acc3 = vmlaq_lane_f32(acc3, rhs_row3, lhs3_hi, 1);

    vst1q_f32(&res4x4[ 0], acc0);
    vst1q_f32(&res4x4[ 4], acc1);
    vst1q_f32(&res4x4[ 8], acc2);
    vst1q_f32(&res4x4[12], acc3);

    return 0;
}

========================================================

3 - neon inline asm code

/*
 * Hand-written NEON 4x4 single-precision multiply:
 *   res4x4[4*i + j] = sum over k of src0_4x4[4*i + k] * src1_4x4[4*k + j]
 *
 * The kernel is a single extended-asm statement rather than a series of
 * basic asm statements.  That way:
 *   - the pointers are passed as operands, so the code no longer assumes the
 *     arguments are still in r0/r1/r2 (which breaks once the function is
 *     inlined or the compiler moves them);
 *   - the post-increment writeback on each pointer is visible to the
 *     compiler ("+r" on local copies);
 *   - every NEON register that is overwritten, and the memory traffic, is
 *     declared in the clobber list;
 *   - the return value comes from a real `return`, not from writing r0
 *     behind the compiler's back.
 *
 * Always returns 0.
 */
int opt_matrix_mul( const SRC_T * RESTRICT src0_4x4, const SRC_T * RESTRICT src1_4x4,
                    RES_T * RESTRICT res4x4)
{
    /* Working copies: the writeback addressing modes below advance these. */
    const SRC_T *lhs = src0_4x4;
    const SRC_T *rhs = src1_4x4;
    RES_T *dst = res4x4;

    __asm__ volatile (
        /* q8..q11 = rows 0..3 of src1 */
        "vld1.32 { q8-q9 },   [%[rhs]]!   \n\t"
        "vld1.32 { q10-q11 }, [%[rhs]]!   \n\t"
        /* q0..q3 = rows 0..3 of src0; their lanes are the scalar operands */
        "vld1.32 { q0-q1 },   [%[lhs]]!   \n\t"
        "vld1.32 { q2-q3 },   [%[lhs]]!   \n\t"
        /* q12..q15 = output rows 0..3: start with src1 row 0 * src0[i][0] */
        "vmul.f32 q12, q8,  d0[0]         \n\t"
        "vmul.f32 q13, q8,  d2[0]         \n\t"
        "vmul.f32 q14, q8,  d4[0]         \n\t"
        "vmul.f32 q15, q8,  d6[0]         \n\t"
        /* += src1 row 1 * src0[i][1] */
        "vmla.f32 q12, q9,  d0[1]         \n\t"
        "vmla.f32 q13, q9,  d2[1]         \n\t"
        "vmla.f32 q14, q9,  d4[1]         \n\t"
        "vmla.f32 q15, q9,  d6[1]         \n\t"
        /* += src1 row 2 * src0[i][2] */
        "vmla.f32 q12, q10, d1[0]         \n\t"
        "vmla.f32 q13, q10, d3[0]         \n\t"
        "vmla.f32 q14, q10, d5[0]         \n\t"
        "vmla.f32 q15, q10, d7[0]         \n\t"
        /* += src1 row 3 * src0[i][3] */
        "vmla.f32 q12, q11, d1[1]         \n\t"
        "vmla.f32 q13, q11, d3[1]         \n\t"
        "vmla.f32 q14, q11, d5[1]         \n\t"
        "vmla.f32 q15, q11, d7[1]         \n\t"
        /* store the four output rows */
        "vst1.32 { q12-q13 }, [%[dst]]!   \n\t"
        "vst1.32 { q14-q15 }, [%[dst]]!   \n\t"
        : [lhs] "+r" (lhs), [rhs] "+r" (rhs), [dst] "+r" (dst)
        :
        : "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
          "memory"
    );

    return 0;
}

=========================================

here are asm codes for each version

========================================================

1 - natural code

add r3, r1, ip @ D.14387, p_src_c1, ivtmp.11

add r2, r0, ip @ D.14387, p_src_c0, ivtmp.11

flds s28, [r3, #16] @ D.14385, MEM[base: _575, offset: 16B]

flds s10, [r2, #4] @ D.14385, MEM[base: _576, offset: 4B]

fmuls s7, s10, s28 @ D.14385, D.14385, D.14385

flds s23, [r3] @ D.14385, MEM[base: _575, offset: 0B]

flds s31, [r2] @ D.14385, MEM[base: _576, offset: 0B]

flds s27, [r3, #20] @ D.14385, MEM[base: _575, offset: 20B]

flds s25, [r3, #24] @ D.14385, MEM[base: _575, offset: 24B]

flds s24, [r3, #28] @ D.14385, MEM[base: _575, offset: 28B]

flds s14, [r2, #20] @ D.14385, MEM[base: _576, offset: 20B]

flds s26, [r2, #36] @ D.14385, MEM[base: _576, offset: 36B]

flds s22, [r3, #4] @ D.14385, MEM[base: _575, offset: 4B]

flds s20, [r3, #8] @ D.14385, MEM[base: _575, offset: 8B]

flds s19, [r3, #12] @ D.14385, MEM[base: _575, offset: 12B]

flds s29, [r2, #8] @ D.14385, MEM[base: _576, offset: 8B]

flds s18, [r3, #32] @ D.14385, MEM[base: _575, offset: 32B]

flds s30, [r2, #16] @ D.14385, MEM[base: _576, offset: 16B]

flds s21, [r2, #32] @ D.14385, MEM[base: _576, offset: 32B]

flds s17, [r3, #36] @ D.14385, MEM[base: _575, offset: 36B]

flds s0, [r3, #40] @ D.14385, MEM[base: _575, offset: 40B]

flds s1, [r3, #44] @ D.14385, MEM[base: _575, offset: 44B]

flds s2, [r3, #48] @ D.14385, MEM[base: _575, offset: 48B]

flds s16, [r2, #40] @ D.14385, MEM[base: _576, offset: 40B]

flds s3, [r3, #52] @ D.14385, MEM[base: _575, offset: 52B]

flds s5, [r3, #56] @ D.14385, MEM[base: _575, offset: 56B]

flds s6, [r3, #60] @ D.14385, MEM[base: _575, offset: 60B]

flds s4, [r2, #44] @ D.14385, MEM[base: _576, offset: 44B]

fmacs s7, s31, s23 @ D.14385, D.14385, D.14385

add r3, r5, ip @ D.14386, p_res, ivtmp.11

add r4, r4, #16 @ idx, idx,

add ip, ip, #64 @ ivtmp.11, ivtmp.11,

cmp r4, r6 @ idx, D.14384

fmuls s8, s10, s27 @ D.14385, D.14385, D.14385

fmuls s9, s10, s25 @ D.14385, D.14385, D.14385

fmuls s11, s28, s14 @ D.14385, D.14385, D.14385

fmuls s12, s27, s14 @ D.14385, D.14385, D.14385

fmuls s13, s25, s14 @ D.14385, D.14385, D.14385

fmuls s15, s28, s26 @ D.14385, D.14385, D.14385

fmuls s10, s10, s24 @ D.14385, D.14385, D.14385

fmuls s14, s24, s14 @ D.14385, D.14385, D.14385

fmacs s7, s29, s18 @ D.14385, D.14385, D.14385

fmacs s8, s31, s22 @ D.14385, D.14385, D.14385

fmacs s9, s31, s20 @ D.14385, D.14385, D.14385

fmacs s10, s31, s19 @ D.14385, D.14385, D.14385

fmacs s11, s23, s30 @ D.14385, D.14385, D.14385

fmacs s12, s22, s30 @ D.14385, D.14385, D.14385

fmacs s13, s20, s30 @ D.14385, D.14385, D.14385

fmacs s14, s19, s30 @ D.14385, D.14385, D.14385

fmacs s15, s23, s21 @ D.14385, D.14385, D.14385

fmacs s8, s29, s17 @ D.14385, D.14385, D.14385

fmacs s9, s29, s0 @ D.14385, D.14385, D.14385

fmacs s10, s29, s1 @ D.14385, D.14385, D.14385

flds s29, [r2, #24] @, MEM[base: _576, offset: 24B]

fmacs s15, s18, s16 @ D.14385, D.14385, D.14385

fmacs s11, s18, s29 @ D.14385, D.14385,

fmacs s12, s17, s29 @ D.14385, D.14385,

fmacs s13, s0, s29 @ D.14385, D.14385,

fmacs s14, s1, s29 @ D.14385, D.14385,

flds s29, [r2, #12] @, MEM[base: _576, offset: 12B]

fmacs s7, s29, s2 @ D.14385,, D.14385

fmacs s10, s29, s6 @ D.14385,, D.14385

fsts s7, [r3] @ D.14385, MEM[base: _568, offset: 0B]

flds s7, [r2, #28] @, MEM[base: _576, offset: 28B]

fmacs s15, s2, s4 @ D.14385, D.14385, D.14385

fsts s10, [r3, #12] @ D.14385, MEM[base: _568, offset: 12B]

fmacs s11, s2, s7 @ D.14385, D.14385,

fmacs s12, s3, s7 @ D.14385, D.14385,

fsts s11, [r3, #16] @ D.14385, MEM[base: _568, offset: 16B]

fmacs s14, s6, s7 @ D.14385, D.14385,

fmacs s8, s29, s3 @ D.14385,, D.14385

fmacs s9, s29, s5 @ D.14385,, D.14385

fsts s8, [r3, #4] @ D.14385, MEM[base: _568, offset: 4B]

fsts s9, [r3, #8] @ D.14385, MEM[base: _568, offset: 8B]

fsts s12, [r3, #20] @ D.14385, MEM[base: _568, offset: 20B]

flds s10, [r2, #52] @ D.14385, MEM[base: _576, offset: 52B]

fsts s14, [r3, #28] @ D.14385, MEM[base: _568, offset: 28B]

fsts s15, [r3, #32] @ D.14385, MEM[base: _568, offset: 32B]

fmuls s14, s27, s26 @ D.14385, D.14385, D.14385

flds s11, [r2, #48] @ D.14385, MEM[base: _576, offset: 48B]

flds s12, [r2, #56] @ D.14385, MEM[base: _576, offset: 56B]

fmuls s15, s25, s26 @ D.14385, D.14385, D.14385

fmuls s28, s28, s10 @ D.14385, D.14385, D.14385

fmuls s26, s24, s26 @ D.14385, D.14385, D.14385

fmuls s27, s27, s10 @ D.14385, D.14385, D.14385

fmuls s25, s25, s10 @ D.14385, D.14385, D.14385

fmuls s24, s24, s10 @ D.14385, D.14385, D.14385

fmacs s28, s23, s11 @ D.14385, D.14385, D.14385

fmacs s14, s22, s21 @ D.14385, D.14385, D.14385

fmacs s27, s22, s11 @ D.14385, D.14385, D.14385

fmacs s15, s20, s21 @ D.14385, D.14385, D.14385

fmacs s26, s19, s21 @ D.14385, D.14385, D.14385

fmacs s25, s20, s11 @ D.14385, D.14385, D.14385

fmacs s24, s19, s11 @ D.14385, D.14385, D.14385

fmacs s13, s5, s7 @ D.14385, D.14385,

fmacs s28, s18, s12 @ D.14385, D.14385, D.14385

fsts s13, [r3, #24] @ D.14385, MEM[base: _568, offset: 24B]

flds s13, [r2, #60] @ D.14385, MEM[base: _576, offset: 60B]

fmacs s14, s17, s16 @ D.14385, D.14385, D.14385

fmacs s27, s17, s12 @ D.14385, D.14385, D.14385

fmacs s15, s0, s16 @ D.14385, D.14385, D.14385

fmacs s26, s1, s16 @ D.14385, D.14385, D.14385

fmacs s25, s0, s12 @ D.14385, D.14385, D.14385

fmacs s24, s1, s12 @ D.14385, D.14385, D.14385

fmacs s28, s2, s13 @ D.14385, D.14385, D.14385

fmacs s14, s3, s4 @ D.14385, D.14385, D.14385

fsts s28, [r3, #48] @ D.14385, MEM[base: _568, offset: 48B]

fmacs s27, s3, s13 @ D.14385, D.14385, D.14385

fsts s14, [r3, #36] @ D.14385, MEM[base: _568, offset: 36B]

fmacs s15, s5, s4 @ D.14385, D.14385, D.14385

fsts s27, [r3, #52] @ D.14385, MEM[base: _568, offset: 52B]

fmacs s26, s6, s4 @ D.14385, D.14385, D.14385

fsts s15, [r3, #40] @ D.14385, MEM[base: _568, offset: 40B]

fmacs s25, s5, s13 @ D.14385, D.14385, D.14385

fsts s26, [r3, #44] @ D.14385, MEM[base: _568, offset: 44B]

fmacs s24, s6, s13 @ D.14385, D.14385, D.14385

fsts s25, [r3, #56] @ D.14385, MEM[base: _568, offset: 56B]

fsts s24, [r3, #60] @ D.14385, MEM[base: _568, offset: 60B]

========================================================

2 - neon code (intrinsic)

add r4, r0, #32 @ tmp159, src0_4x4,

vld1.32 {d16-d17}, [r3]! @ D.14261, MEM[(const __builtin_neon_sf[4] *)src0_4x4_10(D)]

add r0, r0, #48 @ tmp160, src0_4x4,

vmov d1, d16 @ v2sf @ D.14262, D.14261

add r5, r2, #32 @ tmp162, res4x4,

vld1.32 {d18-d19}, [r3] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_12]

vmov d5, d17 @ v2sf @ D.14262, D.14261

vmov d0, d18 @ v2sf @ D.14262, D.14261

add r3, r1, #32 @ tmp156, src1_4x4,

vld1.32 {d20-d21}, [r4] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_14]

vmov d4, d19 @ v2sf @ D.14262, D.14261

vmov d2, d20 @ v2sf @ D.14262, D.14261

add r1, r1, #48 @ tmp157, src1_4x4,

vld1.32 {d18-d19}, [r0] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_16]

vmov d6, d21 @ v2sf @ D.14262, D.14261

vmov d3, d18 @ v2sf @ D.14262, D.14261

add r4, r2, #48 @ tmp163, res4x4,

vld1.32 {d16-d17}, [ip]! @ D.14261, MEM[(const __builtin_neon_sf[4] *)src1_4x4_2(D)]

vmov d7, d19 @ v2sf @ D.14262, D.14261

mov r0, #0 @,

vmul.f32 q10, q8, d0[0] @ D.14261, D.14261, D.14262,

vld1.32 {d28-d29}, [ip] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_4]

vmul.f32 q11, q8, d1[0] @ D.14261, D.14261, D.14262,

vld1.32 {d26-d27}, [r3] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_6]

vmul.f32 q9, q8, d2[0] @ D.14261, D.14261, D.14262,

vld1.32 {d24-d25}, [r1] @ D.14261, MEM[(const __builtin_neon_sf[4] *)_8]

vmul.f32 q8, q8, d3[0] @ D.14261, D.14261, D.14262,

vmla.f32 q10, q14, d0[1] @ D.14261, D.14261, D.14262,

vmla.f32 q11, q14, d1[1] @ D.14261, D.14261, D.14262,

vmla.f32 q9, q14, d2[1] @ D.14261, D.14261, D.14262,

vmla.f32 q8, q14, d3[1] @ D.14261, D.14261, D.14262,

vmla.f32 q10, q13, d4[0] @ D.14261, D.14261, D.14262,

vmla.f32 q11, q13, d5[0] @ D.14261, D.14261, D.14262,

vmla.f32 q9, q13, d6[0] @ D.14261, D.14261, D.14262,

vmla.f32 q8, q13, d7[0] @ D.14261, D.14261, D.14262,

vmla.f32 q10, q12, d4[1] @ D.14261, D.14261, D.14262,

vmla.f32 q11, q12, d5[1] @ D.14261, D.14261, D.14262,

vmla.f32 q9, q12, d6[1] @ D.14261, D.14261, D.14262,

vmla.f32 q8, q12, d7[1] @ D.14261, D.14261, D.14262,

vst1.32 {d22-d23}, [r2]! @ D.14261, MEM[(__builtin_neon_sf[4] *)res4x4_42(D)]

vst1.32 {d20-d21}, [r2] @ D.14261, MEM[(__builtin_neon_sf[4] *)_44]

vst1.32 {d18-d19}, [r5] @ D.14261, MEM[(__builtin_neon_sf[4] *)_46]

vst1.32 {d16-d17}, [r4] @ D.14261, MEM[(__builtin_neon_sf[4] *)_48]

========================================================

3 - neon inline asm code

@ 68 "../src/mat.c" 1

vld1.32 { q8-q9 }, [r1]! @,

@ 0 "" 2

@ 69 "../src/mat.c" 1

vld1.32 { q10-q11 }, [r1]! @,

@ 0 "" 2

@ 72 "../src/mat.c" 1

vld1.32 { q0-q1 }, [r0]! @,

@ 0 "" 2

@ 73 "../src/mat.c" 1

vld1.32 { q2-q3 }, [r0]! @,

@ 0 "" 2

@ 76 "../src/mat.c" 1

vmul.f32 q12, q8, d0[0] @,

@ 0 "" 2

@ 77 "../src/mat.c" 1

vmul.f32 q13, q8, d2[0] @,

@ 0 "" 2

@ 78 "../src/mat.c" 1

vmul.f32 q14, q8, d4[0] @,

@ 0 "" 2

@ 79 "../src/mat.c" 1

vmul.f32 q15, q8, d6[0] @,

@ 0 "" 2

@ 81 "../src/mat.c" 1

vmla.f32 q12, q9, d0[1] @,

@ 0 "" 2

@ 82 "../src/mat.c" 1

vmla.f32 q13, q9, d2[1] @,

@ 0 "" 2

@ 83 "../src/mat.c" 1

vmla.f32 q14, q9, d4[1] @,

@ 0 "" 2

@ 84 "../src/mat.c" 1

vmla.f32 q15, q9, d6[1] @,

@ 0 "" 2

@ 86 "../src/mat.c" 1

vmla.f32 q12, q10, d1[0] @,

@ 0 "" 2

@ 87 "../src/mat.c" 1

vmla.f32 q13, q10, d3[0] @,

@ 0 "" 2

@ 88 "../src/mat.c" 1

vmla.f32 q14, q10, d5[0] @,

@ 0 "" 2

@ 89 "../src/mat.c" 1

vmla.f32 q15, q10, d7[0] @,

@ 0 "" 2

@ 91 "../src/mat.c" 1

vmla.f32 q12, q11, d1[1] @,

@ 0 "" 2

@ 92 "../src/mat.c" 1

vmla.f32 q13, q11, d3[1] @,

@ 0 "" 2

@ 93 "../src/mat.c" 1

vmla.f32 q14, q11, d5[1] @,

@ 0 "" 2

@ 94 "../src/mat.c" 1

vmla.f32 q15, q11, d7[1] @,

@ 0 "" 2

@ 97 "../src/mat.c" 1

vst1.32 { q12-q13 }, [r2]! @,

@ 0 "" 2

@ 98 "../src/mat.c" 1

vst1.32 { q14-q15 }, [r2]! @,

@ 0 "" 2

@ 100 "../src/mat.c" 1

mov r0, #0 @,

Parents
  • Hello,

    I also measured the execution time of each assembler version on my Cortex-A9 (Renesas RZ/A1L) board.

    The below are the results.

    1 - natural code                  60 ticks

    2 - neon code (intrinsic)     34 ticks

    3 - neon inline asm code    38 ticks

    Regarding "2 - neon code", because r3 is never initialized in the posted listing, the CPU raised an exception.

    I guess the compiled code of "2 - neon code (intrinsic)" would be wrong.

    So I changed it as follows, but I am not sure whether it is correct.

    @ 4x4 float multiply, stand-alone form of the intrinsic version's code.
    @ Entry: r0 = src0_4x4, r1 = src1_4x4, r2 = res4x4.  Returns 0 in r0.
    @ Only r0-r3 and ip are used as address registers, so nothing
    @ callee-saved has to be pushed.

    @ --- src0: spread the four rows over d0-d7 (the scalar operands) ---
    mov     r3, r0                  @ r3 walks src0 rows 0 and 1
    vld1.32 {d16-d17}, [r3]!        @ src0 row 0
    vld1.32 {d18-d19}, [r3]         @ src0 row 1
    vmov    d1, d16                 @ d1 = src0[0..1]
    vmov    d5, d17                 @ d5 = src0[2..3]
    vmov    d0, d18                 @ d0 = src0[4..5]
    vmov    d4, d19                 @ d4 = src0[6..7]

    add     r3, r0, #32
    add     ip, r0, #48
    vld1.32 {d20-d21}, [r3]         @ src0 row 2
    vld1.32 {d18-d19}, [ip]         @ src0 row 3
    vmov    d2, d20                 @ d2 = src0[8..9]
    vmov    d6, d21                 @ d6 = src0[10..11]
    vmov    d3, d18                 @ d3 = src0[12..13]
    vmov    d7, d19                 @ d7 = src0[14..15]

    @ --- src1: rows 0..3 into q8, q14, q13, q12 ---
    mov     r3, r1
    add     ip, r1, #32
    add     r1, r1, #48
    vld1.32 {d16-d17}, [r3]!        @ q8  = src1 row 0
    vld1.32 {d28-d29}, [r3]         @ q14 = src1 row 1
    vld1.32 {d26-d27}, [ip]         @ q13 = src1 row 2
    vld1.32 {d24-d25}, [r1]         @ q12 = src1 row 3

    @ --- q11, q10, q9, q8 = result rows 0, 1, 2, 3 ---
    vmul.f32 q10, q8, d0[0]
    vmul.f32 q11, q8, d1[0]
    vmul.f32 q9, q8, d2[0]
    vmul.f32 q8, q8, d3[0]          @ last use of src1 row 0, so q8 can be reused
    vmla.f32 q10, q14, d0[1]
    vmla.f32 q11, q14, d1[1]
    vmla.f32 q9, q14, d2[1]
    vmla.f32 q8, q14, d3[1]
    vmla.f32 q10, q13, d4[0]
    vmla.f32 q11, q13, d5[0]
    vmla.f32 q9, q13, d6[0]
    vmla.f32 q8, q13, d7[0]
    vmla.f32 q10, q12, d4[1]
    vmla.f32 q11, q12, d5[1]
    vmla.f32 q9, q12, d6[1]
    vmla.f32 q8, q12, d7[1]

    @ --- store; row 2/3 addresses are taken before r2 is post-incremented ---
    add     r3, r2, #32
    add     ip, r2, #48
    vst1.32 {d22-d23}, [r2]!        @ result row 0
    vst1.32 {d20-d21}, [r2]         @ result row 1
    vst1.32 {d18-d19}, [r3]         @ result row 2
    vst1.32 {d16-d17}, [ip]         @ result row 3

    mov     r0, #0
    bx      lr
    
    
    

    Anyway, I think the results would are compromising.

    Best regards,

    Yasuhiko Koumoto.

Reply
  • Hello,

    I also measured the execution time of each assembler version on my Cortex-A9 (Renesas RZ/A1L) board.

    The below are the results.

    1 - natural code                  60 ticks

    2 - neon code (intrinsic)     34 ticks

    3 - neon inline asm code    38 ticks

    Regarding "2 - neon code", because r3 is never initialized in the posted listing, the CPU raised an exception.

    I guess the compiled code of "2 - neon code (intrinsic)" would be wrong.

    So I changed it as follows, but I am not sure whether it is correct.

    @ 4x4 float multiply, stand-alone form of the intrinsic version's code.
    @ Entry: r0 = src0_4x4, r1 = src1_4x4, r2 = res4x4.  Returns 0 in r0.
    @ Only r0-r3 and ip are used as address registers, so nothing
    @ callee-saved has to be pushed.

    @ --- src0: spread the four rows over d0-d7 (the scalar operands) ---
    mov     r3, r0                  @ r3 walks src0 rows 0 and 1
    vld1.32 {d16-d17}, [r3]!        @ src0 row 0
    vld1.32 {d18-d19}, [r3]         @ src0 row 1
    vmov    d1, d16                 @ d1 = src0[0..1]
    vmov    d5, d17                 @ d5 = src0[2..3]
    vmov    d0, d18                 @ d0 = src0[4..5]
    vmov    d4, d19                 @ d4 = src0[6..7]

    add     r3, r0, #32
    add     ip, r0, #48
    vld1.32 {d20-d21}, [r3]         @ src0 row 2
    vld1.32 {d18-d19}, [ip]         @ src0 row 3
    vmov    d2, d20                 @ d2 = src0[8..9]
    vmov    d6, d21                 @ d6 = src0[10..11]
    vmov    d3, d18                 @ d3 = src0[12..13]
    vmov    d7, d19                 @ d7 = src0[14..15]

    @ --- src1: rows 0..3 into q8, q14, q13, q12 ---
    mov     r3, r1
    add     ip, r1, #32
    add     r1, r1, #48
    vld1.32 {d16-d17}, [r3]!        @ q8  = src1 row 0
    vld1.32 {d28-d29}, [r3]         @ q14 = src1 row 1
    vld1.32 {d26-d27}, [ip]         @ q13 = src1 row 2
    vld1.32 {d24-d25}, [r1]         @ q12 = src1 row 3

    @ --- q11, q10, q9, q8 = result rows 0, 1, 2, 3 ---
    vmul.f32 q10, q8, d0[0]
    vmul.f32 q11, q8, d1[0]
    vmul.f32 q9, q8, d2[0]
    vmul.f32 q8, q8, d3[0]          @ last use of src1 row 0, so q8 can be reused
    vmla.f32 q10, q14, d0[1]
    vmla.f32 q11, q14, d1[1]
    vmla.f32 q9, q14, d2[1]
    vmla.f32 q8, q14, d3[1]
    vmla.f32 q10, q13, d4[0]
    vmla.f32 q11, q13, d5[0]
    vmla.f32 q9, q13, d6[0]
    vmla.f32 q8, q13, d7[0]
    vmla.f32 q10, q12, d4[1]
    vmla.f32 q11, q12, d5[1]
    vmla.f32 q9, q12, d6[1]
    vmla.f32 q8, q12, d7[1]

    @ --- store; row 2/3 addresses are taken before r2 is post-incremented ---
    add     r3, r2, #32
    add     ip, r2, #48
    vst1.32 {d22-d23}, [r2]!        @ result row 0
    vst1.32 {d20-d21}, [r2]         @ result row 1
    vst1.32 {d18-d19}, [r3]         @ result row 2
    vst1.32 {d16-d17}, [ip]         @ result row 3

    mov     r0, #0
    bx      lr
    
    
    

    Anyway, I think the results would are compromising.

    Best regards,

    Yasuhiko Koumoto.

Children
No data