Hi,
I am using the Xilinx SDK 2019.1 IDE and running my application bare metal on an ARM Cortex-A53 processor, which has NEON and floating-point support available.
The problem I am facing is that I cannot make sense of the disassembly of the NEON intrinsic functions in my code at the highest optimization level, i.e. -O3.
The following code is just an example. My original code uses the same intrinsic functions, but I am not getting any performance boost compared to my original C code. In this example, I take two floating-point arrays of 16 elements each as input, multiply each 4-element chunk of array A with the corresponding chunk of array B, and store the result in array C. All of the variables used are local.
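For reference, what my original plain C code does is essentially the following. This is only a simplified sketch for this example, not my exact code:

#include <stddef.h>

// plain C reference version (a sketch): element-wise multiply of two
// 16-element float arrays, the operation the NEON version below replaces
void multiply_c(const float *A, const float *B, float *C)
{
    for (size_t i = 0; i < 16; i++) {
        C[i] = A[i] * B[i];
    }
}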
The NEON intrinsics version of my code is:

#include <arm_neon.h>

// initialized arrays
float A[16] = {1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0};
float B[16] = {1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0, 1.0,2.0,3.0,4.0};
float C[16];

// function definition
void multiply_4x4_neon(float *A, float *B, float *C)
{
    // these are the columns of A
    float32x4_t A0, A1, A2, A3;
    float32x4_t B0, B1, B2, B3;
    float32x4_t C0, C1, C2, C3;

    C0 = vmovq_n_f32(0);
    C1 = vmovq_n_f32(0);
    C2 = vmovq_n_f32(0);
    C3 = vmovq_n_f32(0);

    A0 = vld1q_f32(A);
    B0 = vld1q_f32(B);
    C0 = vmlaq_f32(C0, A0, B0);
    vst1q_f32(C, C0);

    A1 = vld1q_f32(A + 4);
    B1 = vld1q_f32(B + 4);
    C1 = vmlaq_f32(C1, A1, B1);
    vst1q_f32(C + 4, C1);

    A2 = vld1q_f32(A + 8);
    B2 = vld1q_f32(B + 8);
    C2 = vmlaq_f32(C2, A2, B2);
    vst1q_f32(C + 8, C2);

    A3 = vld1q_f32(A + 12);
    B3 = vld1q_f32(B + 12);
    C3 = vmlaq_f32(C3, A3, B3);
    vst1q_f32(C + 12, C3);
}
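For completeness, this is roughly how the function is called in my test; the main() wrapper here is only a sketch for illustration, using the file-scope arrays defined above:

// hypothetical call site (a sketch, not my exact benchmark code)
int main(void)
{
    multiply_4x4_neon(A, B, C);
    return 0;
}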
The disassembly of the above code at the -O3 optimization level is the following:
00000000000251e0 <multiply_4x4_neon>:
  return a + b * c;
   251e0:	4f000400 	movi	v0.4s, #0x0
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   251e4:	3dc00001 	ldr	q1, [x0]
   251e8:	3dc00022 	ldr	q2, [x1]
  return a + b * c;
   251ec:	4ea01c03 	mov	v3.16b, v0.16b
   251f0:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32 (float32_t *a, float32x4_t b)
{
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   251f4:	3d800043 	str	q3, [x2]
  return a + b * c;
   251f8:	4ea01c03 	mov	v3.16b, v0.16b
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   251fc:	3dc00401 	ldr	q1, [x0, #16]
   25200:	3dc00422 	ldr	q2, [x1, #16]
  return a + b * c;
   25204:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   25208:	3d800443 	str	q3, [x2, #16]
  return a + b * c;
   2520c:	4ea01c03 	mov	v3.16b, v0.16b
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   25210:	3dc00802 	ldr	q2, [x0, #32]
   25214:	3dc00821 	ldr	q1, [x1, #32]
  return a + b * c;
   25218:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   2521c:	3d800843 	str	q3, [x2, #32]
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   25220:	3dc00c02 	ldr	q2, [x0, #48]
   25224:	3dc00c21 	ldr	q1, [x1, #48]
  return a + b * c;
   25228:	4e21cc40 	fmla	v0.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   2522c:	3d800c40 	str	q0, [x2, #48]
  A3 = vld1q_f32(A+12);
  B3 = vld1q_f32(B+12);
  C3 = vmlaq_f32(C3,A3, B3);
  vst1q_f32(C+12, C3);
}
Regarding the compiler settings in the IDE:
I am not passing any extra compiler options for optimization. I am unable to specify the -mfpu=neon compiler option (the compiler does not recognize it), but from the disassembly it looks to me as if the code is running on the NEON engine, because I can see vector instructions. So please also confirm whether the code is actually running on the NEON engine.
I am also not telling the compiler to use hardware floating-point linkage. For example, if I add -mfloat-abi=hard to the compiler settings, the compiler does not recognize it either. So how can I tell the compiler to use hardware floating-point linkage?
I also could not understand why the body of the intrinsic function vst1q_f32 (float32_t *a, float32x4_t b) appears in the middle of the disassembly.
I know that at the highest optimization level the compiler reorders and interleaves instructions, which makes the listing harder to follow.
Could someone please help me clear up these confusions, so that I can understand the disassembly and optimize the code further?