Problem in understanding behaviour of GCC compiler (aarch64-none-elf-gcc) on Neon intrinsics for ARM cortex a53

Hi,

I am using IDE Xilinx SDK 2019.1 for my application and running it on ARM cortex a53  processor with Neon and floating point engine support available. I am working on a bare metal application.

The problem I am facing is that, I am unable to understand the disassembly of neon intrinsics functions in my code at highest level optimization i.e O3.

The following code is just for an example. My original code is using the same intrinsics functions but I am not achieving any performance boost as compared to my original C code. In this code, I am giving as an input two floating point arrays of each 16 elements and then multiplying each 4 elements chunk of array A with array B and storing its result in array C. All of the used variables are local.

The Neon intrinsics version of my code is:

// initialized arrays

float A[16]= {1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0
};

float B[16] = {1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0,
1.0,2.0,3.0,4.0
};
float C[16];

//function definition

#include <arm_neon.h>

void multiply_4x4_neon(float *A, float *B,float *C) {
// these are the columns A
float32x4_t A0;
float32x4_t A1;
float32x4_t A2;
float32x4_t A3;

float32x4_t B0;
float32x4_t B1;
float32x4_t B2;
float32x4_t B3;

float32x4_t C0;
float32x4_t C1;
float32x4_t C2;
float32x4_t C3;

C0 = vmovq_n_f32(0);
C1 = vmovq_n_f32(0);
C2 = vmovq_n_f32(0);
C3 = vmovq_n_f32(0);

A0 = vld1q_f32(A);
B0 = vld1q_f32(B);
C0 = vmlaq_f32(C0,A0, B0);
vst1q_f32(C, C0);

A1 = vld1q_f32(A+4);
B1 = vld1q_f32(B+4);
C1 = vmlaq_f32(C1,A1, B1);
vst1q_f32(C+4, C1);

A2 = vld1q_f32(A+8);
B2 = vld1q_f32(B+8);
C2 = vmlaq_f32(C2,A2, B2);
vst1q_f32(C+8, C2);

A3 = vld1q_f32(A+12);
B3 = vld1q_f32(B+12);
C3 = vmlaq_f32(C3,A3, B3);
vst1q_f32(C+12, C3);

}

The assembly of above code at O3 optimization level is the following:

00000000000251e0 <multiply_4x4_neon>:
  return a + b * c;
   251e0:	4f000400 	movi	v0.4s, #0x0
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   251e4:	3dc00001 	ldr	q1, [x0]
   251e8:	3dc00022 	ldr	q2, [x1]
  return a + b * c;
   251ec:	4ea01c03 	mov	v3.16b, v0.16b
   251f0:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32 (float32_t *a, float32x4_t b)
{
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   251f4:	3d800043 	str	q3, [x2]
  return a + b * c;
   251f8:	4ea01c03 	mov	v3.16b, v0.16b
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   251fc:	3dc00401 	ldr	q1, [x0, #16]
   25200:	3dc00422 	ldr	q2, [x1, #16]
  return a + b * c;
   25204:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   25208:	3d800443 	str	q3, [x2, #16]
  return a + b * c;
   2520c:	4ea01c03 	mov	v3.16b, v0.16b
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   25210:	3dc00802 	ldr	q2, [x0, #32]
   25214:	3dc00821 	ldr	q1, [x1, #32]
  return a + b * c;
   25218:	4e21cc43 	fmla	v3.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   2521c:	3d800843 	str	q3, [x2, #32]
  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
   25220:	3dc00c02 	ldr	q2, [x0, #48]
   25224:	3dc00c21 	ldr	q1, [x1, #48]
  return a + b * c;
   25228:	4e21cc40 	fmla	v0.4s, v2.4s, v1.4s
  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
   2522c:	3d800c40 	str	q0, [x2, #48]
	A3 = vld1q_f32(A+12);
	B3 = vld1q_f32(B+12);
	C3 = vmlaq_f32(C3,A3, B3);
	vst1q_f32(C+12, C3);

}

The setting of compiler on IDE is:

I am not using any compiler option for optimization. I am unable to specify -mfpu=neon compiler option(because compiler is not recognizing it) but from the disassembly of code, it seems to me that it is running on Neon engine because I can see vector instructions in disassembly. So, please also confirm that either code is running on Neon engine?

I am not telling the compiler to use hardware linkages . For example if I use -mfloat-abi=hard in optimization setting of compiler, the compiler is not recognizing it. So, how can I tell the compiler to use hardware linkages?

I could not understand why there is a function body of intrinsic function vst1q_f32 (float32_t *a, float32x4_t b)  starting in the middle of assembly code.

I know that at highest optimization level, the compiler is somehow jumping around the instructions.

Could someone please help me on these confusions so that I can understand disassembly of code and  I can further optimize it?

More questions in this forum