This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Neon代码O2反汇编的一些疑惑

最近在学习neon_programmer_guide,其中有一个demo如下

/*
 * Demo loop: for every index i below (n & ~3), store pb[i] + x into pa[i].
 *
 * pa: destination array (restrict-qualified); written at indices 0 .. (n & ~3) - 1.
 * pb: source array (restrict-qualified); read at the same indices.
 * n:  requested element count; the loop bound masks off its two low bits.
 * x:  value added to each element read from pb.
 */
void add_int (int* restrict pa, int* restrict pb, unsigned int n, int x)
{
unsigned int i;
/* n & ~3 clears the two low bits of n, so the trip count is always a multiple of 4. */
for(i = 0; i < (n&~3); i++)
pa[i] = pb[i] + x;
}

我在ubuntu10的PC上也编译了一版,使用-O2 -ftree-vectorize,但是对反汇编出的结果很疑惑,code如下:

void main()
{
 6f0:	a9b67bfd 	stp	x29, x30, [sp,#-160]!
	int *pa =(int*)malloc(36*4);
 6f4:	d2801200 	mov	x0, #0x90                  	// #144

}


void main()
{
 6f8:	910003fd 	mov	x29, sp
	int *pa =(int*)malloc(36*4);
 6fc:	97ffffed 	bl	6b0 <malloc@plt>
 700:	aa0003e2 	mov	x2, x0

void add_init (int* __restrict pa,int * pb,unsigned int n,int x){

	unsigned int i;
	for(i=0;i<(n&~3);i++)
		pa[i]=pb[i]+x;
 704:	910043a1 	add	x1, x29, #0x10
 708:	4f030480 	movi	v0.4s, #0x64
 70c:	aa0103e3 	mov	x3, x1
 710:	9100c3a0 	add	x0, x29, #0x30
 714:	910103a5 	add	x5, x29, #0x40
 718:	910143a4 	add	x4, x29, #0x50
 71c:	3cc10471 	ldr	q17, [x3],#16
 720:	910183a6 	add	x6, x29, #0x60
 724:	3dc00007 	ldr	q7, [x0]
 728:	aa0203e8 	mov	x8, x2
 72c:	3dc000a6 	ldr	q6, [x5]
 730:	9100804b 	add	x11, x2, #0x20
 734:	3dc00085 	ldr	q5, [x4]
 738:	9100c04c 	add	x12, x2, #0x30
 73c:	3dc000c1 	ldr	q1, [x6]
 740:	9101004d 	add	x13, x2, #0x40
 744:	3dc00070 	ldr	q16, [x3]
 748:	4ea08632 	add	v18.4s, v17.4s, v0.4s
 74c:	4ea084f3 	add	v19.4s, v7.4s, v0.4s
 750:	9101404e 	add	x14, x2, #0x50
 754:	4ea084d4 	add	v20.4s, v6.4s, v0.4s
 758:	9101c3a7 	add	x7, x29, #0x70
 75c:	4ea084b6 	add	v22.4s, v5.4s, v0.4s
 760:	910203a9 	add	x9, x29, #0x80
 764:	3c810512 	str	q18, [x8],#16
 768:	910243aa 	add	x10, x29, #0x90
 76c:	4ea08615 	add	v21.4s, v16.4s, v0.4s
 770:	3dc000e4 	ldr	q4, [x7]
 774:	4ea08437 	add	v23.4s, v1.4s, v0.4s
 778:	910283a3 	add	x3, x29, #0xa0
 77c:	3dc00123 	ldr	q3, [x9]
 780:	9101804f 	add	x15, x2, #0x60
 784:	3dc00142 	ldr	q2, [x10]
 788:	9101c050 	add	x16, x2, #0x70
 78c:	3d800115 	str	q21, [x8]
 790:	91020051 	add	x17, x2, #0x80
 794:	3d800173 	str	q19, [x11]
 798:	91024052 	add	x18, x2, #0x90
 79c:	3d800194 	str	q20, [x12]
void main()
{
	int *pa =(int*)malloc(36*4);
	int pb[36];
	add_init(pa,pb,40,100);
	printf("%d",pa[0]);
 7a0:	90000001 	adrp	x1, 0 <abitag-0x250>

void add_init (int* __restrict pa,int * pb,unsigned int n,int x){

	unsigned int i;
	for(i=0;i<(n&~3);i++)
		pa[i]=pb[i]+x;
 7a4:	3d8001b6 	str	q22, [x13]
void main()
{
	int *pa =(int*)malloc(36*4);
	int pb[36];
	add_init(pa,pb,40,100);
	printf("%d",pa[0]);
 7a8:	91234020 	add	x0, x1, #0x8d0

void add_init (int* __restrict pa,int * pb,unsigned int n,int x){

	unsigned int i;
	for(i=0;i<(n&~3);i++)
		pa[i]=pb[i]+x;
 7ac:	3d8001d7 	str	q23, [x14]
 7b0:	4ea08498 	add	v24.4s, v4.4s, v0.4s
 7b4:	4ea08479 	add	v25.4s, v3.4s, v0.4s
 7b8:	4ea0845a 	add	v26.4s, v2.4s, v0.4s
 7bc:	3dc0007b 	ldr	q27, [x3]
 7c0:	3d8001f8 	str	q24, [x15]
 7c4:	3d800219 	str	q25, [x16]
 7c8:	3d80023a 	str	q26, [x17]
 7cc:	4ea0877c 	add	v28.4s, v27.4s, v0.4s
 7d0:	3d80025c 	str	q28, [x18]
void main()
{
	int *pa =(int*)malloc(36*4);
	int pb[36];
	add_init(pa,pb,40,100);
	printf("%d",pa[0]);
 7d4:	b9400041 	ldr	w1, [x2]
	return;
}
 7d8:	a8ca7bfd 	ldp	x29, x30, [sp],#160
void main()
{
	int *pa =(int*)malloc(36*4);
	int pb[36];
	add_init(pa,pb,40,100);
	printf("%d",pa[0]);
 7dc:	17ffffbd 	b	6d0 <printf@plt>

其中诸如:

710: 9100c3a0 add x0, x29, #0x30
714: 910103a5 add x5, x29, #0x40
718: 910143a4 add x4, x29, #0x50

之类的代码完全不知所以,理解应该是在操作栈,可是事先没在里面“放东西”,却要“取东西”出来。

另外,比如此demo,如何可以看出优化的如何?比如有无多余的内存读取等?

望大牛解答心中疑惑!感谢

  • 我在本地重新编译了你的代码,但是结果完全不一样。我的环境是ubuntu16.04, gcc 5.4, aarch64平台。你的平台有些老了,出来的结果和较新的编译器差距很大。

    另外你在调用add_init函数时,会出现数组越界:pa和pb都只有36个元素,而传入的n是40,循环会访问到第40个元素。编译器处理不了这种问题,只会在某些运行条件下才暴露出来。