最近在学习neon_programmer_guide,其中有一个demo如下
/*
 * Store pb[i] + x into pa[i] for every index i below n rounded down to a
 * multiple of four. Any trailing n % 4 elements of pa are left untouched.
 *
 * pa - destination array (restrict-qualified)
 * pb - source array (restrict-qualified)
 * n  - requested element count; only the low two bits are masked off
 * x  - value added to each source element
 */
void add_int (int* restrict pa, int* restrict pb, unsigned int n, int x)
{
    /* Clear the low two bits so the trip count is a multiple of four. */
    const unsigned int limit = n & ~3u;
    unsigned int idx = 0;

    while (idx < limit) {
        pa[idx] = pb[idx] + x;
        ++idx;
    }
}
我在ubuntu10的PC上也编译了一版,使用-O2 -ftree-vectorize,但是对反汇编出的结果很疑惑,code如下:
void main() { 6f0: a9b67bfd stp x29, x30, [sp,#-160]! int *pa =(int*)malloc(36*4); 6f4: d2801200 mov x0, #0x90 // #144 } void main() { 6f8: 910003fd mov x29, sp int *pa =(int*)malloc(36*4); 6fc: 97ffffed bl 6b0 <malloc@plt> 700: aa0003e2 mov x2, x0 void add_init (int* __restrict pa,int * pb,unsigned int n,int x){ unsigned int i; for(i=0;i<(n&~3);i++) pa[i]=pb[i]+x; 704: 910043a1 add x1, x29, #0x10 708: 4f030480 movi v0.4s, #0x64 70c: aa0103e3 mov x3, x1 710: 9100c3a0 add x0, x29, #0x30 714: 910103a5 add x5, x29, #0x40 718: 910143a4 add x4, x29, #0x50 71c: 3cc10471 ldr q17, [x3],#16 720: 910183a6 add x6, x29, #0x60 724: 3dc00007 ldr q7, [x0] 728: aa0203e8 mov x8, x2 72c: 3dc000a6 ldr q6, [x5] 730: 9100804b add x11, x2, #0x20 734: 3dc00085 ldr q5, [x4] 738: 9100c04c add x12, x2, #0x30 73c: 3dc000c1 ldr q1, [x6] 740: 9101004d add x13, x2, #0x40 744: 3dc00070 ldr q16, [x3] 748: 4ea08632 add v18.4s, v17.4s, v0.4s 74c: 4ea084f3 add v19.4s, v7.4s, v0.4s 750: 9101404e add x14, x2, #0x50 754: 4ea084d4 add v20.4s, v6.4s, v0.4s 758: 9101c3a7 add x7, x29, #0x70 75c: 4ea084b6 add v22.4s, v5.4s, v0.4s 760: 910203a9 add x9, x29, #0x80 764: 3c810512 str q18, [x8],#16 768: 910243aa add x10, x29, #0x90 76c: 4ea08615 add v21.4s, v16.4s, v0.4s 770: 3dc000e4 ldr q4, [x7] 774: 4ea08437 add v23.4s, v1.4s, v0.4s 778: 910283a3 add x3, x29, #0xa0 77c: 3dc00123 ldr q3, [x9] 780: 9101804f add x15, x2, #0x60 784: 3dc00142 ldr q2, [x10] 788: 9101c050 add x16, x2, #0x70 78c: 3d800115 str q21, [x8] 790: 91020051 add x17, x2, #0x80 794: 3d800173 str q19, [x11] 798: 91024052 add x18, x2, #0x90 79c: 3d800194 str q20, [x12] void main() { int *pa =(int*)malloc(36*4); int pb[36]; add_init(pa,pb,40,100); printf("%d",pa[0]); 7a0: 90000001 adrp x1, 0 <abitag-0x250> void add_init (int* __restrict pa,int * pb,unsigned int n,int x){ unsigned int i; for(i=0;i<(n&~3);i++) pa[i]=pb[i]+x; 7a4: 3d8001b6 str q22, [x13] void main() { int *pa =(int*)malloc(36*4); int pb[36]; add_init(pa,pb,40,100); 
printf("%d",pa[0]); 7a8: 91234020 add x0, x1, #0x8d0 void add_init (int* __restrict pa,int * pb,unsigned int n,int x){ unsigned int i; for(i=0;i<(n&~3);i++) pa[i]=pb[i]+x; 7ac: 3d8001d7 str q23, [x14] 7b0: 4ea08498 add v24.4s, v4.4s, v0.4s 7b4: 4ea08479 add v25.4s, v3.4s, v0.4s 7b8: 4ea0845a add v26.4s, v2.4s, v0.4s 7bc: 3dc0007b ldr q27, [x3] 7c0: 3d8001f8 str q24, [x15] 7c4: 3d800219 str q25, [x16] 7c8: 3d80023a str q26, [x17] 7cc: 4ea0877c add v28.4s, v27.4s, v0.4s 7d0: 3d80025c str q28, [x18] void main() { int *pa =(int*)malloc(36*4); int pb[36]; add_init(pa,pb,40,100); printf("%d",pa[0]); 7d4: b9400041 ldr w1, [x2] return; } 7d8: a8ca7bfd ldp x29, x30, [sp],#160 void main() { int *pa =(int*)malloc(36*4); int pb[36]; add_init(pa,pb,40,100); printf("%d",pa[0]); 7dc: 17ffffbd b 6d0 <printf@plt>
其中诸如:
710: 9100c3a0 add x0, x29, #0x30
714: 910103a5 add x5, x29, #0x40
718: 910143a4 add x4, x29, #0x50
之类的代码完全不知所以,理解应该是在操作栈,可是事先没在里面“放东西”,却要“取东西”出来。
另外,比如此demo,如何可以看出优化的如何?比如有无多余的内存读取等?
望大牛解答心中疑惑!感谢
我在本地重新编译了你的代码,但是结果完全不一样。我的环境是ubuntu16.04, gcc 5.4, aarch64平台。你的平台有些老了,出来的结果和较新的编译器差距很大。
另外你在调用add_init函数时,会出现数组越界:传入的n是40,而pa和pb都只有36个int。编译器处理不了这种问题,会在某些运行条件时爆出来这个问题。