This discussion has been locked.

You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Optimising with NEON

Note: This was originally posted on 11th February 2010 at http://forums.arm.com

Hello guys, I am fairly new to ARM and NEON and was wondering if anyone could advise me on how to optimise a small piece of code with NEON.
The problem is that, whatever I do, I can't get the NEON code to execute faster than the standard C code.
I am running the code on a BeagleBoard with the latest Angstrom distribution and building with the CodeSourcery Lite GCC.
My gcc command line options are: -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math
The piece of code in question is the following:

// Scalar reference version: each loop iteration accumulates four weighted
// taps into pxlOut and then steps the two horizontal weights by slope_x.
// Only elements [0] and [1] of weightsH and weightsV are read below, although
// both arrays are declared with four elements.
int32_t weightsH[4];
int16_t weightsV[4];
int16_t *lut1, *lut2;

// start of loop

quantLevel1 = ...
quantLevel2 = ...
frac = ...
frac_1 = ...

pxlOut = 0;

// Two lookup-table rows; their entries are blended with frac / frac_1 below.
lut1 = luts[quantLevel1];
lut2 = luts[quantLevel2];

// Tap 0: horizontal weight 0, vertical weight 0, table entry 0.
// weight = ((H >> 16) * V) >> 13; the blended table value and the final
// product are each scaled down by >> 14 before being accumulated.
w1 = lut1[0];
w2 = lut2[0];
weight = ((weightsH[0]>>16)*weightsV[0])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 1: horizontal weight 1, vertical weight 0, table entry 1.
w1 = lut1[1];
w2 = lut2[1];
weight = ((weightsH[1]>>16)*weightsV[0])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 2: horizontal weight 0, vertical weight 1, table entry 2.
w1 = lut1[2];
w2 = lut2[2];
weight = ((weightsH[0]>>16)*weightsV[1])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 3: horizontal weight 1, vertical weight 1, table entry 3.
w1 = lut1[3];
w2 = lut2[3];
weight = ((weightsH[1]>>16)*weightsV[1])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Step the horizontal weights in opposite directions for the next iteration.
weightsH[0] -= slope_x;
weightsH[1] += slope_x;

// end of loop

The best-performing NEON code that I have managed to write is the following, yet it is still slower than the version above. It's just killing me:

int16x4_t wVs, lt1, lt2, fr1, w;
int32x4_t wHs,s;
int32x4_t buf;
int32x2_t buf2;

slopes[0] = -slope_x;
slopes[1] = slope_x;
slopes[2] = -slope_x;
slopes[3] = slope_x;

wHs = vld1q_s32(weightsH);
s = vld1q_s32(slopes);
wVs = vld1_s16(weightsV);

// start of loop

quantLevel1 = ...
quantLevel2 = ...
frac = ...
frac_1 = ...

lut1 = luts[quantLevel1];
lut2 = luts[quantLevel2];

buf = vdupq_n_s32(0);
lt1 = vld1_s16(lut1);
lt2 = vld1_s16(lut2);
buf = vmlal_n_s16(buf, lt1, (int16_t)frac);
buf = vmlal_n_s16(buf, lt2, (int16_t)frac_1);
fr1 = vshrn_n_s32(buf, 14);
w = vshrn_n_s32(wHs, 16);
buf = vmull_s16(w, wVs);
w = vshrn_n_s32(buf, 13);
buf = vmull_s16(w, fr1);
fr1 = vshrn_n_s32(buf, 14);
buf2 = vpaddl_s16(fr1);
pxlOut = vget_lane_s32(buf2, 0) + vget_lane_s32(buf2, 1);

// end of loop

Thank you in advance,
Roumen

Parents

Simon Craske over 12 years ago

Note: This was originally posted on 11th February 2010 at http://forums.arm.com

Roumen,

Have you looked at what code the compiler is generating for the pure C version?
With the command line options you're using, it's likely that GCC is already using Neon,
e.g.

typedef int t;

/* Returns the largest of the 32 values stored at p[0] .. p[31]. */
t foo(t *p)
{
    int idx;
    t largest;

    largest = p[0];
    for (idx = 0; idx < 32; idx++) {
        if (p[idx] > largest) {
            largest = p[idx];
        }
    }
    return largest;
}

Compiled with GCC 4.4.1 using "-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math" automatically produces the Neon code:

@ Compiler output for foo(), reproduced one instruction per line.
@ r0 holds p on entry and the result on exit; r1 walks the array.
        mov     r1, r0
        ldr     r2, [r0, #0]            @ r2 = p[0], the initial maximum
        vld1.32 {d2}, [r1]!             @ each vld1.32 brings in two 32-bit elements
        sub     sp, sp, #8              @ 8 bytes of scratch to build {p[0], p[0]}
        vld1.32 {d20}, [r1]!
        vld1.32 {d0}, [r1]!
        str     r2, [sp, #4]
        vld1.32 {d22}, [r1]!
        str     r2, [sp, #0]
        fldd    d16, [sp, #0]           @ d16 = {p[0], p[0]}
        vmax.s32 d1, d2, d16            @ running lane-wise maximum starts here
        vmax.s32 d17, d1, d20
        vld1.32 {d19}, [r1]!
        vmax.s32 d31, d17, d0
        vld1.32 {d21}, [r1]!
        vmax.s32 d30, d31, d22
        vld1.32 {d27}, [r1]!
        vmax.s32 d29, d30, d19
        vld1.32 {d20}, [r1]!
        vmax.s32 d28, d29, d21
        vld1.32 {d18}, [r1]!
        vmax.s32 d26, d28, d27
        vld1.32 {d19}, [r1]!
        vmax.s32 d25, d26, d20
        vld1.32 {d7}, [r1]!
        vmax.s32 d24, d25, d18
        vld1.32 {d18}, [r1]!
        vmax.s32 d23, d24, d19
        vld1.32 {d4}, [r1]!
        vmax.s32 d6, d23, d7
        vld1.32 {d2}, [r1]!
        vmax.s32 d5, d6, d18
        vld1.32 {d1}, [r1]              @ no writeback here; last pair is addressed via r3
        vmax.s32 d3, d5, d4
        add     r3, r1, #8
        vmax.s32 d17, d3, d2
        vmax.s32 d0, d17, d1
        vld1.32 {d17}, [r3]             @ sixteenth and final pair of elements
        vmax.s32 d16, d17, d0
        vpmax.s32 d16, d16, d16         @ fold the two lanes into a single maximum
        vmov.32 r0, d16[0]              @ result goes back in r0
        add     sp, sp, #8              @ release the scratch space
        bx      lr

hth
s.
Cancel
Vote up 0 Vote down

Cancel

Reply

Simon Craske over 12 years ago

Note: This was originally posted on 11th February 2010 at http://forums.arm.com

Roumen,

Have you looked at what code the compiler is generating for the pure C version?
With the command line options you're using, it's likely that GCC is already using Neon,
e.g.

typedef int t;

/* Returns the largest of the 32 values stored at p[0] .. p[31]. */
t foo(t *p)
{
    int idx;
    t largest;

    largest = p[0];
    for (idx = 0; idx < 32; idx++) {
        if (p[idx] > largest) {
            largest = p[idx];
        }
    }
    return largest;
}

Compiled with GCC 4.4.1 using "-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math" automatically produces the Neon code:

@ Compiler output for foo(), reproduced one instruction per line.
@ r0 holds p on entry and the result on exit; r1 walks the array.
        mov     r1, r0
        ldr     r2, [r0, #0]            @ r2 = p[0], the initial maximum
        vld1.32 {d2}, [r1]!             @ each vld1.32 brings in two 32-bit elements
        sub     sp, sp, #8              @ 8 bytes of scratch to build {p[0], p[0]}
        vld1.32 {d20}, [r1]!
        vld1.32 {d0}, [r1]!
        str     r2, [sp, #4]
        vld1.32 {d22}, [r1]!
        str     r2, [sp, #0]
        fldd    d16, [sp, #0]           @ d16 = {p[0], p[0]}
        vmax.s32 d1, d2, d16            @ running lane-wise maximum starts here
        vmax.s32 d17, d1, d20
        vld1.32 {d19}, [r1]!
        vmax.s32 d31, d17, d0
        vld1.32 {d21}, [r1]!
        vmax.s32 d30, d31, d22
        vld1.32 {d27}, [r1]!
        vmax.s32 d29, d30, d19
        vld1.32 {d20}, [r1]!
        vmax.s32 d28, d29, d21
        vld1.32 {d18}, [r1]!
        vmax.s32 d26, d28, d27
        vld1.32 {d19}, [r1]!
        vmax.s32 d25, d26, d20
        vld1.32 {d7}, [r1]!
        vmax.s32 d24, d25, d18
        vld1.32 {d18}, [r1]!
        vmax.s32 d23, d24, d19
        vld1.32 {d4}, [r1]!
        vmax.s32 d6, d23, d7
        vld1.32 {d2}, [r1]!
        vmax.s32 d5, d6, d18
        vld1.32 {d1}, [r1]              @ no writeback here; last pair is addressed via r3
        vmax.s32 d3, d5, d4
        add     r3, r1, #8
        vmax.s32 d17, d3, d2
        vmax.s32 d0, d17, d1
        vld1.32 {d17}, [r3]             @ sixteenth and final pair of elements
        vmax.s32 d16, d17, d0
        vpmax.s32 d16, d16, d16         @ fold the two lanes into a single maximum
        vmov.32 r0, d16[0]              @ result goes back in r0
        add     sp, sp, #8              @ release the scratch space
        bx      lr

hth
s.
Cancel
Vote up 0 Vote down

Cancel

Children

No data