This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Optimising with NEON

Note: This was originally posted on 11th February 2010 at http://forums.arm.com

Hello guys, I am fairly new to ARM and NEON and was wondering if anyone could advise me on how to optimise a small piece of code using NEON.
The problem is that whatever I do I can't get the NEON code to execute faster than the standard C code.
I am running the code on a BeagleBoard with the latest Angstrom distribution and building with the CodeSourcery Lite GCC.
My gcc command line options are  -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math
The piece of code in question is the following:

// Scalar reference version: accumulates four weighted LUT taps into pxlOut
// on every loop iteration, using integer multiplies and right shifts only.
// Only weightsH[0..1] and weightsV[0..1] are read in the code shown here.
int32_t weightsH[4];
int16_t weightsV[4];
int16_t *lut1, *lut2;

// start of loop

quantLevel1 = ...
quantLevel2 = ...
frac = ...
frac_1 = ...

pxlOut      = 0;

// Select the two LUT rows that are combined with frac / frac_1 below.
lut1 = luts[quantLevel1];
lut2 = luts[quantLevel2];

// Tap 0: high half of weightsH[0] times weightsV[0], scaled down by 13 bits,
// applied to the frac-weighted sum of the two LUT entries (scaled down by 14 bits twice).
w1 = lut1[0];
w2 = lut2[0];
weight = ((weightsH[0]>>16)*weightsV[0])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 1: weightsH[1] with weightsV[0].
w1 = lut1[1];
w2 = lut2[1];
weight = ((weightsH[1]>>16)*weightsV[0])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 2: weightsH[0] with weightsV[1].
w1 = lut1[2];
w2 = lut2[2];
weight = ((weightsH[0]>>16)*weightsV[1])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Tap 3: weightsH[1] with weightsV[1].
w1 = lut1[3];
w2 = lut2[3];
weight = ((weightsH[1]>>16)*weightsV[1])>>13;
pxlOut += (weight*((w1*frac + w2*frac_1)>>14))>>14;

// Step the two horizontal weights in opposite directions for the next iteration.
weightsH[0] -= slope_x;
weightsH[1] += slope_x;

// end of loop

The NEON code with the best results that I managed to obtain is the following. Yet it's still slower than the above, and it's just killing me:

// NEON intrinsics version: processes the four taps as one 4-lane vector.
// Lane i of wHs is multiplied by lane i of wVs, so this only matches the
// scalar pairing (H0*V0, H1*V0, H0*V1, H1*V1) if weightsH/weightsV are laid
// out accordingly -- NOTE(review): layout is not shown here, confirm.
int16x4_t wVs, lt1, lt2, fr1, w;
int32x4_t wHs,s;
int32x4_t buf;
int32x2_t buf2;

// Per-lane slope with alternating sign (the declaration of slopes is not shown).
slopes[0] = -slope_x;
slopes[1] = slope_x;
slopes[2] = -slope_x;
slopes[3] = slope_x;

// Loaded once, before the loop.
wHs = vld1q_s32(weightsH);
// NOTE(review): s is never used in the loop body shown below, whereas the
// scalar version updates weightsH by slope_x every iteration -- confirm
// whether the wHs update was omitted from this excerpt.
s = vld1q_s32(slopes);
wVs = vld1_s16(weightsV);

// start of loop

quantLevel1 = ...
quantLevel2 = ...
frac = ...
frac_1 = ...

lut1 = luts[quantLevel1];
lut2 = luts[quantLevel2];

// buf = lut1[0..3]*frac + lut2[0..3]*frac_1, accumulated in 32-bit lanes.
buf = vdupq_n_s32(0);
lt1 = vld1_s16(lut1);
lt2 = vld1_s16(lut2);
buf = vmlal_n_s16(buf, lt1, (int16_t)frac);
buf = vmlal_n_s16(buf, lt2, (int16_t)frac_1);
// Shift right by 14 and narrow to 16-bit lanes (the scalar code keeps this value as a wider int).
fr1 = vshrn_n_s32(buf, 14);
// weight = ((wHs >> 16) * wVs) >> 13, narrowed to 16-bit lanes.
w = vshrn_n_s32(wHs, 16);
buf = vmull_s16(w, wVs);
w = vshrn_n_s32(buf, 13);
// Per-lane tap value = (weight * fr1) >> 14, narrowed to 16-bit lanes.
buf = vmull_s16(w, fr1);
fr1 = vshrn_n_s32(buf, 14);
// Horizontal sum: pairwise widening add gives two 32-bit partial sums,
// which are then extracted and added in scalar code.
buf2 = vpaddl_s16(fr1);
pxlOut = vget_lane_s32(buf2, 0) + vget_lane_s32(buf2, 1);

// end of loop

Thank you in advance,
Roumen
Parents
  • Note: This was originally posted on 11th February 2010 at http://forums.arm.com

    Roumen,

    Have you looked at what code the compiler is generating for the pure C version?
    With the command line options you're using, it's likely that GCC is already using Neon,
    e.g.

    /* Element type used by foo; this example defines it as int. */
    typedef int t;

    /*
     * Returns the largest of the 32 elements p[0] .. p[31].
     * The loop bound is fixed at 32, so p must point to at least 32 elements.
     */
    t foo(t *p)
    {
    int i;
    t max;
    max = p[0];   /* seed the running maximum with the first element */

    for(i=0;i<32;i++)   /* starts at 0, so p[0] is compared against itself once */
      if(p[i]>max) max=p[i];

    return max;
    }


    Compiled with GCC 4.4.1 using "-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math" automatically produces the Neon code:

    @ Compiler output for foo(): r0 holds p on entry and the result on return.
    @ Sixteen 64-bit loads cover p[0..31]; a two-lane signed maximum is carried
    @ through the vmax.s32 chain and folded to a single value at the end.
    mov  r1, r0                     @ r1 = p, used as the load pointer
    ldr  r2, [r0, #0]               @ r2 = p[0]
    vld1.32 {d2}, [r1]!             @ d2 = p[0..1]; each load with writeback (!) advances r1 by 8 bytes
    sub  sp, sp, #8                 @ reserve 8 bytes of stack scratch
    vld1.32 {d20}, [r1]!
    vld1.32 {d0}, [r1]!
    str  r2, [sp, #4]               @ scratch high word = p[0]
    vld1.32 {d22}, [r1]!
    str  r2, [sp, #0]               @ scratch low word = p[0]
    fldd d16, [sp, #0]              @ d16 = {p[0], p[0]}: initial max in both lanes
    vmax.s32  d1, d2, d16           @ start of the lane-wise max chain, interleaved with the loads
    vmax.s32  d17, d1, d20
    vld1.32 {d19}, [r1]!
    vmax.s32  d31, d17, d0
    vld1.32 {d21}, [r1]!
    vmax.s32  d30, d31, d22
    vld1.32 {d27}, [r1]!
    vmax.s32  d29, d30, d19
    vld1.32 {d20}, [r1]!
    vmax.s32  d28, d29, d21
    vld1.32 {d18}, [r1]!
    vmax.s32  d26, d28, d27
    vld1.32 {d19}, [r1]!
    vmax.s32  d25, d26, d20
    vld1.32 {d7}, [r1]!
    vmax.s32  d24, d25, d18
    vld1.32 {d18}, [r1]!
    vmax.s32  d23, d24, d19
    vld1.32 {d4}, [r1]!
    vmax.s32  d6, d23, d7
    vld1.32 {d2}, [r1]!
    vmax.s32  d5, d6, d18
    vld1.32 {d1}, [r1]              @ p[28..29]; no writeback, r1 is left unchanged
    vmax.s32  d3, d5, d4
    add  r3, r1, #8                 @ r3 = address of the last pair, p[30..31]
    vmax.s32  d17, d3, d2
    vmax.s32  d0, d17, d1
    vld1.32 {d17}, [r3]             @ d17 = p[30..31]
    vmax.s32  d16, d17, d0          @ d16 = lane-wise max over all 16 loaded pairs
    vpmax.s32    d16, d16, d16      @ pairwise max: fold the two lanes into one value
    vmov.32 r0, d16[0]              @ return value = lane 0
    add  sp, sp, #8                 @ release the stack scratch
    bx   lr


    hth
    s.
Reply
  • Note: This was originally posted on 11th February 2010 at http://forums.arm.com

    Roumen,

    Have you looked at what code the compiler is generating for the pure C version?
    With the command line options you're using, it's likely that GCC is already using Neon,
    e.g.

    /* Element type used by foo; this example defines it as int. */
    typedef int t;

    /*
     * Returns the largest of the 32 elements p[0] .. p[31].
     * The loop bound is fixed at 32, so p must point to at least 32 elements.
     */
    t foo(t *p)
    {
    int i;
    t max;
    max = p[0];   /* seed the running maximum with the first element */

    for(i=0;i<32;i++)   /* starts at 0, so p[0] is compared against itself once */
      if(p[i]>max) max=p[i];

    return max;
    }


    Compiled with GCC 4.4.1 using "-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math" automatically produces the Neon code:

    @ Compiler output for foo(): r0 holds p on entry and the result on return.
    @ Sixteen 64-bit loads cover p[0..31]; a two-lane signed maximum is carried
    @ through the vmax.s32 chain and folded to a single value at the end.
    mov  r1, r0                     @ r1 = p, used as the load pointer
    ldr  r2, [r0, #0]               @ r2 = p[0]
    vld1.32 {d2}, [r1]!             @ d2 = p[0..1]; each load with writeback (!) advances r1 by 8 bytes
    sub  sp, sp, #8                 @ reserve 8 bytes of stack scratch
    vld1.32 {d20}, [r1]!
    vld1.32 {d0}, [r1]!
    str  r2, [sp, #4]               @ scratch high word = p[0]
    vld1.32 {d22}, [r1]!
    str  r2, [sp, #0]               @ scratch low word = p[0]
    fldd d16, [sp, #0]              @ d16 = {p[0], p[0]}: initial max in both lanes
    vmax.s32  d1, d2, d16           @ start of the lane-wise max chain, interleaved with the loads
    vmax.s32  d17, d1, d20
    vld1.32 {d19}, [r1]!
    vmax.s32  d31, d17, d0
    vld1.32 {d21}, [r1]!
    vmax.s32  d30, d31, d22
    vld1.32 {d27}, [r1]!
    vmax.s32  d29, d30, d19
    vld1.32 {d20}, [r1]!
    vmax.s32  d28, d29, d21
    vld1.32 {d18}, [r1]!
    vmax.s32  d26, d28, d27
    vld1.32 {d19}, [r1]!
    vmax.s32  d25, d26, d20
    vld1.32 {d7}, [r1]!
    vmax.s32  d24, d25, d18
    vld1.32 {d18}, [r1]!
    vmax.s32  d23, d24, d19
    vld1.32 {d4}, [r1]!
    vmax.s32  d6, d23, d7
    vld1.32 {d2}, [r1]!
    vmax.s32  d5, d6, d18
    vld1.32 {d1}, [r1]              @ p[28..29]; no writeback, r1 is left unchanged
    vmax.s32  d3, d5, d4
    add  r3, r1, #8                 @ r3 = address of the last pair, p[30..31]
    vmax.s32  d17, d3, d2
    vmax.s32  d0, d17, d1
    vld1.32 {d17}, [r3]             @ d17 = p[30..31]
    vmax.s32  d16, d17, d0          @ d16 = lane-wise max over all 16 loaded pairs
    vpmax.s32    d16, d16, d16      @ pairwise max: fold the two lanes into one value
    vmov.32 r0, d16[0]              @ return value = lane 0
    add  sp, sp, #8                 @ release the stack scratch
    bx   lr


    hth
    s.
Children
No data