This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Does GCC really support automatic vectorization for NEON technology?

There are two developer articles that mention that GCC can do it:

Introducing NEON

NEON Support in Compilation Tools

But when I tested the code snippets from these docs with the GCC compile options, the generated assembly code

doesn't use any NEON instructions.

Parents
  • It should be what you said, but the real compiling result is almost the same.

    1) Without the hint of 'len & ~3'

    /*
     * Returns the sum of c[0] .. c[len-1]; the loop body never runs when
     * len <= 0, in which case the result is 0.
     *
     * NOTE(review): the aligned(16) attribute is written after the '*', so it
     * presumably applies to the pointer variable c itself rather than to the
     * ints it points at -- confirm against the GCC attribute-syntax docs.
     */
    int accumulate(int * __attribute__ ((aligned (16))) c, int len)

    {

        int i, retval;

        /* retval is zeroed in the loop initializer, so it is defined even for an empty loop. */
        for(i=0, retval = 0; i < len; i++) {

            retval += c[i];

        }

        return retval;

    }

    Compiling output:

    @ accumulate: GCC-generated ARM code (with NEON) for the C function listed above.
    @ Only the whitespace between mnemonics/directives and their operands has been
    @ restored; the instruction sequence is unchanged.
    @ In:  r0 = c, r1 = len        Out: r0 = sum (accumulated in r3)
    @ r4-r7 are pushed on entry and popped again at .L2.
            .align  2
            .global accumulate
            .type   accumulate, %function

    accumulate:
            @ args = 0, pretend = 0, frame = 0
            @ frame_needed = 0, uses_anonymous_args = 0
            @ link register save eliminated.
            cmp     r1, #0
            stmfd   sp!, {r4, r5, r6, r7}
            ble     .L14                    @ len <= 0: result is 0
            ands    r2, r0, #4
            mvnne   r2, #0
            and     r2, r2, #3              @ r2 = 3 if bit 2 of c is set, else 0
            cmp     r2, r1
            movcs   r2, r1                  @ clamp r2 to len (unsigned compare)
            cmp     r1, #6
            movls   r2, r1                  @ len <= 6: r2 = len, everything is done in scalar code
            bhi     .L28                    @ len > 6

    @ Scalar prologue: adds the first r2 elements (1..6) into r3 one at a time;
    @ ip ends up holding the number of elements consumed.
    .L3:
            cmp     r2, #1
            ldr     r3, [r0]                @ r3 = c[0]
            movls   ip, #1
            bls     .L5
            ldr     ip, [r0, #4]
            cmp     r2, #2
            add     r3, r3, ip              @ r3 += c[1]
            movls   ip, #2
            bls     .L5
            ldr     ip, [r0, #8]
            cmp     r2, #3
            add     r3, r3, ip              @ r3 += c[2]
            movls   ip, #3
            bls     .L5
            ldr     ip, [r0, #12]
            cmp     r2, #4
            add     r3, r3, ip              @ r3 += c[3]
            movls   ip, #4
            bls     .L5
            cmp     r2, #5
            ldr     ip, [r0, #16]
            ldrhi   r4, [r0, #20]           @ c[5] is loaded only when r2 > 5
            add     r3, r3, ip              @ r3 += c[4]
            addhi   r3, r3, r4              @ r3 += c[5] when r2 > 5
            movls   ip, #5
            movhi   ip, #6

    .L5:
            cmp     r1, r2
            beq     .L2                     @ prologue consumed every element
            rsb     r6, r2, r1              @ r6 = len - r2 (elements remaining)
            mov     r5, r6, lsr #2          @ r5 = r6 / 4 (vector iterations)
            movs    r7, r5, asl #2          @ r7 = r5 * 4 (elements the vector loop covers)
            beq     .L7                     @ fewer than 4 left: scalar tail only

    .L29:
            add     r2, r0, r2, asl #2      @ r2 = c + r2*4, first address read by the vector loop
            mov     r4, #0                  @ r4 = iteration counter
            vmov.i32 q8, #0  @ v4si

    @ Vector loop: each pass loads four ints into q9 and adds them lane-wise into q8.
    .L13:
            add     r4, r4, #1
            vld1.64 {d18-d19}, [r2:64]!     @ post-incremented load with a 64-bit alignment hint
            cmp     r5, r4
            vadd.i32 q8, q8, q9
            bhi     .L13                    @ loop while r5 > r4 (unsigned)
            vadd.i32 d16, d16, d17          @ fold the two halves of q8
            vmov.i32 q9, #0  @ v4si
            vpadd.i32 d18, d16, d16         @ pairwise add: lane 0 = sum of all four lanes
            vmov.32 r2, d18[0]
            cmp     r6, r7
            add     ip, ip, r7              @ ip = index of the first element not yet summed
            add     r3, r3, r2              @ merge the vector sum into the scalar sum
            beq     .L2                     @ vector loop covered everything that remained

    @ Scalar tail: adds c[ip], then c[ip+1] and c[ip+2] while the index is below len.
    .L7:
            ldr     r4, [r0, ip, asl #2]
            add     r2, ip, #1
            cmp     r1, r2
            add     r3, r3, r4
            ble     .L2
            ldr     r2, [r0, r2, asl #2]
            add     ip, ip, #2
            cmp     r1, ip
            add     r3, r3, r2
            ldrgt   r2, [r0, ip, asl #2]
            addgt   r3, r3, r2

    .L2:
            mov     r0, r3                  @ return value
            ldmfd   sp!, {r4, r5, r6, r7}
            bx      lr

    @ len > 6: with r2 == 0 the prologue is skipped (sum and index start at 0);
    @ otherwise the scalar prologue at .L3 runs first.
    .L28:
            cmp     r2, #0
            moveq   r3, r2
            moveq   ip, r2
            bne     .L3
            rsb     r6, r2, r1
            mov     r5, r6, lsr #2
            movs    r7, r5, asl #2
            bne     .L29
            b       .L7

    .L14:
            mov     r3, #0
            b       .L2
            .size   accumulate, .-accumulate

    2) With the hint of 'len & ~3'

    /*
     * Returns the sum of the first (len & ~3) elements of c, i.e. len rounded
     * down to a multiple of 4; any trailing 1-3 elements are not added.
     *
     * NOTE(review): the aligned(16) attribute is written after the '*', so it
     * presumably applies to the pointer variable c itself rather than to the
     * ints it points at -- confirm against the GCC attribute-syntax docs.
     */
    int accumulate(int * __attribute__ ((aligned (16))) c, int len)

    {

        int i, retval;

        /* Masking off the low two bits of len makes the trip count a multiple of 4. */
        for(i=0, retval = 0; i < (len & ~3) ; i++) {

            retval += c[i];

        }

        return retval;

    }

    Compiling result:

    @ accumulate: GCC-generated ARM code (with NEON) for the 'len & ~3' variant of
    @ the C function listed above.
    @ Only the whitespace between mnemonics/directives and their operands has been
    @ restored; the instruction sequence is unchanged.
    @ In:  r0 = c, r1 = len        Out: r0 = sum (accumulated in r3)
    @ r4-r7 are pushed on entry and popped again at .L2.
            .align  2
            .global accumulate
            .type   accumulate, %function

    accumulate:
            @ args = 0, pretend = 0, frame = 0
            @ frame_needed = 0, uses_anonymous_args = 0
            @ link register save eliminated.
            bic     r1, r1, #3              @ r1 = len & ~3
            cmp     r1, #0
            stmfd   sp!, {r4, r5, r6, r7}
            ble     .L14                    @ masked len <= 0: result is 0
            ands    r2, r0, #4
            mvnne   r2, #0
            and     r2, r2, #3              @ r2 = 3 if bit 2 of c is set, else 0
            cmp     r2, r1
            movcs   r2, r1                  @ clamp r2 to r1 (unsigned compare)
            cmp     r1, #6
            movls   r2, r1                  @ r1 <= 6: r2 = r1, everything is done in scalar code
            bhi     .L28                    @ r1 > 6

    @ Scalar prologue: adds the first r2 elements (1..6) into r3 one at a time;
    @ ip ends up holding the number of elements consumed.
    .L3:
            cmp     r2, #1
            ldr     r3, [r0]                @ r3 = c[0]
            movls   ip, #1
            bls     .L5
            ldr     ip, [r0, #4]
            cmp     r2, #2
            add     r3, r3, ip              @ r3 += c[1]
            movls   ip, #2
            bls     .L5
            ldr     ip, [r0, #8]
            cmp     r2, #3
            add     r3, r3, ip              @ r3 += c[2]
            movls   ip, #3
            bls     .L5
            ldr     ip, [r0, #12]
            cmp     r2, #4
            add     r3, r3, ip              @ r3 += c[3]
            movls   ip, #4
            bls     .L5
            cmp     r2, #5
            ldr     ip, [r0, #16]
            ldrhi   r4, [r0, #20]           @ c[5] is loaded only when r2 > 5
            add     r3, r3, ip              @ r3 += c[4]
            addhi   r3, r3, r4              @ r3 += c[5] when r2 > 5
            movls   ip, #5
            movhi   ip, #6

    .L5:
            cmp     r1, r2
            beq     .L2                     @ prologue consumed every element
            rsb     r6, r2, r1              @ r6 = r1 - r2 (elements remaining)
            mov     r5, r6, lsr #2          @ r5 = r6 / 4 (vector iterations)
            movs    r7, r5, asl #2          @ r7 = r5 * 4 (elements the vector loop covers)
            beq     .L7                     @ fewer than 4 left: scalar tail only

    .L29:
            add     r2, r0, r2, asl #2      @ r2 = c + r2*4, first address read by the vector loop
            mov     r4, #0                  @ r4 = iteration counter
            vmov.i32 q8, #0  @ v4si

    @ Vector loop: each pass loads four ints into q9 and adds them lane-wise into q8.
    .L13:
            add     r4, r4, #1
            vld1.64 {d18-d19}, [r2:64]!     @ post-incremented load with a 64-bit alignment hint
            cmp     r5, r4
            vadd.i32 q8, q8, q9
            bhi     .L13                    @ loop while r5 > r4 (unsigned)
            vadd.i32 d16, d16, d17          @ fold the two halves of q8
            vmov.i32 q9, #0  @ v4si
            vpadd.i32 d18, d16, d16         @ pairwise add: lane 0 = sum of all four lanes
            vmov.32 r2, d18[0]
            cmp     r6, r7
            add     ip, ip, r7              @ ip = index of the first element not yet summed
            add     r3, r3, r2              @ merge the vector sum into the scalar sum
            beq     .L2                     @ vector loop covered everything that remained

    @ Scalar tail: adds c[ip], then c[ip+1] and c[ip+2] while the index is below r1.
    .L7:
            ldr     r4, [r0, ip, asl #2]
            add     r2, ip, #1
            cmp     r2, r1
            add     r3, r3, r4
            bge     .L2
            ldr     r2, [r0, r2, asl #2]
            add     ip, ip, #2
            cmp     r1, ip
            add     r3, r3, r2
            ldrgt   r2, [r0, ip, asl #2]
            addgt   r3, r3, r2

    .L2:
            mov     r0, r3                  @ return value
            ldmfd   sp!, {r4, r5, r6, r7}
            bx      lr

    @ r1 > 6: with r2 == 0 the prologue is skipped (sum and index start at 0);
    @ otherwise the scalar prologue at .L3 runs first.
    .L28:
            cmp     r2, #0
            moveq   r3, r2
            moveq   ip, r2
            bne     .L3
            rsb     r6, r2, r1
            mov     r5, r6, lsr #2
            movs    r7, r5, asl #2
            bne     .L29
            b       .L7

    .L14:
            mov     r3, #0
            b       .L2
            .size   accumulate, .-accumulate

    The only difference is that 'len & ~3' compiled into an instruction 'bic  r1, r1, #3', nothing else.

Reply
  • It should be what you said, but the real compiling result is almost the same.

    1) Without the hint of 'len & ~3'

    /*
     * Returns the sum of c[0] .. c[len-1]; the loop body never runs when
     * len <= 0, in which case the result is 0.
     *
     * NOTE(review): the aligned(16) attribute is written after the '*', so it
     * presumably applies to the pointer variable c itself rather than to the
     * ints it points at -- confirm against the GCC attribute-syntax docs.
     */
    int accumulate(int * __attribute__ ((aligned (16))) c, int len)

    {

        int i, retval;

        /* retval is zeroed in the loop initializer, so it is defined even for an empty loop. */
        for(i=0, retval = 0; i < len; i++) {

            retval += c[i];

        }

        return retval;

    }

    Compiling output:

    @ accumulate: GCC-generated ARM code (with NEON) for the C function listed above.
    @ Only the whitespace between mnemonics/directives and their operands has been
    @ restored; the instruction sequence is unchanged.
    @ In:  r0 = c, r1 = len        Out: r0 = sum (accumulated in r3)
    @ r4-r7 are pushed on entry and popped again at .L2.
            .align  2
            .global accumulate
            .type   accumulate, %function

    accumulate:
            @ args = 0, pretend = 0, frame = 0
            @ frame_needed = 0, uses_anonymous_args = 0
            @ link register save eliminated.
            cmp     r1, #0
            stmfd   sp!, {r4, r5, r6, r7}
            ble     .L14                    @ len <= 0: result is 0
            ands    r2, r0, #4
            mvnne   r2, #0
            and     r2, r2, #3              @ r2 = 3 if bit 2 of c is set, else 0
            cmp     r2, r1
            movcs   r2, r1                  @ clamp r2 to len (unsigned compare)
            cmp     r1, #6
            movls   r2, r1                  @ len <= 6: r2 = len, everything is done in scalar code
            bhi     .L28                    @ len > 6

    @ Scalar prologue: adds the first r2 elements (1..6) into r3 one at a time;
    @ ip ends up holding the number of elements consumed.
    .L3:
            cmp     r2, #1
            ldr     r3, [r0]                @ r3 = c[0]
            movls   ip, #1
            bls     .L5
            ldr     ip, [r0, #4]
            cmp     r2, #2
            add     r3, r3, ip              @ r3 += c[1]
            movls   ip, #2
            bls     .L5
            ldr     ip, [r0, #8]
            cmp     r2, #3
            add     r3, r3, ip              @ r3 += c[2]
            movls   ip, #3
            bls     .L5
            ldr     ip, [r0, #12]
            cmp     r2, #4
            add     r3, r3, ip              @ r3 += c[3]
            movls   ip, #4
            bls     .L5
            cmp     r2, #5
            ldr     ip, [r0, #16]
            ldrhi   r4, [r0, #20]           @ c[5] is loaded only when r2 > 5
            add     r3, r3, ip              @ r3 += c[4]
            addhi   r3, r3, r4              @ r3 += c[5] when r2 > 5
            movls   ip, #5
            movhi   ip, #6

    .L5:
            cmp     r1, r2
            beq     .L2                     @ prologue consumed every element
            rsb     r6, r2, r1              @ r6 = len - r2 (elements remaining)
            mov     r5, r6, lsr #2          @ r5 = r6 / 4 (vector iterations)
            movs    r7, r5, asl #2          @ r7 = r5 * 4 (elements the vector loop covers)
            beq     .L7                     @ fewer than 4 left: scalar tail only

    .L29:
            add     r2, r0, r2, asl #2      @ r2 = c + r2*4, first address read by the vector loop
            mov     r4, #0                  @ r4 = iteration counter
            vmov.i32 q8, #0  @ v4si

    @ Vector loop: each pass loads four ints into q9 and adds them lane-wise into q8.
    .L13:
            add     r4, r4, #1
            vld1.64 {d18-d19}, [r2:64]!     @ post-incremented load with a 64-bit alignment hint
            cmp     r5, r4
            vadd.i32 q8, q8, q9
            bhi     .L13                    @ loop while r5 > r4 (unsigned)
            vadd.i32 d16, d16, d17          @ fold the two halves of q8
            vmov.i32 q9, #0  @ v4si
            vpadd.i32 d18, d16, d16         @ pairwise add: lane 0 = sum of all four lanes
            vmov.32 r2, d18[0]
            cmp     r6, r7
            add     ip, ip, r7              @ ip = index of the first element not yet summed
            add     r3, r3, r2              @ merge the vector sum into the scalar sum
            beq     .L2                     @ vector loop covered everything that remained

    @ Scalar tail: adds c[ip], then c[ip+1] and c[ip+2] while the index is below len.
    .L7:
            ldr     r4, [r0, ip, asl #2]
            add     r2, ip, #1
            cmp     r1, r2
            add     r3, r3, r4
            ble     .L2
            ldr     r2, [r0, r2, asl #2]
            add     ip, ip, #2
            cmp     r1, ip
            add     r3, r3, r2
            ldrgt   r2, [r0, ip, asl #2]
            addgt   r3, r3, r2

    .L2:
            mov     r0, r3                  @ return value
            ldmfd   sp!, {r4, r5, r6, r7}
            bx      lr

    @ len > 6: with r2 == 0 the prologue is skipped (sum and index start at 0);
    @ otherwise the scalar prologue at .L3 runs first.
    .L28:
            cmp     r2, #0
            moveq   r3, r2
            moveq   ip, r2
            bne     .L3
            rsb     r6, r2, r1
            mov     r5, r6, lsr #2
            movs    r7, r5, asl #2
            bne     .L29
            b       .L7

    .L14:
            mov     r3, #0
            b       .L2
            .size   accumulate, .-accumulate

    2) With the hint of 'len & ~3'

    /*
     * Returns the sum of the first (len & ~3) elements of c, i.e. len rounded
     * down to a multiple of 4; any trailing 1-3 elements are not added.
     *
     * NOTE(review): the aligned(16) attribute is written after the '*', so it
     * presumably applies to the pointer variable c itself rather than to the
     * ints it points at -- confirm against the GCC attribute-syntax docs.
     */
    int accumulate(int * __attribute__ ((aligned (16))) c, int len)

    {

        int i, retval;

        /* Masking off the low two bits of len makes the trip count a multiple of 4. */
        for(i=0, retval = 0; i < (len & ~3) ; i++) {

            retval += c[i];

        }

        return retval;

    }

    Compiling result:

    @ accumulate: GCC-generated ARM code (with NEON) for the 'len & ~3' variant of
    @ the C function listed above.
    @ Only the whitespace between mnemonics/directives and their operands has been
    @ restored; the instruction sequence is unchanged.
    @ In:  r0 = c, r1 = len        Out: r0 = sum (accumulated in r3)
    @ r4-r7 are pushed on entry and popped again at .L2.
            .align  2
            .global accumulate
            .type   accumulate, %function

    accumulate:
            @ args = 0, pretend = 0, frame = 0
            @ frame_needed = 0, uses_anonymous_args = 0
            @ link register save eliminated.
            bic     r1, r1, #3              @ r1 = len & ~3
            cmp     r1, #0
            stmfd   sp!, {r4, r5, r6, r7}
            ble     .L14                    @ masked len <= 0: result is 0
            ands    r2, r0, #4
            mvnne   r2, #0
            and     r2, r2, #3              @ r2 = 3 if bit 2 of c is set, else 0
            cmp     r2, r1
            movcs   r2, r1                  @ clamp r2 to r1 (unsigned compare)
            cmp     r1, #6
            movls   r2, r1                  @ r1 <= 6: r2 = r1, everything is done in scalar code
            bhi     .L28                    @ r1 > 6

    @ Scalar prologue: adds the first r2 elements (1..6) into r3 one at a time;
    @ ip ends up holding the number of elements consumed.
    .L3:
            cmp     r2, #1
            ldr     r3, [r0]                @ r3 = c[0]
            movls   ip, #1
            bls     .L5
            ldr     ip, [r0, #4]
            cmp     r2, #2
            add     r3, r3, ip              @ r3 += c[1]
            movls   ip, #2
            bls     .L5
            ldr     ip, [r0, #8]
            cmp     r2, #3
            add     r3, r3, ip              @ r3 += c[2]
            movls   ip, #3
            bls     .L5
            ldr     ip, [r0, #12]
            cmp     r2, #4
            add     r3, r3, ip              @ r3 += c[3]
            movls   ip, #4
            bls     .L5
            cmp     r2, #5
            ldr     ip, [r0, #16]
            ldrhi   r4, [r0, #20]           @ c[5] is loaded only when r2 > 5
            add     r3, r3, ip              @ r3 += c[4]
            addhi   r3, r3, r4              @ r3 += c[5] when r2 > 5
            movls   ip, #5
            movhi   ip, #6

    .L5:
            cmp     r1, r2
            beq     .L2                     @ prologue consumed every element
            rsb     r6, r2, r1              @ r6 = r1 - r2 (elements remaining)
            mov     r5, r6, lsr #2          @ r5 = r6 / 4 (vector iterations)
            movs    r7, r5, asl #2          @ r7 = r5 * 4 (elements the vector loop covers)
            beq     .L7                     @ fewer than 4 left: scalar tail only

    .L29:
            add     r2, r0, r2, asl #2      @ r2 = c + r2*4, first address read by the vector loop
            mov     r4, #0                  @ r4 = iteration counter
            vmov.i32 q8, #0  @ v4si

    @ Vector loop: each pass loads four ints into q9 and adds them lane-wise into q8.
    .L13:
            add     r4, r4, #1
            vld1.64 {d18-d19}, [r2:64]!     @ post-incremented load with a 64-bit alignment hint
            cmp     r5, r4
            vadd.i32 q8, q8, q9
            bhi     .L13                    @ loop while r5 > r4 (unsigned)
            vadd.i32 d16, d16, d17          @ fold the two halves of q8
            vmov.i32 q9, #0  @ v4si
            vpadd.i32 d18, d16, d16         @ pairwise add: lane 0 = sum of all four lanes
            vmov.32 r2, d18[0]
            cmp     r6, r7
            add     ip, ip, r7              @ ip = index of the first element not yet summed
            add     r3, r3, r2              @ merge the vector sum into the scalar sum
            beq     .L2                     @ vector loop covered everything that remained

    @ Scalar tail: adds c[ip], then c[ip+1] and c[ip+2] while the index is below r1.
    .L7:
            ldr     r4, [r0, ip, asl #2]
            add     r2, ip, #1
            cmp     r2, r1
            add     r3, r3, r4
            bge     .L2
            ldr     r2, [r0, r2, asl #2]
            add     ip, ip, #2
            cmp     r1, ip
            add     r3, r3, r2
            ldrgt   r2, [r0, ip, asl #2]
            addgt   r3, r3, r2

    .L2:
            mov     r0, r3                  @ return value
            ldmfd   sp!, {r4, r5, r6, r7}
            bx      lr

    @ r1 > 6: with r2 == 0 the prologue is skipped (sum and index start at 0);
    @ otherwise the scalar prologue at .L3 runs first.
    .L28:
            cmp     r2, #0
            moveq   r3, r2
            moveq   ip, r2
            bne     .L3
            rsb     r6, r2, r1
            mov     r5, r6, lsr #2
            movs    r7, r5, asl #2
            bne     .L29
            b       .L7

    .L14:
            mov     r3, #0
            b       .L2
            .size   accumulate, .-accumulate

    The only difference is that 'len & ~3' compiled into an instruction 'bic  r1, r1, #3', nothing else.

Children
No data