This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

gcc with neon

Hi

I added the following piece of code to a multimedia project.

It reverses the data in an array.

It works without optimization, but when I add -O2 or -O3 the result is wrong, and the vmov generated with -O2 or -O3 is an illegal instruction.

I don't understand...

Is this a GCC bug?

/*
 * Reverses eight 32-bit words: loads 32 bytes at operand 0 into d0-d3,
 * swaps d0 with d3 and d1 with d2, reverses the two words inside each
 * doubleword with VREV64.32, then stores d0-d3 through operand 1.
 * NOTE(review): vldm %0! and vstm %1! write back to their base registers,
 * yet both operands are declared as inputs below and no memory clobber is
 * listed. This is a likely cause of the failures seen with -O2 and -O3;
 * confirm against the GCC extended asm documentation.
 */
asm volatile

  (

  "pld [%0, #0xFFF];\n\t\

  vldm %0!,{d0-d3};\n\t\

  vswp.32 d0,d3;\n\t\

  vswp.32 d1,d2;\n\t\

  VREV64.32 q0, q0;\n\t\

  VREV64.32 q1, q1;\n\t\

  vstm %1!,{d0-d3};"

  :   /* outputs: none declared */

  :"r"(data),"r"(coeff),"r"(tmp),"r"(sum),"r"(ptr)   /* inputs: operands 2-4 are not referenced by this template */

  :"d0", "d1","d2","d3","d4","d5","d6","d7"   /* clobbers: d4-d7 are listed but not used by this template */

  );

/*
 * Integer multiply-accumulate over eight 32-bit words: loads d0-d3 from
 * operand 0 and d4-d7 from operand 1, multiplies them lane by lane with
 * VMUL.I32, folds the products into d0 with VADD.I32, copies the two
 * 32-bit lanes of d0 into operands 2 and 3 with VMOV, adds those two
 * values into operand 3 and stores that word at the address in operand 4.
 * NOTE(review): operands 0 and 1 are written back (%0!, %1!) and operands
 * 2 and 3 are overwritten, but all five are declared as inputs and there
 * is no memory clobber. The compiler may therefore give operands 2 and 3
 * the same register; confirm against the GCC extended asm documentation.
 */
asm volatile

  (

  "pld [%0, #0xFFF];\n\t\

  vldm %0!,{d0-d3};\n\t\

  vldm %1!,{d4-d7};\n\t\

  VMUL.I32 d0,d0,d4;\n\t\

  VMUL.I32 d1,d1,d5;\n\t\

  VMUL.I32 d2,d2,d6;\n\t\

  VMUL.I32 d3,d3,d7;\n\t\

  VADD.I32 d0,d0,d1;\n\t\

  VADD.I32 d2,d2,d3;\n\t\

  VADD.I32 d0,d0,d2;\n\t\

  VMOV %2,%3,d0;\n\t\

  add %3,%3,%2;\n\t\

  str %3,[%4]"

  :   /* outputs: none declared */

  :"r"(data),"r"(coeff),"r"(tmp),"r"(sum),"r"(ptr)   /* inputs: operands 0-3 are modified by the template */

  :"d0", "d1","d2","d3","d4","d5","d6","d7"   /* clobbers: NEON registers only */

  );

Parents
  • Hi Ching Hsiung Yu,

    It seems to depend on the compiler version. GCC 4.3.1 could compile the code successfully. However, GCC 4.9.3 could not recognize the NEON instructions in inline assembly, although the corresponding assembler shipped with GCC 4.9.3 does recognize them. So you had better write the code in pure assembly rather than inline assembly, as jebsbauer says.

    Best regards,

    Yasuhiko Koumoto.

Reply
  • Hi Ching Hsiung Yu,

    It seems to depend on the compiler version. GCC 4.3.1 could compile the code successfully. However, GCC 4.9.3 could not recognize the NEON instructions in inline assembly, although the corresponding assembler shipped with GCC 4.9.3 does recognize them. So you had better write the code in pure assembly rather than inline assembly, as jebsbauer says.

    Best regards,

    Yasuhiko Koumoto.

Children
  • It seems strange that GCC cannot use NEON...

  • Of course, I am only talking about the inline assembler. I compiled your source code to check it.

    Here is the procedure.

    $ cat temp.c
    /*
     * Reproduction case built from the code in the original post: two NEON
     * inline-asm statements compiled as-is.
     * NOTE(review): data, coeff and ptr are never assigned before being
     * passed to the asm statements, so the addresses used are indeterminate.
     */
    main()
    {
    float *data;
    float *coeff;
    float tmp,sum;
    float *ptr;
    /*
     * Statement 1: load 32 bytes at operand 0 into d0-d3, swap d0/d3 and
     * d1/d2, reverse the two 32-bit words in each doubleword, and store
     * d0-d3 through operand 1, i.e. eight 32-bit words in reverse order.
     * NOTE(review): vldm %0! and vstm %1! write back to their base
     * registers, but every operand is declared as an input, nothing is
     * declared as an output, and there is no memory clobber.
     */
    asm volatile
      (
      "pld [%0, #0xFFF];\n\t\
      vldm %0!,{d0-d3};\n\t\
      vswp.32 d0,d3;\n\t\
      vswp.32 d1,d2;\n\t\
      VREV64.32 q0, q0;\n\t\
      VREV64.32 q1, q1;\n\t\
      vstm %1!,{d0-d3};"
      :   /* outputs: none declared */
      :"r"(data),"r"(coeff),"r"(tmp),"r"(sum),"r"(ptr)   /* inputs: operands 2-4 are not referenced here */
      :"d0", "d1","d2","d3","d4","d5","d6","d7"   /* clobbers: d4-d7 are listed but not used here */
      );
    /*
     * Statement 2: load d0-d3 from operand 0 and d4-d7 from operand 1,
     * multiply lane by lane (VMUL.I32), fold the products into d0
     * (VADD.I32), copy the two 32-bit lanes of d0 into operands 2 and 3
     * (VMOV), add them into operand 3 and store that word at the address
     * held in operand 4.
     * NOTE(review): operands 2 and 3 are overwritten although declared as
     * inputs. The -O3 disassembly shown after this listing has both in ip
     * (vmov ip, ip, d0), so the two VMOV destinations coincide.
     * NOTE(review): the arithmetic is integer (.I32) while data, coeff,
     * tmp and sum are declared as float types - confirm intent.
     */
    asm volatile
      (
      "pld [%0, #0xFFF];\n\t\
      vldm %0!,{d0-d3};\n\t\
      vldm %1!,{d4-d7};\n\t\
      VMUL.I32 d0,d0,d4;\n\t\
      VMUL.I32 d1,d1,d5;\n\t\
      VMUL.I32 d2,d2,d6;\n\t\
      VMUL.I32 d3,d3,d7;\n\t\
      VADD.I32 d0,d0,d1;\n\t\
      VADD.I32 d2,d2,d3;\n\t\
      VADD.I32 d0,d0,d2;\n\t\
      VMOV %2,%3,d0;\n\t\
      add %3,%3,%2;\n\t\
      str %3,[%4]"
      :   /* outputs: none declared */
      :"r"(data),"r"(coeff),"r"(tmp),"r"(sum),"r"(ptr)   /* inputs: operands 0-3 are modified by the template */
      :"d0", "d1","d2","d3","d4","d5","d6","d7"   /* clobbers: NEON registers only */
      );
    
    }
    $ arm-elf-gcc -o temp.out -O3 -mfpu=neon temp.c -nostdlib
    /cygdrive/d/arm-tools/bin/../lib/gcc/arm-elf/4.3.1/../../../../arm-elf/bin/ld: warning: cannot find entry symbol _start; defaulting to 00008000
    $ arm-elf-objdump.exe -D temp.out
    temp.out:    file format elf32-littlearm
    Disassembly of section .text:
    00008000 <main>:
        8000:      e52d4004        push    {r4}            ; (str r4, [sp, #-4]!)
        8004:      e3a0c000        mov    ip, #0  ; 0x0
        8008:      e3a04000        mov    r4, #0  ; 0x0
        800c:      f5d4ffff        pld    [r4, #4095]
        8010:      ecb40b08        vldmia  r4!, {d0-d3}
        8014:      f3b20003        vswp    d0, d3
        8018:      f3b21002        vswp    d1, d2
        801c:      f3b80040        vrev64.32      q0, q0
        8020:      f3b82042        vrev64.32      q1, q1
        8024:      eca40b08        vstmia  r4!, {d0-d3}
        8028:      f5d4ffff        pld    [r4, #4095]
        802c:      ecb40b08        vldmia  r4!, {d0-d3}
        8030:      ecb44b08        vldmia  r4!, {d4-d7}
        8034:      f2200914        vmul.i32        d0, d0, d4
        8038:      f2211915        vmul.i32        d1, d1, d5
        803c:      f2222916        vmul.i32        d2, d2, d6
        8040:      f2233917        vmul.i32        d3, d3, d7
        8044:      f2200801        vadd.i32        d0, d0, d1
        8048:      f2222803        vadd.i32        d2, d2, d3
        804c:      f2200802        vadd.i32        d0, d0, d2
        8050:      ec5ccb10        vmov    ip, ip, d0
        8054:      e08cc00c        add    ip, ip, ip
        8058:      e584c000        str    ip, [r4]
        805c:      e8bd0010        pop    {r4}
        8060:      e12fff1e        bx      lr
    Disassembly of section .comment:
    00000000 <.comment>:
      0:  43434700        movtmi  r4, #14080      ; 0x3700
      4:  4728203a        undefined
      8:  2029554e        eorcs  r5, r9, lr, asr #10
      c:  2e332e34        mrccs  14, 1, r2, cr3, cr4, {1}
      10:  Address 0x00000010 is out of bounds.
    Disassembly of section .ARM.attributes:
    00000000 <_stack-0x80000>:
      0:  00000f41        andeq  r0, r0, r1, asr #30
      4:  61656100        cmnvs  r5, r0, lsl #2
      8:  01006962        tsteq  r0, r2, ror #18
      c:  00000005        andeq  r0, r0, r5
    $ arm-elf-gcc -v
    Using built-in specs.
    Target: arm-elf
    Configured with: ../gcc-4.3.1/configure --target=arm-elf --with-gmp=/usr/local/gmp-4.2.2 --with-mpfr=/usr/local/mpfr-2.3.1 --prefix=/usr/local/arm-tools --enable-languages=c --disable-libssp
    Thread model: single
    gcc version 4.3.1 (GCC)
    

    Does anything in this output look strange?

    Best regards,

    Yasuhiko Koumoto.