
__SMLALD() produces suboptimal code

Note: This was originally posted on 19th August 2013 at http://forums.arm.com

__SMLALD(), as implemented in core_cm4_simd.h (V3.20), produces suboptimal code (at least with GCC).

Example:
sample = *samp2_p++; \
re = __SMLALD(ortho_p[COS2_O], sample, re); \
im = __SMLALD(ortho_p[SIN2_O], sample, im); \
ortho_p++;
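
The snippet above is taken from a larger macro; a minimal self-contained loop that reproduces the code generation shown below could look roughly like this (the declarations, the function name and the COS2_O/SIN2_O offset values are assumptions for illustration, not taken from the original project; the CMSIS-Core header providing __SMLALD() is assumed to be included):

#include <stdint.h>

#define COS2_O  0     /* assumed offsets into the packed coefficient table */
#define SIN2_O  64

uint64_t correlate(const uint32_t *samp2_p, const uint32_t *ortho_p,
                   uint32_t count, uint64_t *im_out)
{
  uint64_t re = 0, im = 0;

  while (count--) {
    uint32_t sample = *samp2_p++;                 /* packed pair of 16-bit samples */
    re = __SMLALD(ortho_p[COS2_O], sample, re);   /* re += dual 16x16 MAC          */
    im = __SMLALD(ortho_p[SIN2_O], sample, im);   /* im += dual 16x16 MAC          */
    ortho_p++;
  }

  *im_out = im;
  return re;
}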


Produced code [GCC 4.8.1 with -O2 optimization]:
  a94: 4682       mov sl, r0
  a96: f10c 0304  add.w r3, ip, #4
  a9a: f8d4 0100  ldr.w r0, [r4, #256] ; 0x100
  a9e: f8dc c000  ldr.w ip, [ip]
  aa2: fbc0 a8cc  smlald sl, r8, r0, ip
  aa6: 2000       movs r0, #0
  aa8: 9022       str r0, [sp, #136] ; 0x88
  aaa: f8cd 808c  str.w r8, [sp, #140] ; 0x8c
  aae: e9dd 8922  ldrd r8, r9, [sp, #136] ; 0x88
  ab2: f854 0b04  ldr.w r0, [r4], #4
  ab6: ea48 080a  orr.w r8, r8, sl
  aba: fbc0 21cc  smlald r2, r1, r0, ip
  abe: 9125       str r1, [sp, #148] ; 0x94
  ac0: 2100       movs r1, #0
  ac2: 9124       str r1, [sp, #144] ; 0x90
  ac4: e9dd 0124  ldrd r0, r1, [sp, #144] ; 0x90
  ac8: ea40 0002  orr.w r0, r0, r2
  acc: 468a       mov sl, r1
  ace: 4684       mov ip, r0
  ad0: 464a       mov r2, r9

The extra register moves, stack stores/reloads and orr instructions come from splitting the 64-bit accumulator into two 32-bit halves in C and recombining them with shifts and ORs, which GCC cannot optimize away. I propose the following implementation instead:

typedef union llreg_u {
  uint32_t w32[2];
  uint64_t w64;
} llreg_t;

__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t result)
{
  llreg_t llr;
  llr.w64 = result;

  /* The union exposes the accumulator to GCC as two 32-bit halves that can
     live in an ordinary register pair, so no shift/OR recombination is needed. */
  __ASM volatile ("smlald %0, %1, %2, %3"
                  : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                  : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );

  return(llr.w64);
}


which produces the following code [GCC 4.8.1 with -O2 optimization]:
  a96: f109 0504  add.w r5, r9, #4
  a9a: f8d9 c000  ldr.w ip, [r9]
  a9e: f8d3 2100  ldr.w r2, [r3, #256] ; 0x100
  aa2: fbc2 40cc  smlald r4, r0, r2, ip
  aa6: 9a01    ldr r2, [sp, #4]
  aa8: f853 9b04  ldr.w r9, [r3], #4
  aac: fbc9 12cc  smlald r1, r2, r9, ip


Notes:
1. The proposed code assumes little endian, but adapting it for big endian is trivial: bind w32[1] to RdLo and w32[0] to RdHi instead (see the sketch after this list).
2. A similar optimization can be applied to the __SMLALD() siblings (__SMLALDX(), __SMLSLD(), __SMLSLDX()); one of them is shown in the sketch below.
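
For illustration, here is a minimal sketch covering both notes, using one sibling, __SMLALDX(), as the example. The __ARMEB__ guard is GCC's predefined macro for big-endian ARM targets and is used here as an assumption about how the endianness selection might be done; it is not taken from core_cm4_simd.h:

__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t result)
{
  llreg_t llr;            /* reuses the llreg_t union from the proposal above */
  llr.w64 = result;

#ifndef __ARMEB__
  /* Little endian: w32[0] is the low half of w64, so it is bound to RdLo. */
  __ASM volatile ("smlaldx %0, %1, %2, %3"
                  : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                  : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );
#else
  /* Big endian: the halves are swapped, so w32[1] is bound to RdLo instead. */
  __ASM volatile ("smlaldx %0, %1, %2, %3"
                  : "=r" (llr.w32[1]), "=r" (llr.w32[0])
                  : "r" (op1), "r" (op2), "0" (llr.w32[1]), "1" (llr.w32[0]) );
#endif

  return(llr.w64);
}

The same pattern, with only the mnemonic changed, covers __SMLSLD() and __SMLSLDX().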

Ilija
Reply
  • Note: This was originally posted on 22nd August 2013 at http://forums.arm.com

    Noted, will follow up and try your proposal. Thanks for reporting.

    Joey
