
__SMLALD() produces suboptimal code

Note: This was originally posted on 19th August 2013 at http://forums.arm.com

__SMLALD(), as implemented in core_cm4_simd.h (V3.20), produces suboptimal code (at least with GCC).

Example:
sample = *samp2_p++; \
re = __SMLALD(ortho_p[COS2_O], sample, re); \
im = __SMLALD(ortho_p[SIN2_O], sample, im); \
ortho_p++;
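
The snippet above is taken from a larger macro; a minimal self-contained loop that reproduces the code generation shown below could look roughly like this (the declarations, the function name and the COS2_O/SIN2_O offset values are assumptions for illustration, not taken from the original project; the CMSIS-Core header providing __SMLALD() is assumed to be included):

#include <stdint.h>

#define COS2_O  0     /* assumed offsets into the packed coefficient table */
#define SIN2_O  64

uint64_t correlate(const uint32_t *samp2_p, const uint32_t *ortho_p,
                   uint32_t count, uint64_t *im_out)
{
  uint64_t re = 0, im = 0;

  while (count--) {
    uint32_t sample = *samp2_p++;                 /* packed pair of 16-bit samples */
    re = __SMLALD(ortho_p[COS2_O], sample, re);   /* re += dual 16x16 MAC          */
    im = __SMLALD(ortho_p[SIN2_O], sample, im);   /* im += dual 16x16 MAC          */
    ortho_p++;
  }

  *im_out = im;
  return re;
}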


Produced code [GCC 4.8.1 with -O2 optimization]:
  a94: 4682       mov sl, r0
  a96: f10c 0304  add.w r3, ip, #4
  a9a: f8d4 0100  ldr.w r0, [r4, #256] ; 0x100
  a9e: f8dc c000  ldr.w ip, [ip]
  aa2: fbc0 a8cc  smlald sl, r8, r0, ip
  aa6: 2000       movs r0, #0
  aa8: 9022       str r0, [sp, #136] ; 0x88
  aaa: f8cd 808c  str.w r8, [sp, #140] ; 0x8c
  aae: e9dd 8922  ldrd r8, r9, [sp, #136] ; 0x88
  ab2: f854 0b04  ldr.w r0, [r4], #4
  ab6: ea48 080a  orr.w r8, r8, sl
  aba: fbc0 21cc  smlald r2, r1, r0, ip
  abe: 9125       str r1, [sp, #148] ; 0x94
  ac0: 2100       movs r1, #0
  ac2: 9124       str r1, [sp, #144] ; 0x90
  ac4: e9dd 0124  ldrd r0, r1, [sp, #144] ; 0x90
  ac8: ea40 0002  orr.w r0, r0, r2
  acc: 468a       mov sl, r1
  ace: 4684       mov ip, r0
  ad0: 464a       mov r2, r9

The extra register moves, stack stores/reloads and orr instructions come from splitting the 64-bit accumulator into two 32-bit halves in C and recombining them with shifts and ORs, which GCC cannot optimize away. I propose the following implementation instead:

typedef union llreg_u {
  uint32_t w32[2];
  uint64_t w64;
} llreg_t;

__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t result)
{
  llreg_t llr;
  llr.w64 = result;

  /* The union exposes the accumulator to GCC as two 32-bit halves that can
     live in an ordinary register pair, so no shift/OR recombination is needed. */
  __ASM volatile ("smlald %0, %1, %2, %3"
                  : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                  : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );

  return(llr.w64);
}


which produces the following code [GCC 4.8.1 with -O2 optimization]:
  a96: f109 0504  add.w r5, r9, #4
  a9a: f8d9 c000  ldr.w ip, [r9]
  a9e: f8d3 2100  ldr.w r2, [r3, #256] ; 0x100
  aa2: fbc2 40cc  smlald r4, r0, r2, ip
  aa6: 9a01    ldr r2, [sp, #4]
  aa8: f853 9b04  ldr.w r9, [r3], #4
  aac: fbc9 12cc  smlald r1, r2, r9, ip


Notes:
1. The proposed code assumes little endian, but adapting it for big endian is trivial: bind w32[1] to RdLo and w32[0] to RdHi instead (see the sketch after this list).
2. A similar optimization can be applied to the __SMLALD() siblings (__SMLALDX(), __SMLSLD(), __SMLSLDX()); one of them is shown in the sketch below.
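
For illustration, here is a minimal sketch covering both notes, using one sibling, __SMLALDX(), as the example. The __ARMEB__ guard is GCC's predefined macro for big-endian ARM targets and is used here as an assumption about how the endianness selection might be done; it is not taken from core_cm4_simd.h:

__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t result)
{
  llreg_t llr;            /* reuses the llreg_t union from the proposal above */
  llr.w64 = result;

#ifndef __ARMEB__
  /* Little endian: w32[0] is the low half of w64, so it is bound to RdLo. */
  __ASM volatile ("smlaldx %0, %1, %2, %3"
                  : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                  : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );
#else
  /* Big endian: the halves are swapped, so w32[1] is bound to RdLo instead. */
  __ASM volatile ("smlaldx %0, %1, %2, %3"
                  : "=r" (llr.w32[1]), "=r" (llr.w32[0])
                  : "r" (op1), "r" (op2), "0" (llr.w32[1]), "1" (llr.w32[0]) );
#endif

  return(llr.w64);
}

The same pattern, with only the mnemonic changed, covers __SMLSLD() and __SMLSLDX().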

Ilija
Reply
  • Note: This was originally posted on 22nd August 2013 at http://forums.arm.com

    Noted, will follow up and try your proposal. Thanks for reporting.

    Joey
