__SMLALD() as implemented in core_cm4_simd.h (V3.20) produces suboptimal code (at least with GCC).

Example:

sample = *samp2_p++;                            \
re = __SMLALD(ortho_p[COS2_O], sample, re);     \
im = __SMLALD(ortho_p[SIN2_O], sample, im);     \
ortho_p++;
Produced code [GCC 4.8.1 with -O2 optimization]:

a94: 4682      mov     sl, r0
a96: f10c 0304 add.w   r3, ip, #4
a9a: f8d4 0100 ldr.w   r0, [r4, #256]  ; 0x100
a9e: f8dc c000 ldr.w   ip, [ip]
aa2: fbc0 a8cc smlald  sl, r8, r0, ip
aa6: 2000      movs    r0, #0
aa8: 9022      str     r0, [sp, #136]  ; 0x88
aaa: f8cd 808c str.w   r8, [sp, #140]  ; 0x8c
aae: e9dd 8922 ldrd    r8, r9, [sp, #136]      ; 0x88
ab2: f854 0b04 ldr.w   r0, [r4], #4
ab6: ea48 080a orr.w   r8, r8, sl
aba: fbc0 21cc smlald  r2, r1, r0, ip
abe: 9125      str     r1, [sp, #148]  ; 0x94
ac0: 2100      movs    r1, #0
ac2: 9124      str     r1, [sp, #144]  ; 0x90
ac4: e9dd 0124 ldrd    r0, r1, [sp, #144]      ; 0x90
ac8: ea40 0002 orr.w   r0, r0, r2
acc: 468a      mov     sl, r1
ace: 4684      mov     ip, r0
ad0: 464a      mov     r2, r9

Note how the 64-bit accumulator is bounced through the stack (the str/str.w/ldrd pairs at the sp offsets) and reassembled with extra movs/orr.w instructions around each smlald.
I propose the following implementation:

typedef union llreg_u
{
    uint32_t w32[2];
    uint64_t w64;
} llreg_t;

__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t result)
{
    llreg_t llr;
    llr.w64 = result;

    __ASM volatile ("smlald %0, %1, %2, %3"
                    : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                    : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );

    return(llr.w64);
}
which produces the following code [GCC 4.8.1 with -O2 optimization]:

a96: f109 0504 add.w   r5, r9, #4
a9a: f8d9 c000 ldr.w   ip, [r9]
a9e: f8d3 2100 ldr.w   r2, [r3, #256]  ; 0x100
aa2: fbc2 40cc smlald  r4, r0, r2, ip
aa6: 9a01      ldr     r2, [sp, #4]
aa8: f853 9b04 ldr.w   r9, [r3], #4
aac: fbc9 12cc smlald  r1, r2, r9, ip
Notes:

1. The proposed code assumes little endian, but the preparation for big endian is trivial (a sketch follows below).
2. The same optimization can be applied to the __SMLALD() siblings (see the second sketch below).

Ilija
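To make note 1 concrete, here is a minimal sketch of the big-endian preparation, reusing the llreg_t union from the proposal above. The __ARMEB__ guard and the swapped word indices are my assumption of what the "trivial" change would look like, not part of the original proposal:

/* Sketch only: assumes __ARMEB__ marks a big-endian build (GCC convention).
   On big endian the union places the high half of w64 in w32[0] and the low
   half in w32[1], so the registers handed to smlald as RdLo/RdHi are swapped. */
__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t result)
{
    llreg_t llr;
    llr.w64 = result;

#ifndef __ARMEB__   /* little endian: w32[0] = RdLo, w32[1] = RdHi */
    __ASM volatile ("smlald %0, %1, %2, %3"
                    : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                    : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );
#else               /* big endian: word order inside the union is reversed */
    __ASM volatile ("smlald %0, %1, %2, %3"
                    : "=r" (llr.w32[1]), "=r" (llr.w32[0])
                    : "r" (op1), "r" (op2), "0" (llr.w32[1]), "1" (llr.w32[0]) );
#endif

    return(llr.w64);
}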
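And to illustrate note 2, the same pattern applied to one of the siblings, with __SMLALDX shown as an example (little endian only, my sketch rather than a tested patch); __SMLSLD and __SMLSLDX would differ only in the mnemonic. The key point is unchanged: the "0"/"1" matching constraints tie the accumulator halves to the output register pair, so the compiler can keep the 64-bit value in registers instead of spilling it.

/* Sketch: __SMLALDX with the same union-based pattern (little endian only). */
__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t result)
{
    llreg_t llr;
    llr.w64 = result;

    __ASM volatile ("smlaldx %0, %1, %2, %3"
                    : "=r" (llr.w32[0]), "=r" (llr.w32[1])
                    : "r" (op1), "r" (op2), "0" (llr.w32[0]), "1" (llr.w32[1]) );

    return(llr.w64);
}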