I have a C function that copies 8 x 32-bit words from src to dest specified by pointers:
/*
 * Copy eight consecutive 32-bit words from p_src to p_dest.
 *
 * All eight words are read into locals before any word is written, which
 * gives the compiler the opportunity to emit load-multiple / store-multiple
 * instructions. Block comments are used throughout so that the code keeps
 * compiling even if the text is re-flowed onto fewer lines.
 */
static inline void PktProcWrite8( uint32_t* p_src,    /* Source address of data */
                                  uint32_t* p_dest )  /* Destination address    */
{
    /* Use 'register' hint to encourage C compiler to use STM instruction.
     * The keyword is left out when this is compiled as C++. */
#ifndef __cplusplus
    register
#endif
    uint32_t r0, r1, r2, r3, r4, r5, r6, r7;

    {
        /* Read phase: fetch all eight source words. */
        r0 = p_src[0];
        r1 = p_src[1];
        r2 = p_src[2];
        r3 = p_src[3];
        r4 = p_src[4];
        r5 = p_src[5];
        r6 = p_src[6];
        r7 = p_src[7];

        /* Write phase: store all eight words to the destination. */
        p_dest[0] = r0;
        p_dest[1] = r1;
        p_dest[2] = r2;
        p_dest[3] = r3;
        p_dest[4] = r4;
        p_dest[5] = r5;
        p_dest[6] = r6;
        p_dest[7] = r7;
    }
}
The generated assembler is:
PktProcWrite8_asm: .fnstart .cfi_sections .debug_frame .cfi_startproc @ %bb.0: .save {r4, r5, r6, lr} push {r4, r5, r6, lr} .cfi_def_cfa_offset 16 .cfi_offset lr, -4 .cfi_offset r6, -8 .cfi_offset r5, -12 .cfi_offset r4, -16 ldm.w r0, {r2, r3, r12, lr} add.w r6, r0, #16 ldm r6, {r4, r5, r6} ldr r0, [r0, #28] stm.w r1, {r2, r3, r12, lr} add.w r2, r1, #16 stm r2!, {r4, r5, r6} str r0, [r1, #28] pop {r4, r5, r6, pc} .Lfunc_end0:
It is important for us to maximize the use of burst writes. The above assembler does a burst write of 4 words, followed by a burst of 3 words, followed by a single word.
Is there any reason why we could not modify the assembler to use a single burst of 8 words or, less efficiently, two bursts of 4 words?
The target is Cortex-M4 and we are using armclang.
I don't know.
Also interesting that it's not using the auto-increment capability of ldm/stm (except once.)
Cortex-M4 has burst writes? (or is that dependent on the bus(es) implemented outside of the core?)
Your function is "inline", but the code you show is a complete callable function. Isn't the code generation in actual use, when it's actually inlined, liable to change based on nearby code?
It sort-of looks to me like whatever you may gain by using ldm/stm, you may lose by needing to save the registers you use.
gcc does "worse" (?), generating 4 ldrd and strd instructions. uint32_t r0, r1, r2, r3, r4, r5, r6, r7; // Use 'register' hint to encourage C compiler to use STM instruction {
uint32_t r0, r1, r2, r3, r4, r5, r6, r7; // Use 'register' hint to encourage C compiler to use STM instruction
{
0: b4f0 push {r4, r5, r6, r7} 2: e9d0 c701 ldrd ip, r7, [r0, #4] 6: e9d0 6503 ldrd r6, r5, [r0, #12] a: e9d0 4205 ldrd r4, r2, [r0, #20] r0 = p_src[0]; e: 69c3 ldr r3, [r0, #28] r4 = p_src[4]; r5 = p_src[5]; r6 = p_src[6]; r7 = p_src[7]; p_dest[0] = r0; 10: 6800 ldr r0, [r0, #0] 12: e9c1 7602 strd r7, r6, [r1, #8] 16: e9c1 5404 strd r5, r4, [r1, #16] 1a: e9c1 0c00 strd r0, ip, [r1] p_dest[4] = r4; p_dest[5] = r5; p_dest[6] = r6; p_dest[7] = r7; 1e: bcf0 pop {r4, r5, r6, r7} p_dest[0] = r0; 20: e9c1 2306 strd r2, r3, [r1, #24] 24: 4770 bx lr }
0: b4f0 push {r4, r5, r6, r7}
2: e9d0 c701 ldrd ip, r7, [r0, #4]
6: e9d0 6503 ldrd r6, r5, [r0, #12]
a: e9d0 4205 ldrd r4, r2, [r0, #20]
r0 = p_src[0];
e: 69c3 ldr r3, [r0, #28]
r4 = p_src[4];
r5 = p_src[5];
r6 = p_src[6];
r7 = p_src[7];
p_dest[0] = r0;
10: 6800 ldr r0, [r0, #0]
12: e9c1 7602 strd r7, r6, [r1, #8]
16: e9c1 5404 strd r5, r4, [r1, #16]
1a: e9c1 0c00 strd r0, ip, [r1]
p_dest[4] = r4;
p_dest[5] = r5;
p_dest[6] = r6;
p_dest[7] = r7;
1e: bcf0 pop {r4, r5, r6, r7}
20: e9c1 2306 strd r2, r3, [r1, #24]
24: 4770 bx lr
}
Hi David,
I redacted my original reply - my code had a silly error in it.
However the root cause is the same, that the AAPCS specifies that registers r0-r3, r12, (and lr) are the only ones corruptible at the function boundary, and so with r0 and r1 used for your pointers, you are limited to r2, r3, r12, (and lr, from the stack).
The following is smaller, but I am not sure that it is any faster, due to the extra memory accesses in push/pop.
0x00000000: b5f0 .. PUSH {r4-r7,lr} 0x00000002: e89050fc ...P LDM r0,{r2-r7,r12,lr} 0x00000006: e88150fc ...P STM r1,{r2-r7,r12,lr} 0x0000000a: e8bd40f0 ...@ POP {r4-r7,lr}
Ronan
Hi Bill and Ronan
Thanks for your replies.
Ronan, your suggestion is really helpful and I am benchmarking it now.
WestfW said: Your function is "inline", but the code you show is a complete callable function.
Yes, the code is not being inlined, which is bad. Could either of you please help me convert it to an inline function? I guess it would be something like:
// Copy the command to the block (all 8 words 0..7 are used) // PktProcWrite8_asm( p_src, p_dest ); __asm("PktProcWrite8_asm:\n" " push {r4-r7,lr}\n" " ldm r0,{r2-r7,r12,lr}\n" " stm r1,{r2-r7,r12,lr}\n" " pop {r4-r7,lr}" );
I forgot about the inline component... might be best to stay away from using the lr. In the below I've used r8 instead.
#include <stdint.h> /* standard uint32_t; a "#define uint32_t unsigned int" would shadow the real typedef */

/*
 * Copy eight consecutive 32-bit words from p_src to p_dest, using a single
 * 8-register LDM followed by a single 8-register STM when built for ARM.
 *
 * The pointers are passed to the assembler as operands, so the code does not
 * depend on them happening to live in r0/r1 once the function is inlined.
 * Every scratch register is named in the clobber list: the compiler then
 * saves/restores the callee-saved ones as needed and knows that r2, r3 and
 * r12 do not survive the copy. A hand-written push/pop that leaves r2, r3
 * and r12 undeclared silently destroys whatever the caller kept in them.
 */
static inline void PktProcWrite8( uint32_t* p_src, uint32_t* p_dest )
{
#if defined(__GNUC__) && (defined(__arm__) || defined(__thumb__))
    __asm volatile (
        " ldm %[src], {r2-r8, r12}\n"
        " stm %[dst], {r2-r8, r12}"
        : /* no outputs */
        : [src] "r" (p_src),
          [dst] "r" (p_dest)
        /* "memory": the asm reads and writes memory that is not described
         * by any operand, so the compiler must not cache values across it. */
        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r12", "memory"
    );
    /* NOTE(review): r7 may be reserved as the Thumb frame pointer in builds
     * that keep a frame pointer; if the compiler rejects the clobber, pick a
     * different scratch register - confirm against the project build flags. */
#else
    /* Portable fallback so the snippet also builds for non-ARM hosts. */
    for (int i = 0; i < 8; i++) {
        p_dest[i] = p_src[i];
    }
#endif
}

/* Minimal out-of-line caller used to inspect the code generated for the inlined copy. */
void foo(uint32_t* p_src, uint32_t* p_dest)
{
    PktProcWrite8(p_src, p_dest);
    return;
}
I'm not doing a great job answering this...
I wrote
> I forgot about the inline component... might be best to stay away from using the lr. In the below I've used r8 instead.
However... while it works in the above trivial case, it does not work in general. Whatever values were in r2 and r3 before the copy function is called will be lost, which likely means that the overall foo() function will break.
Use the above with extreme caution.
But the disassembly already shows the function as pushing r4-r7...
In gcc (which has a complex __asm__ syntax), you might be able to use:
/*
 * Copy eight 32-bit words from p_src to p_dest as two 4-word bursts
 * (LDM/STM with base-register write-back) when built for ARM with a
 * GNU-compatible compiler.
 *
 * The pointers are "+r" operands because the write-back forms modify the
 * base registers; only the function's own copies of the pointers change.
 * r3-r6 are listed as clobbers so the compiler saves/restores them when it
 * has to. "memory" is listed because the asm reads and writes memory that is
 * not described by any operand; without it the compiler may keep stale
 * copies of the affected data in registers or move accesses across the asm.
 */
static inline void cpy2(volatile uint32_t* p_src,   /* Source address of data */
                        volatile uint32_t* p_dest)  /* Destination address    */
{
#if defined(__GNUC__) && (defined(__arm__) || defined(__thumb__))
    __asm__ __volatile__ (
        " ldm %[inp]!, {r3, r4, r5, r6} \n\t"
        " stm %[outp]!, {r3, r4, r5, r6} \n\t"
        " ldm %[inp]!, {r3, r4, r5, r6} \n\t"
        " stm %[outp]!, {r3, r4, r5, r6} \n\t"
        : /* outputs */
          [inp] "+r" (p_src),
          [outp] "+r" (p_dest)
        : /* inputs */
        : "r3", "r4", "r5", "r6", "memory" /* clobbers */
    );
#else
    /* Portable fallback so the snippet also builds for non-ARM hosts. */
    for (int i = 0; i < 8; i++) {
        p_dest[i] = p_src[i];
    }
#endif
}
static inline void cpy2(volatile uint32_t* p_src, // Source address of data
volatile uint32_t* p_dest) {
__asm__ __volatile__ ( " ldm %[inp]!, {r3, r4, r5, r6} \n\t"
" stm %[outp]!, {r3, r4, r5, r6} \n\t"
" ldm %[inp]!, {r3, r4, r5, r6} \n\t"
: /* outputs */
[inp] "+r" (p_src),
[outp] "+r" (p_dest)
: /* inputs */
: "r3", "r4", "r5", "r6" /* clobbers */
);
Putting the 4 registers in the "clobber" list tells the compiler that those registers are going to be used, and it will save/restore them as necessary (and hopefully NOT when not necessary.) For example, with source code:
cpy2(s, d); cpy2(s+1000, d+1000); cpy2(s+2000, d+2000);
cpy2(s, d);
cpy2(s+1000, d+1000);
cpy2(s+2000, d+2000);
It produces object code:
void bar() { 0: b470 push {r4, r5, r6} cpy2(s, d); 2: 2264 movs r2, #100 ; 0x64 4: 21c8 movs r1, #200 ; 0xc8 6: ca78 ldmia r2!, {r3, r4, r5, r6} 8: c178 stmia r1!, {r3, r4, r5, r6} a: ca78 ldmia r2!, {r3, r4, r5, r6} c: c178 stmia r1!, {r3, r4, r5, r6} cpy2(s+1000, d+1000); e: f241 0204 movw r2, #4100 ; 0x1004 12: f241 0168 movw r1, #4200 ; 0x1068 16: ca78 ldmia r2!, {r3, r4, r5, r6} 18: c178 stmia r1!, {r3, r4, r5, r6} 1a: ca78 ldmia r2!, {r3, r4, r5, r6} 1c: c178 stmia r1!, {r3, r4, r5, r6} cpy2(s+2000, d+2000); 1e: f641 72a4 movw r2, #8100 ; 0x1fa4 22: f242 0108 movw r1, #8200 ; 0x2008 26: ca78 ldmia r2!, {r3, r4, r5, r6} 28: c178 stmia r1!, {r3, r4, r5, r6} 2a: ca78 ldmia r2!, {r3, r4, r5, r6} 2c: c178 stmia r1!, {r3, r4, r5, r6} 2e: bc70 pop {r4, r5, r6} 30: 4770 bx lr
void bar() {
0: b470 push {r4, r5, r6}
2: 2264 movs r2, #100 ; 0x64
4: 21c8 movs r1, #200 ; 0xc8
6: ca78 ldmia r2!, {r3, r4, r5, r6}
8: c178 stmia r1!, {r3, r4, r5, r6}
a: ca78 ldmia r2!, {r3, r4, r5, r6}
c: c178 stmia r1!, {r3, r4, r5, r6}
e: f241 0204 movw r2, #4100 ; 0x1004
12: f241 0168 movw r1, #4200 ; 0x1068
16: ca78 ldmia r2!, {r3, r4, r5, r6}
18: c178 stmia r1!, {r3, r4, r5, r6}
1a: ca78 ldmia r2!, {r3, r4, r5, r6}
1c: c178 stmia r1!, {r3, r4, r5, r6}
1e: f641 72a4 movw r2, #8100 ; 0x1fa4
22: f242 0108 movw r1, #8200 ; 0x2008
26: ca78 ldmia r2!, {r3, r4, r5, r6}
28: c178 stmia r1!, {r3, r4, r5, r6}
2a: ca78 ldmia r2!, {r3, r4, r5, r6}
2c: c178 stmia r1!, {r3, r4, r5, r6}
2e: bc70 pop {r4, r5, r6}
30: 4770 bx lr
Thanks very much for your further replies. The code is working well for me and offers the speed improvement we needed.
Best regards
David
Assembler does no optimization. It takes your code as is and converts to machine code. Compilers on the other hand can optimize your code; the resulting assembly is already optimized.