I have a C function that copies eight 32-bit words from a source to a destination, each specified by a pointer:
/**
 * Copy eight consecutive 32-bit words from p_src to p_dest.
 *
 * In source order, all eight words are read into locals before any word is
 * written. The locals are declared with the 'register' storage class as a
 * hint intended to encourage the compiler to emit multi-word load/store
 * instructions (LDM/STM); it is only a hint and does not guarantee them.
 *
 * @param p_src   Source address of data; words [0..7] are read, never written.
 * @param p_dest  Destination address; words [0..7] are written.
 */
static inline void PktProcWrite8( const uint32_t* p_src,   // Source address of data
                                  uint32_t*       p_dest ) // Destination address
{
    // 'register' is left out when compiled as C++ (presumably because modern
    // C++ deprecates/removes the keyword -- confirm against the build flags).
#ifndef __cplusplus
    register
#endif
    uint32_t r0, r1, r2, r3, r4, r5, r6, r7; // Use 'register' hint to encourage C compiler to use STM instruction

    // Load phase: read all eight source words first.
    r0 = p_src[0];
    r1 = p_src[1];
    r2 = p_src[2];
    r3 = p_src[3];
    r4 = p_src[4];
    r5 = p_src[5];
    r6 = p_src[6];
    r7 = p_src[7];

    // Store phase: write the eight words to the destination in ascending order.
    p_dest[0] = r0;
    p_dest[1] = r1;
    p_dest[2] = r2;
    p_dest[3] = r3;
    p_dest[4] = r4;
    p_dest[5] = r5;
    p_dest[6] = r6;
    p_dest[7] = r7;
}
The generated assembler is:
@ Compiler-generated Thumb-2 code for the 8-word copy shown above.
@ Loads go through r0 and stores through r1, matching p_src and p_dest
@ in the C function.
PktProcWrite8_asm:
        .fnstart
        .cfi_sections .debug_frame
        .cfi_startproc
@ %bb.0:
        .save {r4, r5, r6, lr}
        push {r4, r5, r6, lr}              @ save the callee-saved registers used below, plus lr
        .cfi_def_cfa_offset 16
        .cfi_offset lr, -4
        .cfi_offset r6, -8
        .cfi_offset r5, -12
        .cfi_offset r4, -16
        ldm.w r0, {r2, r3, r12, lr}        @ load words 0..3 from [r0]
        add.w r6, r0, #16                  @ r6 = r0 + 16 (address of word 4)
        ldm r6, {r4, r5, r6}               @ load words 4..6; no writeback, r6 is overwritten by the load
        ldr r0, [r0, #28]                  @ load word 7 on its own, reusing r0
        stm.w r1, {r2, r3, r12, lr}        @ store words 0..3 to [r1] (4-word store)
        add.w r2, r1, #16                  @ r2 = r1 + 16 (address of destination word 4)
        stm r2!, {r4, r5, r6}              @ store words 4..6 (3-word store, with writeback to r2)
        str r0, [r1, #28]                  @ store word 7 on its own
        pop {r4, r5, r6, pc}               @ restore registers and return by popping into pc
.Lfunc_end0:
It is important for us to maximize the use of burst writes. The above assembler does a burst write of 4 words, followed by a burst of 3 words, followed by a single word.
Is there any reason why we could not modify the assembler to use a single burst of 8 words or, less efficiently, two bursts of 4 words?
The target is Cortex-M4 and we are using armclang.
I don't know.
It is also interesting that it's not using the auto-increment (writeback) capability of ldm/stm, except once.
Cortex-M4 has burst writes? (or is that dependent on the bus(es) implemented outside of the core?)
Your function is "inline", but the code you show is a complete callable function. Isn't the code generation in actual use, when it's actually inlined, liable to change based on nearby code?
It sort-of looks to me like whatever you may gain by using ldm/stm, you may lose by needing to save the registers you use.
gcc does "worse" (?): instead of ldm/stm it generates ldrd/strd pairs (three ldrd plus two single ldr for the loads, and four strd for the stores). uint32_t r0, r1, r2, r3, r4, r5, r6, r7; // Use 'register' hint to encourage C compiler to use STM instruction {
uint32_t r0, r1, r2, r3, r4, r5, r6, r7; // Use 'register' hint to encourage C compiler to use STM instruction
{
0: b4f0 push {r4, r5, r6, r7} 2: e9d0 c701 ldrd ip, r7, [r0, #4] 6: e9d0 6503 ldrd r6, r5, [r0, #12] a: e9d0 4205 ldrd r4, r2, [r0, #20] r0 = p_src[0]; e: 69c3 ldr r3, [r0, #28] r4 = p_src[4]; r5 = p_src[5]; r6 = p_src[6]; r7 = p_src[7]; p_dest[0] = r0; 10: 6800 ldr r0, [r0, #0] 12: e9c1 7602 strd r7, r6, [r1, #8] 16: e9c1 5404 strd r5, r4, [r1, #16] 1a: e9c1 0c00 strd r0, ip, [r1] p_dest[4] = r4; p_dest[5] = r5; p_dest[6] = r6; p_dest[7] = r7; 1e: bcf0 pop {r4, r5, r6, r7} p_dest[0] = r0; 20: e9c1 2306 strd r2, r3, [r1, #24] 24: 4770 bx lr }
0: b4f0 push {r4, r5, r6, r7}
2: e9d0 c701 ldrd ip, r7, [r0, #4]
6: e9d0 6503 ldrd r6, r5, [r0, #12]
a: e9d0 4205 ldrd r4, r2, [r0, #20]
r0 = p_src[0];
e: 69c3 ldr r3, [r0, #28]
r4 = p_src[4];
r5 = p_src[5];
r6 = p_src[6];
r7 = p_src[7];
p_dest[0] = r0;
10: 6800 ldr r0, [r0, #0]
12: e9c1 7602 strd r7, r6, [r1, #8]
16: e9c1 5404 strd r5, r4, [r1, #16]
1a: e9c1 0c00 strd r0, ip, [r1]
p_dest[4] = r4;
p_dest[5] = r5;
p_dest[6] = r6;
p_dest[7] = r7;
1e: bcf0 pop {r4, r5, r6, r7}
20: e9c1 2306 strd r2, r3, [r1, #24]
24: 4770 bx lr
}