This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question, you can start a new discussion.

How to optimize an assembler copy function?

I have a C function that copies eight 32-bit words from a source buffer to a destination buffer, each specified by a pointer:

// Copy eight consecutive 32-bit words from p_src to p_dest.
// Every word is read into a temporary before the first word is written.
static inline void PktProcWrite8( uint32_t* p_src,     // Words are read from p_src[0] .. p_src[7]
                                  uint32_t* p_dest )   // Words are written to p_dest[0] .. p_dest[7]
{
    // 'register' is only a hint, meant to encourage the C compiler to emit
    // an STM instruction; it is left out when compiling as C++.
#ifndef __cplusplus
    register
#endif
    uint32_t w0 = p_src[0],
             w1 = p_src[1],
             w2 = p_src[2],
             w3 = p_src[3],
             w4 = p_src[4],
             w5 = p_src[5],
             w6 = p_src[6],
             w7 = p_src[7];

    // Write the words out in ascending address order.
    p_dest[0] = w0;
    p_dest[1] = w1;
    p_dest[2] = w2;
    p_dest[3] = w3;
    p_dest[4] = w4;
    p_dest[5] = w5;
    p_dest[6] = w6;
    p_dest[7] = w7;
}

The generated assembler is:

@ Compiler output for the 8-word copy function shown above, quoted as posted
@ (the listing stops at .Lfunc_end0).
@ NOTE(review): r0 and r1 presumably hold p_src and p_dest on entry, per the
@ ARM procedure call standard - confirm.
PktProcWrite8_asm:
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
@ %bb.0:
	@ Save r4-r6 and lr; all four are overwritten by the loads below.
	.save	{r4, r5, r6, lr}
	push	{r4, r5, r6, lr}
	.cfi_def_cfa_offset 16
	.cfi_offset lr, -4
	.cfi_offset r6, -8
	.cfi_offset r5, -12
	.cfi_offset r4, -16
	@ Load 4 words from [r0] .. [r0, #12] into r2, r3, r12, lr.
	ldm.w	r0, {r2, r3, r12, lr}
	@ r6 = r0 + 16, then load 3 words from [r0, #16] .. [r0, #24] into
	@ r4, r5, r6 (the base register r6 is itself overwritten by the load).
	add.w	r6, r0, #16
	ldm	r6, {r4, r5, r6}
	@ Load the 8th word from [r0, #28]; this overwrites the base held in r0.
	ldr	r0, [r0, #28]
	@ Store 4 words to [r1] .. [r1, #12].
	stm.w	r1, {r2, r3, r12, lr}
	@ r2 = r1 + 16, then store 3 words to [r1, #16] .. [r1, #24]; the '!'
	@ writes the advanced address back to r2, which is not read again here.
	add.w	r2, r1, #16
	stm	r2!, {r4, r5, r6}
	@ Store the 8th word to [r1, #28].
	str	r0, [r1, #28]
	@ Restore r4-r6 and return by popping the saved lr value into pc.
	pop	{r4, r5, r6, pc}
.Lfunc_end0:

It is important for us to maximize the use of burst writes. The above assembler does a burst write of 4 words, followed by a burst of 3 words, followed by a single word.

Is there any reason why we could not modify the assembler to use a single burst of 8 words or, less efficiently, two bursts of 4 words?

The target is Cortex-M4 and we are using armclang.

Parents
  • I don't know.

    Also interesting that it's not using the auto-increment capability of ldm/stm (except once.)

    Cortex-M4 has burst writes?  (or is that dependent on the bus(es) implemented outside of the core?)

    Your function is "inline", but the code you show is a complete callable function.  Isn't the code generation in actual use, when it's actually inlined, liable to change based on nearby code?

    It sort-of looks to me like whatever you may gain by using ldm/stm, you may lose by needing to save the registers you use.

    gcc does "worse" (?): it uses no ldm/stm at all, generating three ldrd plus two ldr instructions for the loads and four strd instructions for the stores.

        uint32_t r0, r1, r2, r3, r4, r5, r6, r7;   // Use 'register' hint to encourage C compiler to use STM instruction
        {

       0:    b4f0          push    {r4, r5, r6, r7}
       2:    e9d0 c701     ldrd    ip, r7, [r0, #4]
       6:    e9d0 6503     ldrd    r6, r5, [r0, #12]
       a:    e9d0 4205     ldrd    r4, r2, [r0, #20]
            r0 = p_src[0];
       e:    69c3          ldr    r3, [r0, #28]
            r4 = p_src[4];
            r5 = p_src[5];
            r6 = p_src[6];
            r7 = p_src[7];

            p_dest[0] = r0;
      10:    6800          ldr    r0, [r0, #0]
      12:    e9c1 7602     strd    r7, r6, [r1, #8]
      16:    e9c1 5404     strd    r5, r4, [r1, #16]
      1a:    e9c1 0c00     strd    r0, ip, [r1]
            p_dest[4] = r4;
            p_dest[5] = r5;
            p_dest[6] = r6;
            p_dest[7] = r7;
      1e:    bcf0          pop    {r4, r5, r6, r7}
            p_dest[0] = r0;
      20:    e9c1 2306     strd    r2, r3, [r1, #24]
      24:    4770          bx    lr
        }

Reply
  • I don't know.

    Also interesting that it's not using the auto-increment capability of ldm/stm (except once.)

    Cortex-M4 has burst writes?  (or is that dependent on the bus(es) implemented outside of the core?)

    Your function is "inline", but the code you show is a complete callable function.  Isn't the code generation in actual use, when it's actually inlined, liable to change based on nearby code?

    It sort-of looks to me like whatever you may gain by using ldm/stm, you may lose by needing to save the registers you use.

    gcc does "worse" (?): it uses no ldm/stm at all, generating three ldrd plus two ldr instructions for the loads and four strd instructions for the stores.

        uint32_t r0, r1, r2, r3, r4, r5, r6, r7;   // Use 'register' hint to encourage C compiler to use STM instruction
        {

       0:    b4f0          push    {r4, r5, r6, r7}
       2:    e9d0 c701     ldrd    ip, r7, [r0, #4]
       6:    e9d0 6503     ldrd    r6, r5, [r0, #12]
       a:    e9d0 4205     ldrd    r4, r2, [r0, #20]
            r0 = p_src[0];
       e:    69c3          ldr    r3, [r0, #28]
            r4 = p_src[4];
            r5 = p_src[5];
            r6 = p_src[6];
            r7 = p_src[7];

            p_dest[0] = r0;
      10:    6800          ldr    r0, [r0, #0]
      12:    e9c1 7602     strd    r7, r6, [r1, #8]
      16:    e9c1 5404     strd    r5, r4, [r1, #16]
      1a:    e9c1 0c00     strd    r0, ip, [r1]
            p_dest[4] = r4;
            p_dest[5] = r5;
            p_dest[6] = r6;
            p_dest[7] = r7;
      1e:    bcf0          pop    {r4, r5, r6, r7}
            p_dest[0] = r0;
      20:    e9c1 2306     strd    r2, r3, [r1, #24]
      24:    4770          bx    lr
        }

Children
No data