How to optimize an assembler copy function?

I have a C function that copies 8 x 32-bit words from a source to a destination, both specified by pointers:

static inline void PktProcWrite8( uint32_t* p_src,     // Source address of data
                                  uint32_t* p_dest )   // Destination address 
{
#ifndef __cplusplus    
    register 
#endif    
    uint32_t r0, r1, r2, r3, r4, r5, r6, r7;   // Use 'register' hint to encourage C compiler to use STM instruction
    {
        r0 = p_src[0];
        r1 = p_src[1];
        r2 = p_src[2];
        r3 = p_src[3];
        r4 = p_src[4];
        r5 = p_src[5];
        r6 = p_src[6];
        r7 = p_src[7];

        p_dest[0] = r0;
        p_dest[1] = r1;
        p_dest[2] = r2;
        p_dest[3] = r3;
        p_dest[4] = r4;
        p_dest[5] = r5;
        p_dest[6] = r6;
        p_dest[7] = r7;
    }
}

The generated assembler is:

PktProcWrite8_asm:
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
@ %bb.0:
	.save	{r4, r5, r6, lr}
	push	{r4, r5, r6, lr}
	.cfi_def_cfa_offset 16
	.cfi_offset lr, -4
	.cfi_offset r6, -8
	.cfi_offset r5, -12
	.cfi_offset r4, -16
	ldm.w	r0, {r2, r3, r12, lr}
	add.w	r6, r0, #16
	ldm	r6, {r4, r5, r6}
	ldr	r0, [r0, #28]
	stm.w	r1, {r2, r3, r12, lr}
	add.w	r2, r1, #16
	stm	r2!, {r4, r5, r6}
	str	r0, [r1, #28]
	pop	{r4, r5, r6, pc}
.Lfunc_end0:

It is important for us to maximize the use of burst writes. The above assembler does a burst write of 4 words, followed by a burst of 3 words, followed by a single word.

Is there any reason why we could not modify the assembler to use a single burst of 8 words or, less efficiently, two bursts of 4 words?

The target is Cortex-M4 and we are using armclang.
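
For illustration, the goal would be something like the following hand-written routine, which copies all 8 words in a single burst (a sketch only: the label and the choice of r2-r9 as scratch registers are illustrative, and r4-r9 are pushed because they are callee-saved under the AAPCS):

PktProcWrite8_single_burst:
	push	{r4-r9}        @ preserve the callee-saved scratch registers
	ldm	r0, {r2-r9}    @ load all 8 words from p_src in one burst
	stm	r1, {r2-r9}    @ store all 8 words to p_dest in one burst
	pop	{r4-r9}
	bx	lr

Only r4-r9 need saving because r2 and r3 are caller-saved; r0 and r1 still hold p_src and p_dest on return since neither the LDM nor the STM uses writeback here.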

Reply
  • In gcc (which has a complex __asm__ syntax), you might be able to use:

    static inline void cpy2(volatile uint32_t* p_src,     // Source address of data
                        volatile uint32_t* p_dest) {
      __asm__ __volatile__  ( " ldm %[inp]!, {r3, r4, r5, r6} \n\t"
                              " stm %[outp]!, {r3, r4, r5, r6} \n\t"
                              " ldm %[inp]!, {r3, r4, r5, r6} \n\t"
                              " stm %[outp]!, {r3, r4, r5, r6} \n\t"
                              : /* outputs */
                                [inp] "+r" (p_src),
                                [outp] "+r" (p_dest)
                              : /* inputs */
                              : "r3", "r4", "r5", "r6" /* clobbers */
        );
    }

    Putting the 4 registers in the "clobber" list tells the compiler that those registers are going to be used, and it will save/restore them as necessary (and, hopefully, not when it isn't necessary). For example, with this source code:

      cpy2(s, d);
      cpy2(s+1000, d+1000);
      cpy2(s+2000, d+2000);

    It produces object code:

    void bar() {
       0:    b470          push    {r4, r5, r6}
      cpy2(s, d);
       2:    2264          movs    r2, #100    ; 0x64
       4:    21c8          movs    r1, #200    ; 0xc8
       6:    ca78          ldmia    r2!, {r3, r4, r5, r6}
       8:    c178          stmia    r1!, {r3, r4, r5, r6}
       a:    ca78          ldmia    r2!, {r3, r4, r5, r6}
       c:    c178          stmia    r1!, {r3, r4, r5, r6}
      cpy2(s+1000, d+1000);
       e:    f241 0204     movw    r2, #4100    ; 0x1004
      12:    f241 0168     movw    r1, #4200    ; 0x1068
      16:    ca78          ldmia    r2!, {r3, r4, r5, r6}
      18:    c178          stmia    r1!, {r3, r4, r5, r6}
      1a:    ca78          ldmia    r2!, {r3, r4, r5, r6}
      1c:    c178          stmia    r1!, {r3, r4, r5, r6}
      cpy2(s+2000, d+2000);
      1e:    f641 72a4     movw    r2, #8100    ; 0x1fa4
      22:    f242 0108     movw    r1, #8200    ; 0x2008
      26:    ca78          ldmia    r2!, {r3, r4, r5, r6}
      28:    c178          stmia    r1!, {r3, r4, r5, r6}
      2a:    ca78          ldmia    r2!, {r3, r4, r5, r6}
      2c:    c178          stmia    r1!, {r3, r4, r5, r6}
      2e:    bc70          pop    {r4, r5, r6}
      30:    4770          bx    lr
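
    The same approach extends to a single burst of 8 words, provided eight scratch registers can be clobbered. A possible variant of the snippet above (assumptions: the build does not reserve r7 as a frame pointer or r9 as a platform register; the name cpy8 is illustrative; a "memory" clobber is added so the compiler cannot cache the copied buffers across the asm):

    static inline void cpy8(volatile uint32_t* p_src,     // Source address of data
                        volatile uint32_t* p_dest) {
      __asm__ __volatile__  ( " ldm %[inp]!, {r2, r3, r4, r5, r6, r7, r8, r9} \n\t"
                              " stm %[outp]!, {r2, r3, r4, r5, r6, r7, r8, r9} \n\t"
                              : /* outputs */
                                [inp] "+r" (p_src),
                                [outp] "+r" (p_dest)
                              : /* inputs */
                              : /* clobbers */
                                "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory"
        );
    }

    armclang accepts the same GNU extended-asm syntax, so the idea should carry over to the original armclang build; the cost of the wider register list is that the caller must preserve whichever of the callee-saved registers r4-r9 it was already using.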
