Hi,
In the recent days i've been playing with assembly code in my raspberry 3B. I have a little code that shows how fast is my assembly code vs memcpy:
#include <stdio.h> #include <time.h> #include <stdint.h> #include <stdlib.h> #include <string.h> extern void move32b_LDM(uint32_t *ori, uint32_t size, uint32_t *dest); extern void move32b_LDR(uint32_t *ori, uint32_t size, uint32_t *dest); int main(int argc, char* argv[]) { uint32_t *origen = malloc(1 << 27); uint32_t *destino = malloc(1<<27); if (origen == 0 || destino == 0) return -1; for (unsigned int i=0;i< (1<<25);i++) *(origen+i) = i; clock_t start = clock(); memcpy(destino,origen, 1 << 27); clock_t finish = clock(); printf("Time of memcpy: %f\n", ((double)(finish - start))/CLOCKS_PER_SEC); start = clock(); move32b_LDM(origen,1<<27,destino); finish = clock(); printf("Time of LDM: %f\n", ((double)(finish - start))/CLOCKS_PER_SEC); start = clock(); move32b_LDR(origen,1<<27,destino); finish = clock(); printf("Time of LDR: %f\n", ((double)(finish - start))/CLOCKS_PER_SEC); return 0; }
And i have my assembly code:
.global move32b_LDM .global move32b_LDR .p2align 2 .type move32b_LDM,%function .section .text move32b_LDM: .func push {R4,R5,R6,R7,R8} ADD R8,R2,R1 loop: LDMIA R0!,{R4,R5,R6,R7} STMIA R2!, {R4,R5,R6,R7} CMP R8,R2 BNE loop pop {R4,R5,R6,R7,R8} MOV PC,LR .endfunc .type move32b_LDR,%function move32b_LDR: .func push {R4,R5,R6,R8} MOV R5,#0 MOv R6,R5 ADD R8,R2,R6 loop2: LDR R4,[R0],#4 STR R4,[R2],#4 CMP R8,R2 BNE loop2 pop {R4,R5,R6,R8} MOV PC,LR .endfunc
Everything looks easy, but i get segmentation faults and invalid instruction exceptions everytime i call move32b_LDR function. Looking at assembly code something looks wrong:
6ba: f7ff ef7e blx 5b8 <move32b_LDR>
You can see that the relative offset is not a even number as it should be and is causing a unexpected behaviour of the branch instruction. Gcc and ld should know that this branching is forbiden by the compiler and IMO is a bug of the compiler, isn't it? Can it be solve? Thanks you.
"the relative offset is not a even number"
What definition of even are you using? The branch is not your problem.
In move32b_LDR
R5 = 0;
R6 = R5;
R8 = R2 + R6 ; (R8 = Destination address + 0)
You end the loop when R2 == R8, which will not happen
In move32b_LDM you set R8 = (R1 + R2), that seems to be a better choice for move32b_LDR