I am currently testing code on an embedded board equipped with an ARM Cortex-A72 CPU, which is based on the Armv8-A architecture. The following code is used to measure the performance:
void kernel_func(unsigned char* input_data, unsigned char* output_data) {
    int stride_size = 1;
    //assert(TEST_SIZE%byte_size == 0);
    for (int i = 0; i < 100000000; i += stride_size) {
        output_data[i] = input_data[i];
    }
    return;
}
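The surrounding measurement harness is omitted here; roughly, it looks like the following sketch (the buffer allocation and the clock_gettime(CLOCK_MONOTONIC)-based timing are illustrative, not the exact code used):

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#define TEST_SIZE 100000000

void kernel_func(unsigned char* input_data, unsigned char* output_data);

int main() {
    unsigned char* input_data  = (unsigned char*)malloc(TEST_SIZE);
    unsigned char* output_data = (unsigned char*)malloc(TEST_SIZE);

    // Touch every page first so page faults are not included in the timing.
    memset(input_data, 1, TEST_SIZE);
    memset(output_data, 0, TEST_SIZE);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    kernel_func(input_data, output_data);
    clock_gettime(CLOCK_MONOTONIC, &end);

    double ms = (end.tv_sec - start.tv_sec) * 1e3
              + (end.tv_nsec - start.tv_nsec) / 1e6;
    printf("elapsed: %.3f ms\n", ms);

    free(input_data);
    free(output_data);
    return 0;
}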
To test the above code, I split it into the three versions shown below and measured the performance of each.
void kernel_func_0(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data) {
    int stride_size = 1;
    for (int i = 0; i < 100000000; i += stride_size) {
        output_data[i] = input_data[i];
    }
    return;
}

void kernel_func_1(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data) {
    int stride_size = 16;
    for (int i = 0; i < 100000000; i += stride_size) {
        output_data[i+0 ] = input_data[i+0 ];
        output_data[i+1 ] = input_data[i+1 ];
        output_data[i+2 ] = input_data[i+2 ];
        output_data[i+3 ] = input_data[i+3 ];
        output_data[i+4 ] = input_data[i+4 ];
        output_data[i+5 ] = input_data[i+5 ];
        output_data[i+6 ] = input_data[i+6 ];
        output_data[i+7 ] = input_data[i+7 ];
        output_data[i+8 ] = input_data[i+8 ];
        output_data[i+9 ] = input_data[i+9 ];
        output_data[i+10] = input_data[i+10];
        output_data[i+11] = input_data[i+11];
        output_data[i+12] = input_data[i+12];
        output_data[i+13] = input_data[i+13];
        output_data[i+14] = input_data[i+14];
        output_data[i+15] = input_data[i+15];
    }
    return;
}

void kernel_func_2(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data) {
    int stride_size = 32;
    for (int i = 0; i < 100000000; i += stride_size) {
        output_data[i+0 ] = input_data[i+0 ];
        output_data[i+1 ] = input_data[i+1 ];
        output_data[i+2 ] = input_data[i+2 ];
        output_data[i+3 ] = input_data[i+3 ];
        output_data[i+4 ] = input_data[i+4 ];
        output_data[i+5 ] = input_data[i+5 ];
        output_data[i+6 ] = input_data[i+6 ];
        output_data[i+7 ] = input_data[i+7 ];
        output_data[i+8 ] = input_data[i+8 ];
        output_data[i+9 ] = input_data[i+9 ];
        output_data[i+10] = input_data[i+10];
        output_data[i+11] = input_data[i+11];
        output_data[i+12] = input_data[i+12];
        output_data[i+13] = input_data[i+13];
        output_data[i+14] = input_data[i+14];
        output_data[i+15] = input_data[i+15];
        output_data[i+16] = input_data[i+16];
        output_data[i+17] = input_data[i+17];
        output_data[i+18] = input_data[i+18];
        output_data[i+19] = input_data[i+19];
        output_data[i+20] = input_data[i+20];
        output_data[i+21] = input_data[i+21];
        output_data[i+22] = input_data[i+22];
        output_data[i+23] = input_data[i+23];
        output_data[i+24] = input_data[i+24];
        output_data[i+25] = input_data[i+25];
        output_data[i+26] = input_data[i+26];
        output_data[i+27] = input_data[i+27];
        output_data[i+28] = input_data[i+28];
        output_data[i+29] = input_data[i+29];
        output_data[i+30] = input_data[i+30];
        output_data[i+31] = input_data[i+31];
    }
    return;
}
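For reference, the three versions differ only in the manual unroll factor; an equivalent compact form (a sketch with UNROLL as a hypothetical template parameter, not the code actually benchmarked) would be:

template <int UNROLL>
void kernel_func_n(unsigned char* __restrict__ input_data,
                   unsigned char* __restrict__ output_data) {
    // UNROLL = 1, 16, 32 reproduces kernel_func_0/1/2 above.
    // (Assumes 100000000 is divisible by UNROLL, which holds for these values.)
    for (int i = 0; i < 100000000; i += UNROLL) {
        for (int j = 0; j < UNROLL; ++j) {
            output_data[i + j] = input_data[i + j];
        }
    }
}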
The first version (kernel_func_0) measures around 8 ms, the second (kernel_func_1) also around 8 ms, and the third (kernel_func_2) around 11 ms.
To identify the reason for the performance difference between the second and third versions, I compiled both to assembly.
/* The Assembly code of KERNEL_FUNC_1 */
        .arch armv8.2-a+crc
        .file   "kernel.cpp"
        .text
        .align  2
        .p2align 4,,11
        .global _Z11kernel_funcPhS_
        .type   _Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
        .cfi_startproc
        mov     x3, 57600
        mov     x2, 0
        movk    x3, 0x5f5, lsl 16       // x3 = 100000000, the loop bound
        .p2align 3,,7
.L2:
        ldr     q0, [x0, x2]            // one 16-byte SIMD load per iteration
        str     q0, [x1, x2]            // one 16-byte SIMD store per iteration
        add     x2, x2, 16
        cmp     x2, x3
        bne     .L2
        ret
        .cfi_endproc
.LFE4340:
        .size   _Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
        .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
        .section        .note.GNU-stack,"",@progbits
/* The Assembly code of KERNEL_FUNC_2 */
        .arch armv8.2-a+crc
        .file   "kernel.cpp"
        .text
        .align  2
        .p2align 4,,11
        .global _Z11kernel_funcPhS_
        .type   _Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
        .cfi_startproc
        mov     x3, 57600
        add     x5, x0, 16              // second input base: input_data + 16
        add     x4, x1, 16              // second output base: output_data + 16
        mov     x2, 0
        movk    x3, 0x5f5, lsl 16       // x3 = 100000000, the loop bound
        .p2align 3,,7
.L2:
        ldr     q1, [x0, x2]            // two 16-byte loads per iteration,
        ldr     q0, [x5, x2]            // from two separate base registers
        str     q1, [x1, x2]
        str     q0, [x4, x2]
        add     x2, x2, 32
        cmp     x2, x3
        bne     .L2
        ret
        .cfi_endproc
.LFE4340:
        .size   _Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
        .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
        .section        .note.GNU-stack,"",@progbits
In both listings, virtually all of the runtime is spent in the loop at the .L2 label. However, I believe there should be no performance difference between the two, because if I modify the assembly of the second version as follows (KERNEL_FUNC_1_MOD), it appears to execute the same operations as the third version (KERNEL_FUNC_2).
/* The Assembly code of KERNEL_FUNC_1_MOD */
        .arch armv8.2-a+crc
        .file   "kernel.cpp"
        .text
        .align  2
        .p2align 4,,11
        .global _Z11kernel_funcPhS_
        .type   _Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
        .cfi_startproc
        mov     x3, 57600
        mov     x2, 0
        movk    x3, 0x5f5, lsl 16       // x3 = 100000000, the loop bound
        .p2align 3,,7
.L2:
        ldr     q0, [x0, x2]
        add     x2, x2, 16
        ldr     q1, [x0, x2]            // second load via the same base x0,
        sub     x2, x2, 16              // adjusting the index instead
        str     q0, [x1, x2]
        add     x2, x2, 16
        str     q1, [x1, x2]
        add     x2, x2, 16
        cmp     x2, x3
        bne     .L2
        ret
        .cfi_endproc
.LFE4340:
        .size   _Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
        .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
        .section        .note.GNU-stack,"",@progbits
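As a sanity check that the modification preserves behavior, the outputs of the two variants can be compared. This sketch assumes the two .s files are edited so they no longer collide on the mangled symbol _Z11kernel_funcPhS_, e.g. renamed to the hypothetical extern "C" names below:

#include <cstdio>
#include <cstdlib>
#include <cstring>

#define TEST_SIZE 100000000

// Hypothetical symbol names after renaming the functions in the two .s files.
extern "C" void kernel_func_1_mod(unsigned char*, unsigned char*);
extern "C" void kernel_func_2(unsigned char*, unsigned char*);

int main() {
    unsigned char* in   = (unsigned char*)malloc(TEST_SIZE);
    unsigned char* out1 = (unsigned char*)malloc(TEST_SIZE);
    unsigned char* out2 = (unsigned char*)malloc(TEST_SIZE);
    for (int i = 0; i < TEST_SIZE; ++i) in[i] = (unsigned char)i;

    kernel_func_1_mod(in, out1);
    kernel_func_2(in, out2);
    printf("outputs %s\n",
           memcmp(out1, out2, TEST_SIZE) == 0 ? "match" : "differ");

    free(in); free(out1); free(out2);
    return 0;
}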
When running the two codes, KERNEL_FUNC_1_MOD takes 8 ms and KERNEL_FUNC_2 takes 11 ms. I find this result hard to understand: I cannot see why the performance should differ by about 3 ms merely because the load addresses are computed from a single base register rather than from two separate base registers.
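In case it helps, the timings can also be taken at finer granularity than a millisecond-level wall clock by reading the Armv8 generic timer directly. This is only a sketch, assuming EL0 access to cntvct_el0 (normally enabled under Linux on arm64):

#include <cstdint>

// Read the Armv8 generic timer (virtual count); cntfrq_el0 gives its
// frequency in Hz. The isb keeps the read from being reordered.
static inline uint64_t read_cntvct(void) {
    uint64_t v;
    asm volatile("isb\n\tmrs %0, cntvct_el0" : "=r"(v));
    return v;
}

static inline uint64_t read_cntfrq(void) {
    uint64_t f;
    asm volatile("mrs %0, cntfrq_el0" : "=r"(f));
    return f;
}

// Usage:
//   uint64_t t0 = read_cntvct();
//   kernel_func(input_data, output_data);
//   uint64_t t1 = read_cntvct();
//   double ms = (double)(t1 - t0) * 1e3 / (double)read_cntfrq();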