
Question about Arm performance related to register allocation

I'm currently testing code on an embedded board equipped with an Arm Cortex-A72 CPU, which implements the Armv8-A architecture. The following function is the code whose performance I am measuring.

void kernel_func(unsigned char* input_data, unsigned char* output_data)
{
    int stride_size=1;
    for(int i=0; i<100000000; i+=stride_size)
    {
        output_data[i] = input_data[i];
    }
    return;
}

To test this, I split it into the three versions shown below and measured each one's performance.

void kernel_func_0(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    int stride_size=1;
    for(int i=0; i<100000000; i+=stride_size)
    {
        output_data[i] = input_data[i];
    }
    return;
}

void kernel_func_1(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    int stride_size=16;
    for(int i=0; i<100000000; i+=stride_size)
    {
        output_data[i+0 ] = input_data[i+0 ];
        output_data[i+1 ] = input_data[i+1 ];
        output_data[i+2 ] = input_data[i+2 ];
        output_data[i+3 ] = input_data[i+3 ];
        output_data[i+4 ] = input_data[i+4 ];
        output_data[i+5 ] = input_data[i+5 ];
        output_data[i+6 ] = input_data[i+6 ];
        output_data[i+7 ] = input_data[i+7 ];
        output_data[i+8 ] = input_data[i+8 ];
        output_data[i+9 ] = input_data[i+9 ];
        output_data[i+10] = input_data[i+10];
        output_data[i+11] = input_data[i+11];
        output_data[i+12] = input_data[i+12];
        output_data[i+13] = input_data[i+13];
        output_data[i+14] = input_data[i+14];
        output_data[i+15] = input_data[i+15];
    }
    return;
}

void kernel_func_2(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    int stride_size=32;
    for(int i=0; i<100000000; i+=stride_size)
    {
        output_data[i+0 ] = input_data[i+0 ];
        output_data[i+1 ] = input_data[i+1 ];
        output_data[i+2 ] = input_data[i+2 ];
        output_data[i+3 ] = input_data[i+3 ];
        output_data[i+4 ] = input_data[i+4 ];
        output_data[i+5 ] = input_data[i+5 ];
        output_data[i+6 ] = input_data[i+6 ];
        output_data[i+7 ] = input_data[i+7 ];
        output_data[i+8 ] = input_data[i+8 ];
        output_data[i+9 ] = input_data[i+9 ];
        output_data[i+10] = input_data[i+10];
        output_data[i+11] = input_data[i+11];
        output_data[i+12] = input_data[i+12];
        output_data[i+13] = input_data[i+13];
        output_data[i+14] = input_data[i+14];
        output_data[i+15] = input_data[i+15];
        output_data[i+16] = input_data[i+16];
        output_data[i+17] = input_data[i+17];
        output_data[i+18] = input_data[i+18];
        output_data[i+19] = input_data[i+19];
        output_data[i+20] = input_data[i+20];
        output_data[i+21] = input_data[i+21];
        output_data[i+22] = input_data[i+22];
        output_data[i+23] = input_data[i+23];
        output_data[i+24] = input_data[i+24];
        output_data[i+25] = input_data[i+25];
        output_data[i+26] = input_data[i+26];
        output_data[i+27] = input_data[i+27];
        output_data[i+28] = input_data[i+28];
        output_data[i+29] = input_data[i+29];
        output_data[i+30] = input_data[i+30];
        output_data[i+31] = input_data[i+31];
    }
    return;
}

The first version (kernel_func_0) measures at around 8 ms, the second (kernel_func_1) at 8 ms, and the third (kernel_func_2) at around 11 ms.
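For reference, these numbers come from a harness along the following lines. Treat it as a sketch of my setup: the TEST_SIZE constant, the buffer initialization, and the std::chrono timer here are illustrative rather than my exact code.

#include <chrono>
#include <cstdio>
#include <vector>

constexpr int TEST_SIZE = 100000000;

void kernel_func_0(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data);

int main()
{
    // Two 100 MB buffers, initialized so the pages are mapped before timing.
    std::vector<unsigned char> input(TEST_SIZE, 1);
    std::vector<unsigned char> output(TEST_SIZE, 0);

    auto start = std::chrono::steady_clock::now();
    kernel_func_0(input.data(), output.data()); // swap in kernel_func_1 / kernel_func_2
    auto end = std::chrono::steady_clock::now();

    double ms = std::chrono::duration<double, std::milli>(end - start).count();
    printf("elapsed: %.3f ms\n", ms);
    return 0;
}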

To identify the reason for the performance difference between the second and third versions, I compiled both to assembly.

/*
	The Assembly code of KERNEL_FUNC_1
 */
	.arch armv8.2-a+crc
	.file	"kernel.cpp"
	.text
	.align	2
	.p2align 4,,11
	.global	_Z11kernel_funcPhS_
	.type	_Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
	.cfi_startproc
	mov	x3, 57600
	mov	x2, 0
	movk	x3, 0x5f5, lsl 16
	.p2align 3,,7
.L2:
	ldr	q0, [x0, x2]
	str	q0, [x1, x2]
	add	x2, x2, 16
	cmp	x2, x3
	bne	.L2
	ret
	.cfi_endproc
.LFE4340:
	.size	_Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
	.ident	"GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
	.section	.note.GNU-stack,"",@progbits
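Reading the loop at .L2, GCC has auto-vectorized the byte copy: each ldr q0 / str q0 pair moves 16 bytes at a time through a 128-bit NEON register. Written back as C++ with NEON intrinsics, the loop is roughly equivalent to the following (my reconstruction for illustration, not compiler output):

#include <arm_neon.h>

void kernel_func_1_equiv(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    for (int i = 0; i < 100000000; i += 16)
    {
        uint8x16_t v = vld1q_u8(input_data + i); // ldr q0, [x0, x2]
        vst1q_u8(output_data + i, v);            // str q0, [x1, x2]
    }
}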

/*
	The Assembly code of KERNEL_FUNC_2
 */

	.arch armv8.2-a+crc
	.file	"kernel.cpp"
	.text
	.align	2
	.p2align 4,,11
	.global	_Z11kernel_funcPhS_
	.type	_Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
	.cfi_startproc
	mov	x3, 57600
	add	x5, x0, 16
	add	x4, x1, 16
	mov	x2, 0
	movk	x3, 0x5f5, lsl 16
	.p2align 3,,7
.L2:
	ldr	q1, [x0, x2]
	ldr	q0, [x5, x2]
	str	q1, [x1, x2]
	str	q0, [x4, x2]
	add	x2, x2, 32
	cmp	x2, x3
	bne	.L2
	ret
	.cfi_endproc
.LFE4340:
	.size	_Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
	.ident	"GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
	.section	.note.GNU-stack,"",@progbits
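The structural difference is that here the compiler materializes two extra base pointers (x5 = input_data + 16, x4 = output_data + 16) and issues two 16-byte loads followed by two 16-byte stores per iteration, advancing x2 by 32. As an intrinsics sketch of the same shape (again my reconstruction, not compiler output):

#include <arm_neon.h>

void kernel_func_2_equiv(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    unsigned char* in_hi  = input_data + 16;  // x5
    unsigned char* out_hi = output_data + 16; // x4
    for (int i = 0; i < 100000000; i += 32)
    {
        uint8x16_t v0 = vld1q_u8(input_data + i); // ldr q1, [x0, x2]
        uint8x16_t v1 = vld1q_u8(in_hi + i);      // ldr q0, [x5, x2]
        vst1q_u8(output_data + i, v0);            // str q1, [x1, x2]
        vst1q_u8(out_hi + i, v1);                 // str q0, [x4, x2]
    }
}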

In both listings, almost all of the execution time is spent in the loop at the .L2 label. However, I believe there should be no performance difference between the two, because if I modify the assembly of KERNEL_FUNC_1 as shown below (KERNEL_FUNC_1_MOD), it appears to execute the same operations as the assembly of KERNEL_FUNC_2.

/*
    The Assembly code of KERNEL_FUNC_1_MOD
*/
	.arch armv8.2-a+crc
	.file	"kernel.cpp"
	.text
	.align	2
	.p2align 4,,11
	.global	_Z11kernel_funcPhS_
	.type	_Z11kernel_funcPhS_, %function
_Z11kernel_funcPhS_:
.LFB4340:
	.cfi_startproc
	mov	x3, 57600
	mov	x2, 0
	movk	x3, 0x5f5, lsl 16
	.p2align 3,,7
.L2:
	ldr	q0, [x0, x2]	// load bytes i .. i+15
	add	x2, x2, 16	// advance the index to the second 16-byte block
	ldr	q1, [x0, x2]	// load bytes i+16 .. i+31 via the same base register x0
	sub	x2, x2, 16	// restore the index for the stores

	str	q0, [x1, x2]	// store bytes i .. i+15
	add	x2, x2, 16
	str	q1, [x1, x2]	// store bytes i+16 .. i+31
	add	x2, x2, 16	// net effect: i += 32 per iteration

	cmp	x2, x3
	bne	.L2
	ret
	.cfi_endproc
.LFE4340:
	.size	_Z11kernel_funcPhS_, .-_Z11kernel_funcPhS_
	.ident	"GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
	.section	.note.GNU-stack,"",@progbits
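At the source level, this hand modification keeps the same two loads and two stores per iteration, but forms every address from the single moving index in x2 instead of a second base register. Roughly (my reconstruction for illustration):

#include <arm_neon.h>

void kernel_func_1_mod_equiv(unsigned char* __restrict__ input_data, unsigned char* __restrict__ output_data)
{
    for (int i = 0; i < 100000000; i += 32)
    {
        uint8x16_t v0 = vld1q_u8(input_data + i);      // ldr q0, [x0, x2]
        uint8x16_t v1 = vld1q_u8(input_data + i + 16); // ldr q1, [x0, x2] after the add
        vst1q_u8(output_data + i, v0);                 // str q0, [x1, x2]
        vst1q_u8(output_data + i + 16, v1);            // str q1, [x1, x2] after the add
    }
}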

When I run the two versions, KERNEL_FUNC_1_MOD takes 8 ms and KERNEL_FUNC_2 takes 11 ms. I find this result difficult to understand: it's hard to see why performance should differ by about 3 ms simply because the load and store addresses are formed from a single base register rather than from separate ones.