I find that the CAS instruction is much slower than ldxr/stlxr. I have tested it with __atomic_compare_exchange:
/* Timed region: LOOP_CNT back-to-back compare-and-exchange operations on `a`. */
/* Sample the counter before the loop starts. */
time_cnt_old = get_cnt();
for(k = 0; k < LOOP_CNT; k++)
{
/* Expected value is the previous count; desired value is previous count + 1. */
countold=count;
count++;
/* Strong compare-exchange (weak == 0), sequentially consistent on both the
   success and the failure path. The boolean result is ignored. */
__atomic_compare_exchange(&a,&countold, &count,0,__ATOMIC_SEQ_CST,__ATOMIC_SEQ_CST);
}
/* Elapsed counter ticks for the whole loop. */
time_cnt_new = get_cnt() - time_cnt_old
You can build it with
gcc -lpthread -march=armv8.1-a -o test main.c to get the "casal" atomic instruction,
or with
gcc -lpthread -march=armv8-a -o test main.c to get the "ldxr/stlxr" atomic instructions.
The casal time is much slower than the "ldxr/stlxr" time:
(1) run glibc atomic armv8.1-a
0: glibc atomic cmpcxg:585019493
1: glibc atomic cmpcxg:408777308
2: glibc atomic cmpcxg:769870843
3: glibc atomic cmpcxg:149371093
4: glibc atomic cmpcxg:151365619
5: glibc atomic cmpcxg:150890346
6: glibc atomic cmpcxg:151328121
7: glibc atomic cmpcxg:156505415
8: glibc atomic cmpcxg:412924425
9: glibc atomic cmpcxg:278711677
10: glibc atomic cmpcxg:151651510
11: glibc atomic cmpcxg:279346515
12: glibc atomic cmpcxg:151173807
13: glibc atomic cmpcxg:278545998
14: glibc atomic cmpcxg:278200664
15: glibc atomic cmpcxg:277724961
16: glibc atomic cmpcxg:370065101
17: glibc atomic cmpcxg:278351668
18: glibc atomic cmpcxg:151488937
19: glibc atomic cmpcxg:151469273
(2) run custom ldxr atomic armv8-a
0: custom ldxr atomic cmpcxg:94791218
1: custom ldxr atomic cmpcxg:94722346
2: custom ldxr atomic cmpcxg:94858015
3: custom ldxr atomic cmpcxg:94658057
4: custom ldxr atomic cmpcxg:94695239
5: custom ldxr atomic cmpcxg:94687119
6: custom ldxr atomic cmpcxg:94657355
7: custom ldxr atomic cmpcxg:94666011
8: custom ldxr atomic cmpcxg:94631812
9: custom ldxr atomic cmpcxg:94835661
10: custom ldxr atomic cmpcxg:94686230
11: custom ldxr atomic cmpcxg:94797306
12: custom ldxr atomic cmpcxg:94691870
13: custom ldxr atomic cmpcxg:94685030
14: custom ldxr atomic cmpcxg:94680305
15: custom ldxr atomic cmpcxg:94759021
16: custom ldxr atomic cmpcxg:94700858
17: custom ldxr atomic cmpcxg:94715765
18: custom ldxr atomic cmpcxg:94687178
19: custom ldxr atomic cmpcxg:94662201
The casal time is also not stable. Could you help to explain this? casal should be faster than ldxr/stlxr for atomic compare-and-exchange.
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>

/* Number of compare-and-exchange operations performed in the timed loop. */
#define LOOP_CNT (100000000)

/* Target of the atomic compare-and-exchange. */
int a = 0;
int b = 0;

/*
 * Read the AArch64 virtual counter register (CNTVCT_EL0).
 *
 * The asm is marked volatile so the compiler can neither merge the two reads
 * taken around the timed loop nor move them relative to it. The ISB keeps the
 * counter from being read speculatively ahead of preceding instructions.
 *
 * Returns the raw counter value in counter ticks.
 */
static unsigned long get_cnt(void)
{
    unsigned long timer_val;

    __asm__ __volatile__("isb\n\t"
                         "mrs %0, cntvct_el0"
                         : "=r"(timer_val)
                         :
                         : "memory");
    return timer_val;
}

/*
 * Time LOOP_CNT strong, sequentially consistent compare-and-exchange
 * operations on the global `a`.
 *
 * Each iteration expects the previous count in `a` and stores previous
 * count + 1. Whether the compiler emits a CAS instruction or an exclusive
 * load/store loop depends on the -march option used to build this file.
 *
 * Returns the elapsed time in counter ticks.
 */
unsigned long glibc_atomic_performance(void)
{
    int count = 0;
    int countold = 0;
    unsigned long time_cnt_old = get_cnt();

    for (int k = 0; k < LOOP_CNT; k++) {
        countold = count;
        count++;
        /* weak == 0 requests the strong variant; the result is not needed. */
        __atomic_compare_exchange(&a, &countold, &count, 0,
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }

    return get_cnt() - time_cnt_old;
}

/* Run the benchmark once and print the elapsed counter ticks. */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    unsigned long ticks = glibc_atomic_performance();

    /* %lu matches the unsigned long tick count. */
    printf("glibc atomic cmpcxg:%lu\n", ticks);
    return 0;
}
Thanks for your reply. I have run this code as follows:
$gcc -lpthread -march=armv8.1-a -o test cas_v3.c && taskset -c 2 ./test
glibc atomic cmpcxg:243257012
$gcc -lpthread -march=armv8-a -o test cas_v3.c && taskset -c 2 ./test
glibc atomic cmpcxg:81125905