Dear all,
There is a particular order of instructions that leads to performance degradation.
I tested it on a 12-core Arm® Cortex®-A78AE v8.2 64-bit CPU with 3 MB L2 + 6 MB L3 (Jetson AGX Orin Developer Kit), but I ran into a similar problem on other processors (Kunpeng). The issue applies to x86-64 as well.
The following order of instructions:
fadd d0, d0, d0
fadd d1, d0, d0
fadd d2, d0, d0
fadd d0, d0, d0
should execute in 4 ticks (2 ticks per fadd), but it actually takes 5 ticks (around 4.5 ticks on some x86-64 CPUs I tested at random).
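For reference, here is where the 4-tick expectation comes from, assuming a 2-cycle fadd latency and at least two FP pipelines (the annotations are mine): only the first and the last fadd are on the loop-carried dependency chain through d0, so the chain should cost 2 + 2 = 4 ticks per iteration.

fadd d0, d0, d0   // (1) 2-cycle latency; starts the d0 chain for this iteration
fadd d1, d0, d0   // (2) waits for (1); nothing later reads d1
fadd d2, d0, d0   // (3) waits for (1); can issue in parallel with (2)
fadd d0, d0, d0   // (4) waits for (1); its d0 feeds (1) of the next iteration

The critical path is (1) -> (4) -> (1) of the next iteration, and (2)/(3) should hide under it.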
The reason is probably static scheduling of the FP units (is that correct?)
This fixes the problem:
fadd d0, d0, d0
fmov d4, d0
fadd d3, d3, d3
fadd d1, d0, d0
fadd d0, d0, d0
fadd d2, d4, d4
This order takes 4 ticks. It is worth mentioning that "fadd d3, d3, d3" dramatically improves stability in my test case (without it, readings of 4.3 ticks could occur).
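Here is how I read the dependencies in the reordered sequence (annotations are mine, and whether this is really why the scheduler behaves better is exactly my question):

fadd d0, d0, d0   // first link of the loop-carried d0 chain (2 ticks)
fmov d4, d0       // snapshot of d0, so the last fadd reads d4 instead of d0
fadd d3, d3, d3   // independent of d0 entirely; the "stabilizer"
fadd d1, d0, d0   // reads the first fadd's result; nothing later reads d1
fadd d0, d0, d0   // second link of the d0 chain (2 ticks)
fadd d2, d4, d4   // reads the snapshot, not the freshly rewritten d0

On paper the critical path is still just the two fadds that write d0, i.e. the same 4 ticks as the original order, which is why the extra tick there looks like a scheduling artifact rather than a data-dependency cost.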
This is the code to reproduce it (g++ -O3):
#include <cstdint>
#include <cstdio>
#include <chrono>

inline double run_fadd_chain(volatile const float iters) {
    auto st = std::chrono::high_resolution_clock::now();
    asm volatile (
        "mov x2, %[cnt]\n"
        "1:\n"
        "fadd d0, d0, d0\n"
        "fmov d4, d0\n"
        "fadd d3, d3, d3\n"
        "fadd d1, d0, d0\n"
        "fadd d0, d0, d0\n"
        "fadd d2, d4, d4\n"
        "subs x2, x2, #1\n"
        "bne 1b\n"
        :
        : [cnt] "r"((uint64_t)iters)
        : "x2", "d0", "d1", "d2", "d3", "d4", "cc" // "cc": subs updates the flags
    );
    auto en = std::chrono::high_resolution_clock::now();
    // ns per loop iteration, converted to cycles at 2.2 GHz
    return std::chrono::duration_cast<std::chrono::nanoseconds>(en - st).count() / iters * 2.2;
}

int main() {
    constexpr int iters = 1'000'000;
    // Warm-up runs so the core reaches a steady clock before measuring.
    for (volatile int i = 0; i < 50; ++i) {
        run_fadd_chain(iters);
    }
    volatile double cycles_per_fadd = run_fadd_chain(iters);
    printf("Cycles per iteration: %f\n", cycles_per_fadd);
    return 0;
}
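To get the original 5-tick number out of the same harness, a minimal variant with the slow four-fadd body can be dropped into the same file and called from main in the same way (run_fadd_chain_slow is just an illustrative name; only the loop body and the clobber list differ):

// Original (slow) ordering, same timing harness as above.
inline double run_fadd_chain_slow(volatile const float iters) {
    auto st = std::chrono::high_resolution_clock::now();
    asm volatile (
        "mov x2, %[cnt]\n"
        "1:\n"
        "fadd d0, d0, d0\n"
        "fadd d1, d0, d0\n"
        "fadd d2, d0, d0\n"
        "fadd d0, d0, d0\n"
        "subs x2, x2, #1\n"
        "bne 1b\n"
        :
        : [cnt] "r"((uint64_t)iters)
        : "x2", "d0", "d1", "d2", "cc"
    );
    auto en = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::nanoseconds>(en - st).count() / iters * 2.2; // 2.2 GHz
}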