Bug: Floating point rounding errors in implicitly linked amath.so

The NVIDIA libcudacxx project verifies results on both the CPU and the GPU to ensure, as far as possible, that computations are repeatable on either processor. We've determined that there is a rounding error in the implicitly linked amath.so, specifically when testing cbrtf. I did not find the same issue in the other exponent-related cmath functions.

I am unable to attach C++ files, so the code is pasted below. Sorry if there are any formatting issues.

/**********************************************************************************************
FAILING CASE
Compiled with:
$ /home/coder/armclang/24.10/arm-linux-compiler-24.10.1_Ubuntu-22.04/bin/armclang++ test.cpp \
  -std=c++20 -O3 -nostdlib -L../armclang/24.10/arm-linux-compiler-24.10.1_Ubuntu-22.04/lib -lc -lamath -lgcc

$ ldd a.out
  linux-vdso.so.1 (0x00007daa87514000)
  libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x00007daa86ae0000)
  /lib/ld-linux-aarch64.so.1 (0x00007daa874c0000)
  libamath.so => not found

$ LD_LIBRARY_PATH=/home/coder/armclang/24.10/arm-linux-compiler-24.10.1_Ubuntu-22.04/lib ./a.out
0X40000000 (expected)
0X40000001 (result)

********************************************************************************************
PASSING CASE
Compiled with:
$ /home/coder/armclang/24.10/arm-linux-compiler-24.10.1_Ubuntu-22.04/bin/armclang++ test.cpp \
  -std=c++20 -O3 -nostdlib -L../armclang/24.10/arm-linux-compiler-24.10.1_Ubuntu-22.04/lib -lc -lm -lgcc

$ ldd a.out
    linux-vdso.so.1 (0x0000717b8130b000)
    libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000717b808d0000)
    libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000717b80830000)
    /lib/ld-linux-aarch64.so.1 (0x0000717b812b0000)

$ ./a.out
0X40000000 (expected)
0X40000000 (result)
**********************************************************************************************/

#include <cmath>
#include <stdio.h>

int main();

extern "C" void _start() {
  main();
  exit(0);
}

// Reproducer: evaluates cbrtf(8.0f) and prints the raw 32-bit representation
// of the expected value (2.0f) followed by that of the computed result, one
// per line, in the "0XHHHHHHHH (label)" form shown in the header transcript.
// Always returns 0; a mismatch is visible only in the printed output.
int main() {
  // 64 / 8 == 8. The volatile qualifier presumably keeps the compiler from
  // folding the cbrtf call at compile time, so the linked math library's
  // implementation is the one being exercised -- TODO confirm.
  volatile float val = 64.0f / 8.0f;
  const float result = cbrtf(val);
  const float expected = 2.0f;

  // Dumps the four bytes of `value` in hex, highest address first, which is
  // most-significant-first on a little-endian target (assumed here, as the
  // original byte order v[3]..v[0] did).
  // The "0X" prefix is written literally: the `#` flag omits it when the
  // converted value is zero, which would change the width of the output.
  // Bytes are read as unsigned char so the printed value does not depend on
  // the signedness of plain char.
  auto print_bits = [](float value, const char* label) {
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&value);
    printf("0X%.2X%.2X%.2X%.2X (%s)\n", bytes[3], bytes[2], bytes[1], bytes[0], label);
  };
  print_bits(expected, "expected");
  print_bits(result, "result");

  return 0;
}
0