Here is my code for Arm SVE.

```
extern float32_t dot_f16_auto_vectorization(const float16_t* __restrict a, const float16_t* __restrict b, size_t n) {
#pragma float_control(precise, off)
    float16_t sum = 0.0;
    for (size_t i = 0; i < n; i += 1) {
        sum += a[i] * b[i];
    }
    return (float32_t)sum;
}
```

The loop is not auto-vectorized, and the compiler reports:

```
<source>:10:2: remark: loop not vectorized: value that could not be identified as reduction is used outside the loop [-Rpass-analysis=loop-vectorize]
   10 |  for (size_t i = 0; i < n; i += 1) {
      |  ^
```

See godbolt.org/.../rjG8Thzn6 for more information.

I want the compiler to generate code like the intrinsics version below, but it does not.

```
extern float32_t dot_f16_sve(const float16_t* __restrict a, const float16_t* __restrict b, size_t n) {
    svfloat16_t svsum = svdup_f16(0.0f);
    svbool_t pg;
    svfloat16_t sva, svb;
    for (size_t i = 0; i < n; i += svcnth()) {
        pg = svwhilelt_b16(i, n);
        sva = svld1_f16(pg, a + i);
        svb = svld1_f16(pg, b + i);
        svsum = svmla_f16_x(pg, svsum, sva, svb);
    }
    return svaddv_f16(svptrue_b16(), svsum);
}
```