We are running a survey to help us improve the experience for all of our members. If you see the survey appear, please take the time to tell us about your experience if you can.
Hi all, I am working with a simple sqrt kernel (code given at the end of this post). It calculates the square root of each element of a given array and stores the results in a new array.
However, when compiling with GCC using the command `gcc -mcpu=cortex-a53 -mfpu=neon neon_sqrt_kernel.c`,
I get the following error:
neon_sqrt_kernel.c: In function ‘main’:
neon_sqrt_kernel.c:70:12: warning: implicit declaration of function ‘vsqrtq_f32’ [-Wimplicit-function-declaration]
   data_c = vsqrtq_f32(data_a);
            ^~~~~~~~~~
neon_sqrt_kernel.c:70:10: error: incompatible types when assigning to type ‘float32x4_t’ from type ‘int’
   data_c = vsqrtq_f32(data_a);
However, the NEON intrinsics manual clearly indicates that such a function does exist (https://developer.arm.com/technologies/neon/intrinsics). Could anyone clarify this dilemma for me? Why is the above function not declared?
Code
#include <stdio.h> #include <stdlib.h> #include <time.h> #include "arm_neon.h" /* * Author : Aketh TM * This kernel was developed as a part of an effort to test the effectiveness of aligned instructions * using NEON SIMD Intrinsics */ // Preprocessor to check if the arrays are aligned #define is_aligned(POINTER,BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0 ) int main (int argc,char** argv) { unsigned int n = atoi(argv[1]); /* Create custom arbitrary data. */ float32_t* float32_data_a; float32_t* float32_data_b; float32_t* float32_data_c; struct timespec start,end; size_t size = sizeof(float32_t) * n; // Allocate memory aligned at 16 byte boundaries float32_data_a = (float32_t *) aligned_alloc(16,size); float32_data_c = (float32_t *) aligned_alloc(16,size); if( __ARM_NEON__) printf("Neon unit detected \n"); // Check for alignment if( is_aligned(float32_data_a,16) ) printf("Address of float32_data_a is aligned with address %d \n",(uintptr_t)(const void *) float32_data_a); else printf("Array a is unaligned \n"); if( is_aligned(float32_data_b,16) ) printf("Address of float32_data_b is aligned with address %d \n",(uintptr_t)(const void *) float32_data_b); else printf("Array b is unaligned \n"); if( is_aligned(float32_data_c,16) ) printf("Address of float32_data_c is aligned with address %d \n",(uintptr_t)(const void *) float32_data_c); else printf("Array c is unaligned \n"); for(uint32_t i = 0; i < n ; i++) { float32_data_a[i] = i; float32_data_c[i] = i; } /* Create the vector with our data. */ float32x4_t data_a; float32x4_t data_b; float32x4_t data_c; clock_gettime(CLOCK_MONOTONIC,&start); for(int count = 0; count < 1; count++) { for(unsigned int i = 0; i < n ; i+=4) { /* Load our custom data into the vector register. 
*/ data_a = vld1q_f32 (float32_data_a + i); data_c = vsqrtq_f32(data_a); vst1q_f32(float32_data_c + i,data_c); } } clock_gettime(CLOCK_MONOTONIC,&end); double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000)); printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n ); for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]); printf("\n"); free(float32_data_a); free(float32_data_b); free(float32_data_c); return 0; }