Hi all, I am working with a simple sqrt kernel (code given at the end of this post). It calculates the square root of each element of a given array and stores the results in a new array.
However, when compiling with gcc using the command: gcc -mcpu=cortex-a53 -mfpu=neon neon_sqrt_kernel.c
I get the following error:
neon_sqrt_kernel.c: In function ‘main’:
neon_sqrt_kernel.c:70:12: warning: implicit declaration of function ‘vsqrtq_f32’ [-Wimplicit-function-declaration]
   data_c = vsqrtq_f32(data_a);
            ^~~~~~~~~~
neon_sqrt_kernel.c:70:10: error: incompatible types when assigning to type ‘float32x4_t’ from type ‘int’
   data_c = vsqrtq_f32(data_a);
However, the NEON intrinsics manual clearly indicates that such a function does exist (https://developer.arm.com/technologies/neon/intrinsics). Could anyone clarify this dilemma for me? Why is the function not declared when I include arm_neon.h?
Code
#include <stdio.h> #include <stdlib.h> #include <time.h> #include "arm_neon.h" /* * Author : Aketh TM * This kernel was developed as a part of an effort to test the effectiveness of aligned instructions * using NEON SIMD Intrinsics */ // Preprocessor to check if the arrays are aligned #define is_aligned(POINTER,BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0 ) int main (int argc,char** argv) { unsigned int n = atoi(argv[1]); /* Create custom arbitrary data. */ float32_t* float32_data_a; float32_t* float32_data_b; float32_t* float32_data_c; struct timespec start,end; size_t size = sizeof(float32_t) * n; // Allocate memory aligned at 16 byte boundaries float32_data_a = (float32_t *) aligned_alloc(16,size); float32_data_c = (float32_t *) aligned_alloc(16,size); if( __ARM_NEON__) printf("Neon unit detected \n"); // Check for alignment if( is_aligned(float32_data_a,16) ) printf("Address of float32_data_a is aligned with address %d \n",(uintptr_t)(const void *) float32_data_a); else printf("Array a is unaligned \n"); if( is_aligned(float32_data_b,16) ) printf("Address of float32_data_b is aligned with address %d \n",(uintptr_t)(const void *) float32_data_b); else printf("Array b is unaligned \n"); if( is_aligned(float32_data_c,16) ) printf("Address of float32_data_c is aligned with address %d \n",(uintptr_t)(const void *) float32_data_c); else printf("Array c is unaligned \n"); for(uint32_t i = 0; i < n ; i++) { float32_data_a[i] = i; float32_data_c[i] = i; } /* Create the vector with our data. */ float32x4_t data_a; float32x4_t data_b; float32x4_t data_c; clock_gettime(CLOCK_MONOTONIC,&start); for(int count = 0; count < 1; count++) { for(unsigned int i = 0; i < n ; i+=4) { /* Load our custom data into the vector register. 
*/ data_a = vld1q_f32 (float32_data_a + i); data_c = vsqrtq_f32(data_a); vst1q_f32(float32_data_c + i,data_c); } } clock_gettime(CLOCK_MONOTONIC,&end); double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000)); printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n ); for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]); printf("\n"); free(float32_data_a); free(float32_data_b); free(float32_data_c); return 0; }
I did attach the code. However, it seems the attachment wasn't processed and posted as part of the thread. Sorry for the inconvenience.
The code for the program I am referring to is the following:
#include <stdio.h>#include <stdlib.h>#include <time.h>#include "arm_neon.h"int main (int argc,char** argv) { unsigned int n = 512; /* Create custom arbitrary data. */ float32_t* float32_data_a; float32_t* float32_data_c; struct timespec start,end; size_t size = sizeof(float32_t) * n; // Allocate memory aligned at 16 byte boundaries float32_data_a = (float32_t *) aligned_alloc(16,size); float32_data_c = (float32_t *) aligned_alloc(16,size); if( __ARM_NEON__) printf("Neon unit detected \n"); for(uint32_t i = 0; i < n ; i++) { float32_data_a[i] = i; float32_data_c[i] = i; } /* Create the vector with our data. */ float32x4_t data_a; float32x4_t data_c; clock_gettime(CLOCK_MONOTONIC,&start); for(int count = 0; count < 1; count++) { for(unsigned int i = 0; i < n ; i+=4) { /* Load our custom data into the vector register. */ data_a = vld1q_f32 (float32_data_a + i); data_c = vsqrtq_f32(data_a); vst1q_f32(float32_data_c + i,data_c); } } clock_gettime(CLOCK_MONOTONIC,&end); double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000)); printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n ); for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]); printf("\n"); free(float32_data_a); free(float32_data_b); free(float32_data_c); return 0;}