This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

NEON intrinsics vector division and reciprocal functions not found

Hi all,

I am working with a simple sqrt kernel (the code is given at the end of this post). It calculates the square root of each element of a given array and stores the results in a new array.

However, when compiling with gcc as follows: gcc -mcpu=cortex-a53 -mfpu=neon neon_sqrt_kernel.c

I get the following error

neon_sqrt_kernel.c: In function ‘main’:
neon_sqrt_kernel.c:70:12: warning: implicit declaration of function ‘vsqrtq_f32’ [-Wimplicit-function-declaration]
   data_c = vsqrtq_f32(data_a);
            ^~~~~~~~~~
neon_sqrt_kernel.c:70:10: error: incompatible types when assigning to type ‘float32x4_t’ from type ‘int’
   data_c = vsqrtq_f32(data_a);

However, the NEON intrinsics manual clearly indicates that such a function does exist (https://developer.arm.com/technologies/neon/intrinsics).

Could anyone clarify the above dilemma for me? Why is the above function not declared?

Code

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "arm_neon.h"

/*
 * Author : Aketh TM
 * This kernel was developed as a part of an effort to test the effectiveness of aligned instructions 
 * using NEON SIMD Intrinsics
 */

/* Evaluates to non-zero when the address held in PTR is a whole multiple of ALIGN bytes. */
#define is_aligned(PTR, ALIGN) (0 == ((uintptr_t)(const void *)(PTR)) % (ALIGN))

int main (int argc,char** argv) 
{
    unsigned int n = atoi(argv[1]);

    /* Create custom arbitrary data. */
    float32_t* float32_data_a;
    float32_t* float32_data_b; 
    float32_t* float32_data_c;
    struct timespec start,end;
	
    size_t size = sizeof(float32_t) * n;

    // Allocate memory aligned at 16 byte boundaries
    float32_data_a = (float32_t *) aligned_alloc(16,size);     
    float32_data_c = (float32_t *) aligned_alloc(16,size);
  
    if( __ARM_NEON__) printf("Neon unit detected \n");

    // Check for alignment
    if( is_aligned(float32_data_a,16) )
       	printf("Address of float32_data_a is aligned with address %d \n",(uintptr_t)(const void *) float32_data_a);
    else
	printf("Array a is unaligned \n");
 
    if( is_aligned(float32_data_b,16) )
       	printf("Address of float32_data_b is aligned with address %d \n",(uintptr_t)(const void *) float32_data_b);
    else
	printf("Array b is unaligned \n");
 
    if( is_aligned(float32_data_c,16) )
       	printf("Address of float32_data_c is aligned with address %d \n",(uintptr_t)(const void *) float32_data_c);
    else
	printf("Array c is unaligned \n");


    for(uint32_t i = 0; i < n ; i++) 
    {
	    float32_data_a[i] = i;
	    float32_data_c[i] = i;
    }

    /* Create the vector with our data. */
    float32x4_t data_a;
    float32x4_t data_b;
    float32x4_t data_c;

   
    clock_gettime(CLOCK_MONOTONIC,&start);

    for(int count = 0; count < 1; count++)
    {
    	for(unsigned int i = 0; i < n ; i+=4)
    	{	 
    		/* Load our custom data into the vector register. */
    		data_a  = vld1q_f32 (float32_data_a + i); 
		data_c = vsqrtq_f32(data_a);
    		vst1q_f32(float32_data_c + i,data_c); 
	}	
    }

    clock_gettime(CLOCK_MONOTONIC,&end);

    double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000));
    printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n );

    for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]); 
    printf("\n");
    
    free(float32_data_a);
    free(float32_data_b);
    free(float32_data_c);

    return 0;
}

Parents
  • I did attach the code. However, it seems like the attachment wasn't processed and posted as part of the thread. Sorry for the inconvenience.

    The code for the program I am referring to is the following:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include "arm_neon.h"

    int main (int argc,char** argv)
    {
        unsigned int n = 512;

        /* Create custom arbitrary data. */
        float32_t* float32_data_a;
        float32_t* float32_data_c;
        struct timespec start,end;
        
        size_t size = sizeof(float32_t) * n;

        // Allocate memory aligned at 16 byte boundaries
        float32_data_a = (float32_t *) aligned_alloc(16,size);     
        float32_data_c = (float32_t *) aligned_alloc(16,size);
     
        if( __ARM_NEON__) printf("Neon unit detected \n");


        for(uint32_t i = 0; i < n ; i++)
        {
            float32_data_a[i] = i;
            float32_data_c[i] = i;
        }

        /* Create the vector with our data. */
        float32x4_t data_a;
        float32x4_t data_c;

       
        clock_gettime(CLOCK_MONOTONIC,&start);

        for(int count = 0; count < 1; count++)
        {
            for(unsigned int i = 0; i < n ; i+=4)
            {    
                /* Load our custom data into the vector register. */
                data_a  = vld1q_f32 (float32_data_a + i);
                data_c = vsqrtq_f32(data_a);
                vst1q_f32(float32_data_c + i,data_c);
        }    
        }

        clock_gettime(CLOCK_MONOTONIC,&end);

        double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000));
        printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n );

        for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]);
        printf("\n");
        
        free(float32_data_a);
        free(float32_data_b);
        free(float32_data_c);

        return 0;
    }

Reply
  • I did attach the code. However, it seems like the attachment wasn't processed and posted as part of the thread. Sorry for the inconvenience.

    The code for the program I am referring to is the following:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include "arm_neon.h"

    int main (int argc,char** argv)
    {
        unsigned int n = 512;

        /* Create custom arbitrary data. */
        float32_t* float32_data_a;
        float32_t* float32_data_c;
        struct timespec start,end;
        
        size_t size = sizeof(float32_t) * n;

        // Allocate memory aligned at 16 byte boundaries
        float32_data_a = (float32_t *) aligned_alloc(16,size);     
        float32_data_c = (float32_t *) aligned_alloc(16,size);
     
        if( __ARM_NEON__) printf("Neon unit detected \n");


        for(uint32_t i = 0; i < n ; i++)
        {
            float32_data_a[i] = i;
            float32_data_c[i] = i;
        }

        /* Create the vector with our data. */
        float32x4_t data_a;
        float32x4_t data_c;

       
        clock_gettime(CLOCK_MONOTONIC,&start);

        for(int count = 0; count < 1; count++)
        {
            for(unsigned int i = 0; i < n ; i+=4)
            {    
                /* Load our custom data into the vector register. */
                data_a  = vld1q_f32 (float32_data_a + i);
                data_c = vsqrtq_f32(data_a);
                vst1q_f32(float32_data_c + i,data_c);
        }    
        }

        clock_gettime(CLOCK_MONOTONIC,&end);

        double time_usec=(((double)end.tv_sec * 1000000 + (double)end.tv_nsec/1000) - ((double)start.tv_sec *1000000 + (double)start.tv_nsec/1000));
        printf("Time taken for square root is : %fus to process data of size %d \n", time_usec,n );

        for(int i = 0; i < 5 ; i++) printf("%f ",float32_data_c[i]);
        printf("\n");
        
        free(float32_data_a);
        free(float32_data_b);
        free(float32_data_c);

        return 0;
    }

Children
No data