We are running a survey to help us improve the experience for all of our members. If you see the survey appear, please take the time to tell us about your experience if you can.
Hi @ all,
I have a uint32x4_t on which I want to perform a count of leading zeros and a table lookup with intrinsics. The table lookup should be performed like this:
       ||   0    1    2    3|   4    5    6    7||   8    9    A    B|   C    D    E    F||
Data   || 0x0, 0x0, 0x1, 0x2| 0x0, 0x3, 0x0, 0x4|| 0x5, 0x6, 0x7, 0x8| 0x0, 0x0, 0x0, 0x9||
SMask  || 0x2, 0x3, 0x5, 0x6| 0x7, 0x8, 0x9, 0xA|| 0xB, 0xF,0x10,0x10|0x10,0x10,0x10,0x10||
Result || 0x1, 0x2, 0x3, 0x0| 0x4, 0x5, 0x6, 0x7|| 0x8, 0x9, 0x0, 0x0| 0x0, 0x0, 0x0, 0x0||
Since table lookup is only supported through uint8x8xN and the count of leading zeros is only possible in the way I want it with uint32x4_t, I ran into the following problem:
This is my code so far:
#include <iostream> #include <arm_neon.h> inline uint8x16_t Shuffle(const uint8x16_t & src, const uint8x16_t & shuffle) { return vcombine_u8( vtbl2_u8( (const uint8x8x2_t &)src, vget_low_u8(shuffle) ), vtbl2_u8( (const uint8x8x2_t &)src, vget_high_u8(shuffle) ) ); } int main() { //FIX PART ONLY RUN ONCE //lookupTableIdx = 64*a + 16*b + 4*c + d //shiftData contains the needed leftshifts to get the correct idx int32_t* shiftData = new int32_t[4]; shiftData[0] = 6; shiftData[1] = 4; shiftData[2] = 2; shiftData[3] = 0; //load shiftData into int32x4_t vector register int32x4_t shiftVec = vld1q_s32(shiftData); uint32_t* clzData = new uint32_t[4]; // END OF FIXED PART uint32_t* data32 = new uint32_t[4]; data32[0] = 258; // [0x00 0x00 0x01 0x02] data32[1] = 196612; // [0x00 0x03 0x00 0x04] data32[2] = 84281096; // [0x05 0x06 0x07 0x08] data32[3] = 9; // [0x00 0x00 0x00 0x09] //load structure uint32x4_t data32Vec = vld1q_u32(data32); //count leading zeros --> divide by 8 through >>3, shift with shiftdata //maybe optim: add in register vadd(uint32x2_t,uint32x3_t) uint32x4_t clzReg = vshlq_u32(vshrq_n_u32(vclzq_u32(data32Vec),3),shiftVec); //store clzReg into an uint32_t[4] vst1q_u32(clzData, clzReg); //lookupIdx calculation through addition uint8_t idx = (uint8_t)(clzData[0]+clzData[1]+clzData[2]+clzData[3]); std::cout <<"permutationMaskIdx = "<< (unsigned) idx << std::endl; uint8_t* sMask = new uint8_t[16]; sMask[0] = 2; sMask[1] = 3; sMask[2] = 5; sMask[3] = 6; sMask[4] = 7; sMask[5] = 8; sMask[6] = 9; sMask[7] = 10; sMask[8] = 11; sMask[9] = 15; sMask[10] = 16; sMask[11] = 16; sMask[12] = 16; sMask[13] = 16; sMask[14] = 16; sMask[15] = 16; /*load permutationmask into vector register*/ uint8x16_t shuffleMask = vld1q_u8(sMask); uint8_t* comprData = new uint8_t[16]; vst1q_u8(comprData, Shuffle(vreinterpretq_u8_u32(data32Vec),shuffleMask)); for(int i = 0; i < 16; ++i) { std::cout << (unsigned)comprData[i] << " " ; } std::cout << std::endl; delete[] comprData; delete[] sMask; 
delete[] data32; return 0; }
If I compile that using:
g++ -march=native -mfpu=neon -std=c++14 main.cpp
and run it, the output is this:
0 0 0 3 0 8 7 6 5 0 0 0 0 0 0 0
But it should be this:
1 2 3 0 4 5 6 7 8 9 0 0 0 0 0 0
Does anyone know what I am doing wrong?
Sincerely