Hi @ all,
I have a uint32x4_t on which I want to perform a count of leading zeros and a table lookup using intrinsics. The table lookup should be performed like this:
Index  ||  0    1    2    3  |  4    5    6    7  ||  8    9    A    B  |  C    D    E    F  ||
Data   || 0x0  0x0  0x1  0x2 | 0x0  0x3  0x0  0x4 || 0x5  0x6  0x7  0x8 | 0x0  0x0  0x0  0x9 ||
SMask  || 0x2  0x3  0x5  0x6 | 0x7  0x8  0x9  0xA || 0xB  0xF  0x10 0x10| 0x10 0x10 0x10 0x10||
Result || 0x1  0x2  0x3  0x0 | 0x4  0x5  0x6  0x7 || 0x8  0x9  0x0  0x0 | 0x0  0x0  0x0  0x0 ||
Since table lookup is only supported through uint8x8xN_t, and the count of leading zeros is only possible in the way I want it with uint32x4_t, I ran into the following problem:
This is my code so far:
#include <iostream>
#include <arm_neon.h>

/**
 * Byte-wise table lookup over a full 128-bit register.
 *
 * Each byte of `shuffle` selects one byte of `src` by index. The lookup is
 * done in two halves because vtbl2_u8 produces only eight result bytes per
 * call.
 *
 * @param src     16-byte table to pick bytes from.
 * @param shuffle 16 byte indices into `src`.
 * @return the 16 selected bytes, in the order given by `shuffle`.
 */
inline uint8x16_t Shuffle(const uint8x16_t & src, const uint8x16_t & shuffle)
{
    // Build the two-register table from the halves of src rather than
    // reinterpreting the 128-bit register through a reference cast.
    const uint8x8x2_t table = {{ vget_low_u8(src), vget_high_u8(src) }};

    return vcombine_u8(vtbl2_u8(table, vget_low_u8(shuffle)),
                       vtbl2_u8(table, vget_high_u8(shuffle)));
}

/**
 * Demo: derives a permutation-mask index from the per-lane leading-zero byte
 * counts of four 32-bit values, then compacts the bytes of those values with
 * a fixed shuffle mask and prints the result.
 *
 * All buffers are fixed-size stack arrays, so nothing has to be freed.
 */
int main()
{
    // FIXED PART, ONLY RUN ONCE
    // lookupTableIdx = 64*a + 16*b + 4*c + d
    // shiftData contains the left shifts needed to get the correct idx
    const int32_t shiftData[4] = { 6, 4, 2, 0 };
    // load shiftData into an int32x4_t vector register
    const int32x4_t shiftVec = vld1q_s32(shiftData);
    uint32_t clzData[4];
    // END OF FIXED PART

    const uint32_t data32[4] = {
        258,        // 0x00000102 (intended bytes: 0x00 0x00 0x01 0x02)
        196612,     // 0x00030004 (intended bytes: 0x00 0x03 0x00 0x04)
        84281096,   // 0x05060708 (intended bytes: 0x05 0x06 0x07 0x08)
        9           // 0x00000009 (intended bytes: 0x00 0x00 0x00 0x09)
    };
    // load the four 32-bit values
    const uint32x4_t data32Vec = vld1q_u32(data32);

    // count leading zeros --> divide by 8 through >>3, shift with shiftData
    // maybe optim: do the addition in registers
    const uint32x4_t clzReg =
        vshlq_u32(vshrq_n_u32(vclzq_u32(data32Vec), 3), shiftVec);

    // store clzReg into a uint32_t[4]
    vst1q_u32(clzData, clzReg);

    // lookupIdx calculation through addition
    const uint8_t idx =
        static_cast<uint8_t>(clzData[0] + clzData[1] + clzData[2] + clzData[3]);
    std::cout << "permutationMaskIdx = " << static_cast<unsigned>(idx) << std::endl;

    const uint8_t sMask[16] = {
        2, 3, 5, 6, 7, 8, 9, 10, 11, 15, 16, 16, 16, 16, 16, 16
    };
    // load the permutation mask into a vector register
    const uint8x16_t shuffleMask = vld1q_u8(sMask);

    uint8_t comprData[16];
    vst1q_u8(comprData, Shuffle(vreinterpretq_u8_u32(data32Vec), shuffleMask));

    for (int i = 0; i < 16; ++i)
    {
        std::cout << static_cast<unsigned>(comprData[i]) << " ";
    }
    std::cout << std::endl;

    return 0;
}
If I compile that using:
g++ -march=native -mfpu=neon -std=c++14 main.cpp
and run it, the output is this:
0 0 0 3 0 8 7 6 5 0 0 0 0 0 0 0
But it should be this:
1 2 3 0 4 5 6 7 8 9 0 0 0 0 0 0
Does anyone know what I am doing wrong?
Sincerely
It's not clear to me why you want to go via uint32x4_t; VCLZ is able to work directly on uint8s.
As an example, the following non-Neon code (which converts each of sixteen 8-bit values to the ASCII digit corresponding to its count of leading zeros):
/**
 * Scalar reference: for each of 16 input bytes, writes the ASCII digit
 * ('0'..'8') equal to that byte's count of leading zero bits.
 *
 * @param src 16 input bytes.
 * @param dst receives 16 ASCII digits.
 */
void func(const uint8_t *src, uint8_t *dst)
{
    static const uint8_t table[] = { '0','1','2','3','4','5','6','7','8' };

    // __builtin_clz works on unsigned int, so a promoted byte reports this
    // many extra leading zeros on top of its own 8-bit count.
    const int extra_bits = static_cast<int>(sizeof(unsigned int) * 8) - 8;

    for (int i = 0; i < 16; i++) {
        const unsigned int value = src[i];
        // __builtin_clz(0) is undefined; an all-zero byte has 8 leading zeros.
        const int leading = value ? __builtin_clz(value) - extra_bits : 8;
        dst[i] = table[leading];
    }
}
Could be expressed in Neon intrinsics using VCLZ and VTBL as something along the lines of:
/**
 * NEON version: for each of 16 input bytes, writes the table entry selected
 * by that byte's count of leading zero bits (ASCII '0'..'8').
 *
 * @param src 16 input bytes.
 * @param dst receives 16 looked-up bytes.
 */
void func(const uint8_t *src, uint8_t *dst)
{
    // Nine digits, padded to 16 bytes so the table fills one 128-bit register.
    static const uint8_t table[] = { '0','1','2','3','4','5','6','7',
                                     '8',0,0,0,0,0,0,0 };

    // vtbl2_u8 takes its table as a pair of 64-bit registers.
    const uint8x16_t lut = vld1q_u8(table);
    uint8x8x2_t lutHalves;
    lutHalves.val[0] = vget_low_u8(lut);
    lutHalves.val[1] = vget_high_u8(lut);

    // Per-byte leading-zero counts serve directly as table indices.
    const uint8x16_t counts = vclzq_u8(vld1q_u8(src));

    // Look up eight results at a time, then rejoin the halves.
    const uint8x8_t lower = vtbl2_u8(lutHalves, vget_low_u8(counts));
    const uint8x8_t upper = vtbl2_u8(lutHalves, vget_high_u8(counts));

    vst1q_u8(dst, vcombine_u8(lower, upper));
}
hth
Simon.
Isn't there a huge difference between counting the leading zeros of 8-bit and of 32-bit values? E.g.:
uint32_t a = 0x000A000A;
uint8_t b[] = { 0x00, 0x0A, 0x00, 0x0A };
clz(a) = 12              // it is 00000000 00001010...
clz(b) = { 8, 4, 8, 4 }
One could argue that we could perform some kind of addition, but it seems a bit rough to me.
DorJo, your problem appears to reside in how you initialize "data32".
uint32_t data[] = { 258, 196612, 84281096, 9 }; /* { 0x00000102, 0x00030004, 0x05060708, 0x00000009 } */
has, on a little-endian target, the same memory layout as:
uint8_t data[] = { 0x02, 0x01, 0x00, 0x00, 0x04, 0x00, 0x03, 0x00, ... }; /* least-significant byte of each 32-bit word first */
and not:
uint8_t data[] = { 0x00, 0x00, 0x01, 0x02, 0x00, 0x03, 0x00, 0x04, ... }; /* most-significant byte of each 32-bit word first */
as you appear to require.