This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Strange behaviour of uint8x8x2_t

Hi @ all,

I have an uint32x4_t on which I want to perform a count of leading zeros and a table lookup with intrinsics. The table lookup should be performed like this:

          0   1   2   3   4   5   6   7    8   9    A    B    C    D    E    F
  Data ||0x0,0x0,0x1,0x2|0x0,0x3,0x0,0x4||0x5,0x6, 0x7, 0x8| 0x0, 0x0, 0x0, 0x9||
 SMask ||0x2,0x3,0x5,0x6|0x7,0x8,0x9,0xA||0xB,0xF,0x10,0x10|0x10,0x10,0x10,0x10||
Result ||0x1,0x2,0x3,0x0|0x4,0x5,0x6,0x7||0x8,0x9, 0x0, 0x0| 0x0, 0x0, 0x0, 0x0||

Thus table lookup is only supported through uint8x8xN and the count of leading zeros is only possible in the way I want it with uint32x4_t I ran into the following problem:

This is my code so far:


#include <iostream>
#include <arm_neon.h>

inline uint8x16_t Shuffle(const uint8x16_t & src, const uint8x16_t & shuffle) {
  return vcombine_u8(
            (const uint8x8x2_t &)src, 
            (const uint8x8x2_t &)src, 
int main() {
   //lookupTableIdx = 64*a + 16*b + 4*c + d
   //shiftData contains the needed leftshifts to get the correct idx
   int32_t* shiftData = new int32_t[4];
   shiftData[0] = 6;
   shiftData[1] = 4;
   shiftData[2] = 2;
   shiftData[3] = 0;
   //load shiftData into int32x4_t vector register
   int32x4_t shiftVec = vld1q_s32(shiftData);
   uint32_t* clzData = new uint32_t[4];

   uint32_t* data32 = new uint32_t[4];
   data32[0] = 258;             // [0x00 0x00 0x01 0x02]
   data32[1] = 196612;          // [0x00 0x03 0x00 0x04]
   data32[2] = 84281096;        // [0x05 0x06 0x07 0x08]
   data32[3] = 9;               // [0x00 0x00 0x00 0x09]

   //load structure
   uint32x4_t data32Vec = vld1q_u32(data32);
   //count leading zeros --> divide by 8 through >>3, shift with shiftdata
   //maybe optim: add in register vadd(uint32x2_t,uint32x3_t)
   uint32x4_t clzReg = vshlq_u32(vshrq_n_u32(vclzq_u32(data32Vec),3),shiftVec);
   //store clzReg into an uint32_t[4]
   vst1q_u32(clzData, clzReg);
   //lookupIdx calculation through addition
   uint8_t idx = (uint8_t)(clzData[0]+clzData[1]+clzData[2]+clzData[3]);
   std::cout <<"permutationMaskIdx = "<< (unsigned) idx << std::endl;
   uint8_t* sMask = new uint8_t[16];

   sMask[0] = 2;
   sMask[1] = 3;
   sMask[2] = 5;
   sMask[3] = 6;
   sMask[4] = 7;
   sMask[5] = 8;
   sMask[6] = 9;
   sMask[7] = 10;
   sMask[8] = 11;
   sMask[9] = 15;
   sMask[10] = 16;
   sMask[11] = 16;
   sMask[12] = 16;
   sMask[13] = 16;
   sMask[14] = 16;
   sMask[15] = 16;

   /*load permutationmask into vector register*/
   uint8x16_t shuffleMask = vld1q_u8(sMask);
   uint8_t* comprData = new uint8_t[16];
   vst1q_u8(comprData, Shuffle(vreinterpretq_u8_u32(data32Vec),shuffleMask));
   for(int i = 0; i < 16; ++i) {
      std::cout << (unsigned)comprData[i] << "   " ;
   std::cout << std::endl;
   delete[] comprData;
   delete[] sMask;
   delete[] data32;
   return 0;


If I compile that using:

g++ -march=native -mfpu=neon -std=c++14 main.cpp

and run it, the output is this:

0   0   0   3   0   8   7   6   5   0   0   0   0   0   0   0

But it should be this:

1   2   3   0   4   5   6   7   8   9   0   0   0   0   0   0

Does anyone know, what I am doing wrong?



  • It's not clear to me why you want to go via uint32x4_t, VCLZ is able to work directly on uint8s.

    As an example, the following non-Neon code (that converts sixteen 8-bit values to the alpha-numerical character equivalent to each count-lead-zero) :

    void func(const uint8_t *src, uint8_t *dst)
      static const uint8_t table[] =
        { '0','1','2','3','4','5','6','7','8' };
      int i;
        dst[i] = table[__builtin_clz(src[i])];

    Could be expressed in Neon intrinsics using VCLZ and VTBL as something along the lines of:

    void func(const uint8_t *src, uint8_t *dst)
      uint8x8x2_t tbl;
      uint8x16_t in, idx, tab;
      uint8x8_t res0, res1;
      uint8x16_t out;
      static const uint8_t table[] =
        { '0','1','2','3','4','5','6','7',
          '8',0,0,0,0,0,0,0 }; // pad to 16 bytes.
      tab = vld1q_u8(table);
      tbl.val[0] = vget_low_u8(tab);
      tbl.val[1] = vget_high_u8(tab);
      in   = vld1q_u8(src);
      idx  = vclzq_u8(in);
      res0 = vtbl2_u8(tbl, vget_low_u8(idx));
      res1 = vtbl2_u8(tbl, vget_high_u8(idx));
      out  = vcombine_u8(res0,res1);
      vst1q_u8(dst, out);



  • It's not clear to me why you want to go via uint32x4_t, VCLZ is able to work directly on uint8s.

    As an example, the following non-Neon code (that converts sixteen 8-bit values to the alpha-numerical character equivalent to each count-lead-zero) :

    void func(const uint8_t *src, uint8_t *dst)
      static const uint8_t table[] =
        { '0','1','2','3','4','5','6','7','8' };
      int i;
        dst[i] = table[__builtin_clz(src[i])];

    Could be expressed in Neon intrinsics using VCLZ and VTBL as something along the lines of:

    void func(const uint8_t *src, uint8_t *dst)
      uint8x8x2_t tbl;
      uint8x16_t in, idx, tab;
      uint8x8_t res0, res1;
      uint8x16_t out;
      static const uint8_t table[] =
        { '0','1','2','3','4','5','6','7',
          '8',0,0,0,0,0,0,0 }; // pad to 16 bytes.
      tab = vld1q_u8(table);
      tbl.val[0] = vget_low_u8(tab);
      tbl.val[1] = vget_high_u8(tab);
      in   = vld1q_u8(src);
      idx  = vclzq_u8(in);
      res0 = vtbl2_u8(tbl, vget_low_u8(idx));
      res1 = vtbl2_u8(tbl, vget_high_u8(idx));
      out  = vcombine_u8(res0,res1);
      vst1q_u8(dst, out);


