/*
 * Horizontal sum of the eight signed 16-bit lanes of `vec`, built from two
 * widening pairwise adds followed by a scalar add of the two 64-bit lanes.
 */
int32x4_t pairwiseAddedOnce = vpaddlq_s16(vec);                /* 8 x s16 -> 4 x s32 */
int64x2_t pairwiseAddedTwice = vpaddlq_s32(pairwiseAddedOnce); /* 4 x s32 -> 2 x s64 */
int64_t lowerLaneTotal = vgetq_lane_s64(pairwiseAddedTwice, 0);
int64_t upperLaneTotal = vgetq_lane_s64(pairwiseAddedTwice, 1);
/* The 64-bit total is converted down to int16_t here. */
int16_t sum = (int16_t)(lowerLaneTotal + upperLaneTotal);
/*
 * Horizontal sum of the eight signed 16-bit lanes of `vec`: first fold the
 * high half onto the low half, then reduce the four remaining lanes with two
 * widening pairwise adds.
 */
int16x4_t lowerHalf = vget_low_s16(vec);
int16x4_t upperHalf = vget_high_s16(vec);
/* vadd_s16 adds lane-wise at 16-bit width; the widening only starts below. */
int16x4_t addedDRegisters = vadd_s16(lowerHalf, upperHalf);
int32x2_t pairwiseAddedOnce = vpaddl_s16(addedDRegisters);    /* 4 x s16 -> 2 x s32 */
int64x1_t pairwiseAddedTwice = vpaddl_s32(pairwiseAddedOnce); /* 2 x s32 -> 1 x s64 */
int64_t wideTotal = vget_lane_s64(pairwiseAddedTwice, 0);
int16_t sum = (int16_t)wideTotal;
/*
 * Horizontal sum of the eight signed 16-bit lanes of `vec` that stays in
 * vector registers until the last step: two widening pairwise adds, a narrow
 * back to 32-bit lanes, and one more widening pairwise add.
 */
int32x4_t pairwiseAddedOnce = vpaddlq_s16(vec);                /* 8 x s16 -> 4 x s32 */
int64x2_t pairwiseAddedTwice = vpaddlq_s32(pairwiseAddedOnce); /* 4 x s32 -> 2 x s64 */
/* Narrow each 64-bit partial sum to 32 bits so the pair can be added once more. */
int32x2_t narrowed = vmovn_s64(pairwiseAddedTwice);
int64x1_t pairwiseAddedThrice = vpaddl_s32(narrowed);          /* 2 x s32 -> 1 x s64 */
int64_t wideTotal = vget_lane_s64(pairwiseAddedThrice, 0);
int16_t sum = (int16_t)wideTotal;
Is your code executed inside a loop, or do you need to apply the algorithm only once? Can you explain what you are trying to do?
/*
 * Splits the lanes of `multipliedVector_1` into lane 0 (left_sum_1) and the
 * sum of every other lane (right_sum_1). The right-hand sum is obtained by
 * totalling all lanes with two widening pairwise adds and subtracting lane 0.
 */
int32x4_t added32x4_1 = vpaddlq_s16(multipliedVector_1); /* 8 x s16 -> 4 x s32 */
int64x2_t added64x2_1 = vpaddlq_s32(added32x4_1);        /* 4 x s32 -> 2 x s64 */
int16_t left_sum_1 = vgetq_lane_s16(multipliedVector_1, 0);
int64_t all_lanes_total_1 = vgetq_lane_s64(added64x2_1, 0) + vgetq_lane_s64(added64x2_1, 1);
int64_t right_sum_1 = all_lanes_total_1 - left_sum_1;