This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Improve Performance of specific NEON functions using SVE/SVE2

Hello,

I have the following three functions that use the NEON instruction set:

// pixel_avg2_w8_neon: rounding average of two sources, 8 bytes per row.
// Register roles as used by the code below:
//   x0      store pointer, advanced by x1 after every stored row
//   x2, x4  the two load pointers, each advanced by x3 after every row
//   w5      row counter, decremented by 2 per loop pass
// NOTE(review): presumably x0/x1 are dst/dst_stride and w5 is the height —
// confirm against the C prototype, which is not visible here.
function pixel_avg2_w8_neon, export=1
1:
    subs        w5,  w5,  #2                // two rows per pass; flags are consumed by b.gt below
    ld1         {v0.8b}, [x2], x3           // row n from the first source
    ld1         {v2.8b}, [x4], x3           // row n from the second source
    urhadd      v0.8b,  v0.8b,  v2.8b       // per byte: (a + b + 1) >> 1
    ld1         {v1.8b}, [x2], x3           // row n+1 from the first source
    ld1         {v3.8b}, [x4], x3           // row n+1 from the second source
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.8b}, [x0], x1           // both rows are loaded before anything is stored
    st1         {v1.8b}, [x0], x1
    b.gt        1b                          // loop while the counter set by subs is still > 0
    ret
endfunc

// pixel_avg2_w16_neon: rounding average of two sources, 16 bytes per row.
// Register roles as used by the code below:
//   x0      store pointer, advanced by x1 after every stored row
//   x2, x4  the two load pointers, each advanced by x3 after every row
//   w5      row counter, decremented by 2 per loop pass
// NOTE(review): presumably x0/x1 are dst/dst_stride and w5 is the height —
// confirm against the C prototype, which is not visible here.
function pixel_avg2_w16_neon, export=1
1:
    subs        w5,  w5,  #2                // two rows per pass; flags are consumed by b.gt below
    ld1         {v0.16b}, [x2], x3          // row n from the first source
    ld1         {v2.16b}, [x4], x3          // row n from the second source
    urhadd      v0.16b, v0.16b, v2.16b      // per byte: (a + b + 1) >> 1
    ld1         {v1.16b}, [x2], x3          // row n+1 from the first source
    ld1         {v3.16b}, [x4], x3          // row n+1 from the second source
    urhadd      v1.16b, v1.16b, v3.16b
    st1         {v0.16b}, [x0], x1          // both rows are loaded before anything is stored
    st1         {v1.16b}, [x0], x1
    b.gt        1b                          // loop while the counter set by subs is still > 0
    ret
endfunc

// pixel_sad_\h\()_neon: sum of absolute differences over rows of 16 bytes.
//   x0 / x1  first block pointer / byte step added after each row
//   x2 / x3  second block pointer / byte step added after each row
//   w0       result (32-bit sum)
// \h is a parameter of an enclosing macro that is not shown here. Two rows
// are always processed up front, followed by \h / 2 - 1 further row pairs.
function pixel_sad_\h\()_neon, export=1
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    // First row pair initialises the two 8x16-bit accumulators:
    // v16 takes bytes 0-7 of each row, v17 takes bytes 8-15.
    uabdl       v16.8h,  v0.8b,  v1.8b
    uabdl2      v17.8h,  v0.16b, v1.16b
    uabal       v16.8h,  v2.8b,  v3.8b
    uabal2      v17.8h,  v2.16b, v3.16b

.rept \h / 2 - 1
    // Remaining row pairs: same loads, but accumulate instead of initialise.
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    uabal       v16.8h,  v0.8b,  v1.8b
    uabal2      v17.8h,  v0.16b, v1.16b
    uabal       v16.8h,  v2.8b,  v3.8b
    uabal2      v17.8h,  v2.16b, v3.16b
.endr
    add         v16.8h,  v16.8h,  v17.8h    // merge the two 16-bit accumulators
    uaddlv      s0,  v16.8h                 // widen and add across all eight lanes
    fmov        w0,  s0                     // move the scalar sum to the return register
    ret
endfunc

I want to use the SVE/SVE2 instruction set to improve the performance of these functions. My testbed is an Alibaba Yitian 710 (vector size = 128 bits).

For the first two, I couldn't find a way to improve the performance. For the last one, I wrote the following function:

// pixel_sad_\h\()_sve: sum of absolute differences over rows of 16 bytes,
// using SVE byte->halfword widening loads and NEON 16-bit accumulation.
//   x0 / x1  first block pointer / byte step added after each row
//   x2 / x3  second block pointer / byte step added after each row
//   w0       result (32-bit sum)
//   x4       scratch: constant 8, byte offset of the upper half of a row
// \h is a parameter of an enclosing macro that is not shown here. Two rows
// are always processed up front, followed by \h / 2 - 1 further row pairs.
//
// The upper 8 pixels of each row are addressed with a scalar+scalar offset
// ([xN, x4]) rather than [xN, #1, mul vl]: the "mul vl" immediate is scaled
// by the vector length (VL/16 bytes for a .h widening byte load), so it only
// equals 8 bytes when VL is 128 bits. With p0 = vl8 and an explicit 8-byte
// offset the function reads the same pixels on any vector length.
function pixel_sad_\h\()_sve, export=1
    ptrue       p0.h, vl8                   // exactly 8 halfword lanes, whatever VL is
    mov         x4,  #8

    // Row pair 0: each ld1b zero-extends 8 bytes into 8 halfword lanes.
    ld1b        {z1.h}, p0/z, [x2]          // second block, row 0, bytes 0-7
    ld1b        {z4.h}, p0/z, [x2, x4]      // second block, row 0, bytes 8-15
    add         x2, x2, x3
    ld1b        {z3.h}, p0/z, [x2]          // second block, row 1, bytes 0-7
    ld1b        {z6.h}, p0/z, [x2, x4]      // second block, row 1, bytes 8-15
    add         x2, x2, x3
    ld1b        {z0.h}, p0/z, [x0]          // first block, row 0, bytes 0-7
    ld1b        {z5.h}, p0/z, [x0, x4]      // first block, row 0, bytes 8-15
    add         x0, x0, x1
    ld1b        {z2.h}, p0/z, [x0]          // first block, row 1, bytes 0-7
    ld1b        {z7.h}, p0/z, [x0, x4]      // first block, row 1, bytes 8-15
    add         x0, x0, x1
    // The NEON views v0-v7 alias the low 128 bits of z0-z7, which hold the
    // 8 active halfword lanes. v16 accumulates bytes 0-7, v17 bytes 8-15.
    uabd        v16.8h,  v0.8h,  v1.8h
    uabd        v17.8h,  v5.8h,  v4.8h
    uaba        v16.8h,  v2.8h,  v3.8h
    uaba        v17.8h,  v7.8h,  v6.8h

.rept \h / 2 - 1
    ld1b        {z1.h}, p0/z, [x2]
    ld1b        {z4.h}, p0/z, [x2, x4]
    add         x2, x2, x3
    ld1b        {z3.h}, p0/z, [x2]
    ld1b        {z6.h}, p0/z, [x2, x4]
    add         x2, x2, x3
    ld1b        {z0.h}, p0/z, [x0]
    ld1b        {z5.h}, p0/z, [x0, x4]
    add         x0, x0, x1
    ld1b        {z2.h}, p0/z, [x0]
    ld1b        {z7.h}, p0/z, [x0, x4]
    add         x0, x0, x1
    uaba        v16.8h,  v0.8h,  v1.8h
    uaba        v17.8h,  v5.8h,  v4.8h
    uaba        v16.8h,  v2.8h,  v3.8h
    uaba        v17.8h,  v7.8h,  v6.8h
.endr

    add         v16.8h,  v16.8h,  v17.8h    // merge the two 16-bit accumulators
    uaddlv      s0,  v16.8h                 // widen and add across all eight lanes
    fmov        w0,  s0                     // move the scalar sum to the return register
    ret
endfunc

However, this degrades the performance instead of improving it.

Can someone help me?

Thank you in advance,

Akis

Parents
  • Hi George,

    For sub8x8_dct8_neon, I applied your suggestion and everything worked fine. Thanks!

    For the copy functions, as you said, there is not much left to do to improve the performance.

    For mbtree_propagate_list_internal_neon, I applied your suggestion. Thanks!

    For pixel_var2_8x\h\()_neon, I used the udot instruction, but it doesn't work. It seems that some vectors (for example v0.8h, v1.8h, v6.8h and v7.8h) are still needed after widening instructions. I developed the following function:

    // pixel_var2_8x\h\()_sve
    //   x0  first source, read as consecutive 8-byte groups (post-increment #8)
    //   x1  second source, read 8 bytes at a time with a 16-byte step (x3)
    //   x2  output: two 32-bit sums of squared differences, at [x2] and [x2, #4]
    //   w0  return: (ssd0 - (sum0^2 >> s)) + (ssd1 - (sum1^2 >> s)),
    //       with s = 6 + (\h >> 4)
    // Loads alternate between two interleaved 8-pixel streams:
    //   stream 0: v16/v18 -> |diff| in v28, signed diff in v6, sum in v0, squares in v30
    //   stream 1: v17/v19 -> |diff| in v29, signed diff in v7, sum in v1, squares in v31
    // Squares are formed as |a - b|^2 with uabd + udot, which needs the
    // dot-product extension. Despite the _sve suffix, the body uses only
    // Advanced SIMD instructions.
    // \h is a parameter of an enclosing macro that is not shown here.
    function pixel_var2_8x\h\()_sve, export=1
        movi            v30.4s, #0
        movi            v31.4s, #0
        mov             x3,  #16
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        mov             x5,  \h - 2
        uabd            v28.8b, v16.8b, v18.8b
        usubl           v0.8h,  v16.8b, v18.8b      // first row initialises the sum accumulators
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v1.8h,  v17.8b, v19.8b
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3

        udot            v30.2s, v28.8b, v28.8b
        udot            v31.2s, v29.8b, v29.8b

        uabd            v28.8b, v16.8b, v18.8b
        usubl           v6.8h,  v16.8b, v18.8b

    // On entry to each pass, v28/v6 hold the pending stream-0 row.
    1:  subs            x5,  x5,  #1
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        udot            v30.2s, v28.8b, v28.8b
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v7.8h,  v17.8b, v19.8b
        add             v0.8h,  v0.8h,  v6.8h
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3
        udot            v31.2s, v29.8b, v29.8b
        uabd            v28.8b, v16.8b, v18.8b
        usubl           v6.8h,  v16.8b, v18.8b
        add             v1.8h,  v1.8h,  v7.8h
        b.gt            1b

        // Tail: finish the pending stream-0 row and the last stream-1 row.
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        // Square the byte absolute differences (v28). v6 holds 16-bit
        // signed differences and must not be fed to the byte-wise udot.
        udot            v30.2s, v28.8b, v28.8b
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v7.8h,  v17.8b, v19.8b
        add             v0.8h,  v0.8h,  v6.8h
        udot            v31.2s, v29.8b, v29.8b
        add             v1.8h,  v1.8h,  v7.8h

        saddlv          s0,  v0.8h                  // signed sum of differences, stream 0
        saddlv          s1,  v1.8h                  // signed sum of differences, stream 1
        mov             w0,  v0.s[0]
        mov             w1,  v1.s[0]
        // udot on .2s clears the upper 64 bits of v30/v31, so reducing all
        // four lanes adds exactly the two live partial sums.
        addv            s2,  v30.4s
        addv            s4,  v31.4s
        mul             w0,  w0,  w0
        mul             w1,  w1,  w1
        // Read the reduced totals from v2/v4 (the addv destinations), not
        // lane 0 of the unreduced accumulators v30/v31.
        mov             w3,  v2.s[0]
        mov             w4,  v4.s[0]
        sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
        sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
        str             w3,  [x2]
        add             w0,  w0,  w1
        str             w4,  [x2, #4]

        ret
    endfunc
    

    The unit tests fail. Can you please tell me what I am doing wrong? Also, using the three load-merging instructions instead of the original four degrades the performance, and I do not know why.

    For pixel_sad_x_h\()_neon_10, I also agree that we cannot improve it.

    BR,

    Akis

Reply
  • Hi George,

    For sub8x8_dct8_neon, I applied your suggestion and everything worked fine. Thanks!

    For the copy functions, as you said, there is not much left to do to improve the performance.

    For mbtree_propagate_list_internal_neon, I applied your suggestion. Thanks!

    For pixel_var2_8x\h\()_neon, I used the udot instruction, but it doesn't work. It seems that some vectors (for example v0.8h, v1.8h, v6.8h and v7.8h) are still needed after widening instructions. I developed the following function:

    // pixel_var2_8x\h\()_sve
    //   x0  first source, read as consecutive 8-byte groups (post-increment #8)
    //   x1  second source, read 8 bytes at a time with a 16-byte step (x3)
    //   x2  output: two 32-bit sums of squared differences, at [x2] and [x2, #4]
    //   w0  return: (ssd0 - (sum0^2 >> s)) + (ssd1 - (sum1^2 >> s)),
    //       with s = 6 + (\h >> 4)
    // Loads alternate between two interleaved 8-pixel streams:
    //   stream 0: v16/v18 -> |diff| in v28, signed diff in v6, sum in v0, squares in v30
    //   stream 1: v17/v19 -> |diff| in v29, signed diff in v7, sum in v1, squares in v31
    // Squares are formed as |a - b|^2 with uabd + udot, which needs the
    // dot-product extension. Despite the _sve suffix, the body uses only
    // Advanced SIMD instructions.
    // \h is a parameter of an enclosing macro that is not shown here.
    function pixel_var2_8x\h\()_sve, export=1
        movi            v30.4s, #0
        movi            v31.4s, #0
        mov             x3,  #16
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        mov             x5,  \h - 2
        uabd            v28.8b, v16.8b, v18.8b
        usubl           v0.8h,  v16.8b, v18.8b      // first row initialises the sum accumulators
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v1.8h,  v17.8b, v19.8b
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3

        udot            v30.2s, v28.8b, v28.8b
        udot            v31.2s, v29.8b, v29.8b

        uabd            v28.8b, v16.8b, v18.8b
        usubl           v6.8h,  v16.8b, v18.8b

    // On entry to each pass, v28/v6 hold the pending stream-0 row.
    1:  subs            x5,  x5,  #1
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        udot            v30.2s, v28.8b, v28.8b
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v7.8h,  v17.8b, v19.8b
        add             v0.8h,  v0.8h,  v6.8h
        ld1             {v16.8b}, [x0], #8
        ld1             {v18.8b}, [x1], x3
        udot            v31.2s, v29.8b, v29.8b
        uabd            v28.8b, v16.8b, v18.8b
        usubl           v6.8h,  v16.8b, v18.8b
        add             v1.8h,  v1.8h,  v7.8h
        b.gt            1b

        // Tail: finish the pending stream-0 row and the last stream-1 row.
        ld1             {v17.8b}, [x0], #8
        ld1             {v19.8b}, [x1], x3
        // Square the byte absolute differences (v28). v6 holds 16-bit
        // signed differences and must not be fed to the byte-wise udot.
        udot            v30.2s, v28.8b, v28.8b
        uabd            v29.8b, v17.8b, v19.8b
        usubl           v7.8h,  v17.8b, v19.8b
        add             v0.8h,  v0.8h,  v6.8h
        udot            v31.2s, v29.8b, v29.8b
        add             v1.8h,  v1.8h,  v7.8h

        saddlv          s0,  v0.8h                  // signed sum of differences, stream 0
        saddlv          s1,  v1.8h                  // signed sum of differences, stream 1
        mov             w0,  v0.s[0]
        mov             w1,  v1.s[0]
        // udot on .2s clears the upper 64 bits of v30/v31, so reducing all
        // four lanes adds exactly the two live partial sums.
        addv            s2,  v30.4s
        addv            s4,  v31.4s
        mul             w0,  w0,  w0
        mul             w1,  w1,  w1
        // Read the reduced totals from v2/v4 (the addv destinations), not
        // lane 0 of the unreduced accumulators v30/v31.
        mov             w3,  v2.s[0]
        mov             w4,  v4.s[0]
        sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
        sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
        str             w3,  [x2]
        add             w0,  w0,  w1
        str             w4,  [x2, #4]

        ret
    endfunc
    

    The unit tests fail. Can you please tell me what I am doing wrong? Also, using the three load-merging instructions instead of the original four degrades the performance, and I do not know why.

    For pixel_sad_x_h\()_neon_10, I also agree that we cannot improve it.

    BR,

    Akis

Children
No data