Bug 117769 - RISC-V: Worse codegen in x264_pixel_satd_8x4
Summary: RISC-V: Worse codegen in x264_pixel_satd_8x4
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 15.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: spec
 
Reported: 2024-11-25 09:05 UTC by JuzheZhong
Modified: 2024-11-25 09:51 UTC
CC List: 4 users

See Also:
Host:
Target: riscv
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Description JuzheZhong 2024-11-25 09:05:06 UTC
Notice that we generate worse code than ARM SVE for the hottest function of 625.x264:

#include <stdint.h>
#include <math.h>
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    int t0 = s0 + s1;\
    int t1 = s0 - s1;\
    int t2 = s2 + s3;\
    int t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
uint32_t abs2( uint32_t a )
{
    uint32_t s = ((a>>15)&0x10001)*0xffff;
    return (a+s)^s;
}
int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    uint32_t tmp[4][4];
    uint32_t a0, a1, a2, a3;
    int sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
        a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
        a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
        a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
}

https://godbolt.org/z/4KsfMaTjo
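
As a side note on the arithmetic in the reproducer: abs2() recovers the absolute values of the two signed 16-bit halves that the kernel packs into one 32-bit word. Below is a minimal standalone check of that packing trick (my own illustration, not part of the reproducer; the main() harness is hypothetical):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Copy of abs2() from the reproducer above, repeated so this check is
   self-contained. */
static uint32_t abs2( uint32_t a )
{
    uint32_t s = ((a>>15)&0x10001)*0xffff;
    return (a+s)^s;
}

int main(void)
{
    /* Pack two small signed values the way the kernel does
       (lo + hi * 65536, evaluated in 32-bit arithmetic) and check that
       abs2() returns |lo| in the low half and |hi| in the high half.
       The tested range covers the coefficient magnitudes that can occur
       in the 4x4 Hadamard transform here (at most 4 * 4 * 255 = 4080). */
    for (int lo = -4080; lo <= 4080; lo += 17)
        for (int hi = -4080; hi <= 4080; hi += 17) {
            uint32_t packed   = (uint32_t)(lo + hi * 65536);
            uint32_t expected = (uint32_t)(abs(lo) + abs(hi) * 65536);
            assert(abs2(packed) == expected);
        }
    puts("abs2 packed absolute values: OK");
    return 0;
}

The packing lo + (hi << 16) is ordinary 32-bit arithmetic, so a negative low half borrows one from the high half; the carry inside abs2() puts it back, which is why both absolute values come out exact for these magnitudes.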

I notice that with this commit:
https://github.com/gcc-mirror/gcc/commit/cf261dd52272bdca767560131f3c2b4e1edae9ab?open_in_browser=true

We can get codegen similar to ARM SVE. We have also tested in QEMU and see a 20% increase in dynamic instruction count with trunk GCC compared with the commit described above. We need to bisect to find which commit hurt RISC-V performance.
Comment 1 Robin Dapp 2024-11-25 09:34:34 UTC
The SLP vec_perm patch went upstream since then, which seems pretty related as it specifically targets SATD's permutes.  Surprised to see a higher icount, though.
Comment 2 JuzheZhong 2024-11-25 09:36:13 UTC
OK, I see it is not an issue now.

When we enable -mno-vector-strict-align:

https://godbolt.org/z/MzqzPTcc6

We now get the same codegen as ARM SVE:

x264_pixel_satd_8x4(unsigned char*, int, unsigned char*, int):
        add     a6,a0,a1
        add     a4,a2,a3
        add     a7,a4,a3
        add     t1,a6,a1
        vsetivli        zero,8,e8,mf8,ta,ma
        add     a1,t1,a1
        add     a3,a7,a3
        vle8.v  v6,0(a1)
        vle8.v  v5,0(a3)
        lui     a5,%hi(.LC0)
        vle8.v  v3,0(a4)
        vle8.v  v4,0(a6)
        addi    a5,a5,%lo(.LC0)
        vlm.v   v0,0(a5)
        vle8.v  v10,0(t1)
        vle8.v  v9,0(a7)
        vle8.v  v2,0(a0)
        vle8.v  v8,0(a2)
        lui     a5,%hi(.LC1)
        vmv1r.v v15,v6
        vmv1r.v v11,v5
        addi    a5,a5,%lo(.LC1)
        vlm.v   v1,0(a5)
        vmv1r.v v14,v4
        vmv1r.v v13,v3
        vcompress.vm    v11,v9,v0
        vcompress.vm    v15,v10,v0
        vcompress.vm    v14,v2,v0
        vcompress.vm    v13,v8,v0
        vwsubu.vv       v12,v15,v11
        vmv1r.v v0,v1
        vslideup.vi     v7,v6,4
        vslideup.vi     v6,v5,4
        vslideup.vi     v5,v4,4
        vwsubu.vv       v11,v14,v13
        vslideup.vi     v4,v3,4
        vsetvli zero,zero,e32,mf2,ta,ma
        vsext.vf2       v3,v12
        vsetvli zero,zero,e8,mf8,ta,ma
        vcompress.vm    v6,v9,v0
        vcompress.vm    v5,v2,v0
        vcompress.vm    v7,v10,v0
        vsetvli zero,zero,e32,mf2,ta,ma
        vsext.vf2       v2,v11
        lui     a5,%hi(.LANCHOR0)
        vsetvli zero,zero,e8,mf8,ta,ma
        vcompress.vm    v4,v8,v0
        addi    a5,a5,%lo(.LANCHOR0)
        vsetvli zero,zero,e32,mf2,ta,ma
        vsll.vi v3,v3,16
        addi    a4,a5,32
        vsetvli zero,zero,e8,mf8,ta,ma
        vle32.v v8,0(a5)
        vle32.v v9,0(a4)
        vwsubu.vv       v10,v7,v6
        vsetvli zero,zero,e32,mf2,ta,ma
        vsll.vi v2,v2,16
        vsetvli zero,zero,e8,mf8,ta,ma
        vwsubu.vv       v7,v5,v4
        vsetvli zero,zero,e16,mf4,ta,ma
        vwadd.wv        v1,v3,v10
        vwadd.wv        v6,v2,v7
        addi    a4,a5,64
        vsetvli zero,zero,e32,mf2,ta,mu
        vrgather.vv     v2,v1,v8
        vrgather.vv     v4,v1,v9
        vle32.v v1,0(a4)
        addi    a5,a5,96
        vle32.v v7,0(a5)
        vrgather.vv     v5,v6,v8
        vadd.vv v8,v4,v2
        vrgather.vv     v3,v6,v9
        vmsgeu.vi       v11,v1,8
        vsub.vv v4,v4,v2
        vrgather.vv     v6,v8,v1
        vadd.vi v10,v1,-8
        vmv1r.v v0,v11
        vadd.vv v12,v5,v3
        vrgather.vv     v2,v8,v7
        vrgather.vv     v6,v4,v10,v0.t
        vadd.vi v8,v7,-8
        vmsgeu.vi       v0,v7,8
        vsub.vv v3,v3,v5
        vrgather.vv     v5,v12,v1
        vrgather.vv     v2,v4,v8,v0.t
        vmv1r.v v0,v11
        vrgather.vv     v1,v12,v7
        lui     a5,%hi(.LC6)
        vrgather.vv     v5,v3,v10,v0.t
        vmsgeu.vi       v0,v7,8
        addi    a5,a5,%lo(.LC6)
        vadd.vv v4,v2,v6
        vrgather.vv     v1,v3,v8,v0.t
        vlm.v   v0,0(a5)
        vsub.vv v2,v2,v6
        addi    sp,sp,-64
        vadd.vv v3,v5,v1
        addi    a4,sp,32
        vsub.vv v1,v1,v5
        vmerge.vvm      v2,v2,v4,v0
        addi    a5,sp,48
        vmerge.vvm      v1,v1,v3,v0
        vse32.v v2,0(a4)
        vmv.s.x v9,zero
        vse32.v v1,0(sp)
        vsetivli        zero,4,e32,mf2,ta,ma
        vle32.v v2,0(a5)
        vle32.v v1,0(a4)
        addi    a5,sp,16
        vsub.vv v3,v1,v2
        vle32.v v1,0(sp)
        vle32.v v2,0(a5)
        addi    a5,sp,48
        vsub.vv v5,v1,v2
        vle32.v v1,0(a5)
        vle32.v v2,0(a4)
        addi    a5,sp,16
        vadd.vv v4,v1,v2
        vle32.v v1,0(a5)
        vle32.v v2,0(sp)
        addi    sp,sp,64
        vadd.vv v2,v1,v2
        vsub.vv v1,v5,v3
        vadd.vv v3,v3,v5
        vadd.vv v5,v4,v2
        vsetivli        zero,8,e16,mf4,ta,ma
        vsra.vi v8,v1,15
        vsra.vi v7,v3,15
        vsetivli        zero,4,e32,mf2,ta,ma
        vsub.vv v2,v2,v4
        vadd.vv v1,v1,v8
        vadd.vv v3,v3,v7
        vsetivli        zero,8,e16,mf4,ta,ma
        vsra.vi v6,v5,15
        vsetivli        zero,4,e32,mf2,ta,ma
        vxor.vv v1,v1,v8
        vxor.vv v3,v3,v7
        vadd.vv v5,v5,v6
        vsetivli        zero,8,e16,mf4,ta,ma
        vsra.vi v4,v2,15
        vsetivli        zero,4,e32,mf2,ta,ma
        vadd.vv v1,v1,v3
        vxor.vv v5,v5,v6
        vadd.vv v2,v2,v4
        vadd.vv v1,v1,v5
        vxor.vv v2,v2,v4
        vadd.vv v1,v1,v2
        vredsum.vs      v1,v1,v9
        vmv.x.s a0,v1
        slli    a5,a0,48
        srli    a5,a5,48
        srliw   a0,a0,16
        addw    a0,a0,a5
        srli    a0,a0,1
        jr      ra
Comment 3 Robin Dapp 2024-11-25 09:43:00 UTC
OK, I see.  Those x264 functions are sensitive to alignment.  Right now the only tune model that enables unaligned vector access by default is generic-ooo.

But the commit you mentioned cannot have been OK then either?
Comment 4 JuzheZhong 2024-11-25 09:51:30 UTC
(In reply to Robin Dapp from comment #3)
> OK, I see.  Those x264 functions are sensitive to alignment.  Right now the
> only tune model that enables unaligned vector access by default is generic-ooo.
> 
> But the commit you mentioned cannot have been OK then either?

The commit I mentioned simply didn't check by default whether RISC-V requires strict vector alignment. A later commit added that alignment check, so now we need to use -mno-vector-strict-align to get the better codegen.
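
To make the alignment point concrete, here is a minimal sketch (my own illustration, not from this report; load_row8 is a hypothetical helper): pix1/pix2 are plain uint8_t pointers with no alignment guarantee, so treating a row's 8 bytes as one wider unit requires a misaligned-capable access, which is essentially what -mno-vector-strict-align asserts about the vector unit.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: one SATD row reads the 8 bytes pix[0..7], whose
   address is only known to be byte-aligned.  memcpy expresses an
   alignment-safe 8-byte load; with the default -mvector-strict-align the
   vectorizer must stay conservative about such groups, while
   -mno-vector-strict-align (enabled by default only by the generic-ooo
   tune model) lets it emit vector loads that are not known to be
   vector-aligned. */
static uint64_t load_row8( const uint8_t *pix )
{
    uint64_t row;
    memcpy( &row, pix, sizeof row );
    return row;
}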

Will close this issue soon.