[gcc r14-6783] RISC-V: Add dynamic LMUL test for x264
Pan Li
panli@gcc.gnu.org
Thu Dec 21 10:43:51 GMT 2023
https://gcc.gnu.org/g:41a5f67db3cd625d2f3cfba6a87d3530a3291f33
commit r14-6783-g41a5f67db3cd625d2f3cfba6a87d3530a3291f33
Author: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Date: Thu Dec 21 16:57:50 2023 +0800
RISC-V: Add dynamic LMUL test for x264
While evaluating x264 performance, I noticed that the best LMUL for this case with -march=rv64gcv is LMUL = 2:
LMUL = 1:
x264_pixel_8x8:
add a4,a1,a2
addi a6,a0,16
vsetivli zero,4,e8,mf4,ta,ma
add a5,a4,a2
vle8.v v12,0(a6)
vle8.v v2,0(a4)
addi a6,a0,4
addi a4,a4,4
vle8.v v11,0(a6)
vle8.v v9,0(a4)
addi a6,a1,4
addi a4,a0,32
vle8.v v13,0(a0)
vle8.v v1,0(a1)
vle8.v v4,0(a6)
vle8.v v8,0(a4)
vle8.v v7,0(a5)
vwsubu.vv v3,v13,v1
add a3,a5,a2
addi a6,a0,20
addi a4,a0,36
vle8.v v10,0(a6)
vle8.v v6,0(a4)
addi a5,a5,4
vle8.v v5,0(a5)
vsetvli zero,zero,e16,mf2,ta,mu
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v1,v12,v2
vsetvli zero,zero,e16,mf2,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vmv1r.v v2,v1
vwadd.vv v1,v3,v2
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v2,v11,v4
vsetvli zero,zero,e16,mf2,ta,mu
vmslt.vi v0,v2,0
vneg.v v2,v2,v0.t
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v3,v10,v9
vsetvli zero,zero,e16,mf2,ta,mu
vmv1r.v v4,v2
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.vv v2,v4,v3
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v3,v8,v7
vsetvli zero,zero,e16,mf2,ta,mu
add a4,a3,a2
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v1,v1,v3
vsetvli zero,zero,e8,mf4,ta,ma
add a5,a4,a2
vwsubu.vv v3,v6,v5
addi a6,a0,48
vsetvli zero,zero,e16,mf2,ta,mu
vle8.v v16,0(a3)
vle8.v v12,0(a4)
addi a3,a3,4
addi a4,a4,4
vle8.v v17,0(a6)
vle8.v v14,0(a3)
vle8.v v10,0(a4)
vle8.v v8,0(a5)
add a6,a5,a2
addi a3,a0,64
addi a4,a0,80
addi a5,a5,4
vle8.v v13,0(a3)
vle8.v v4,0(a5)
vle8.v v9,0(a4)
vle8.v v6,0(a6)
vmslt.vi v0,v3,0
addi a7,a0,52
vneg.v v3,v3,v0.t
vle8.v v15,0(a7)
vwadd.wv v2,v2,v3
addi a3,a0,68
addi a4,a0,84
vle8.v v11,0(a3)
vle8.v v5,0(a4)
addi a5,a0,96
vle8.v v7,0(a5)
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v3,v17,v16
vsetvli zero,zero,e16,mf2,ta,mu
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v1,v1,v3
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v3,v15,v14
vsetvli zero,zero,e16,mf2,ta,mu
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v2,v2,v3
vsetvli zero,zero,e8,mf4,ta,ma
vwsubu.vv v3,v13,v12
vsetvli zero,zero,e16,mf2,ta,mu
slli a4,a2,3
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v1,v1,v3
vsetvli zero,zero,e8,mf4,ta,ma
sub a4,a4,a2
vwsubu.vv v3,v11,v10
vsetvli zero,zero,e16,mf2,ta,mu
add a1,a1,a4
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v2,v2,v3
vsetvli zero,zero,e8,mf4,ta,ma
lbu a7,0(a1)
vwsubu.vv v3,v9,v8
lbu a5,112(a0)
vsetvli zero,zero,e16,mf2,ta,mu
subw a5,a5,a7
vmslt.vi v0,v3,0
lbu a3,113(a0)
vneg.v v3,v3,v0.t
lbu a4,1(a1)
vwadd.wv v1,v1,v3
addi a6,a6,4
vsetvli zero,zero,e8,mf4,ta,ma
subw a3,a3,a4
vwsubu.vv v3,v5,v4
addi a2,a0,100
vsetvli zero,zero,e16,mf2,ta,mu
vle8.v v4,0(a6)
sraiw a6,a5,31
vle8.v v5,0(a2)
sraiw a7,a3,31
vmslt.vi v0,v3,0
xor a2,a5,a6
vneg.v v3,v3,v0.t
vwadd.wv v2,v2,v3
vsetvli zero,zero,e8,mf4,ta,ma
lbu a4,114(a0)
vwsubu.vv v3,v7,v6
lbu t1,2(a1)
vsetvli zero,zero,e16,mf2,ta,mu
subw a2,a2,a6
xor a6,a3,a7
vmslt.vi v0,v3,0
subw a4,a4,t1
vneg.v v3,v3,v0.t
lbu t1,3(a1)
vwadd.wv v1,v1,v3
lbu a5,115(a0)
subw a6,a6,a7
vsetvli zero,zero,e8,mf4,ta,ma
li a7,0
vwsubu.vv v3,v5,v4
sraiw t3,a4,31
vsetvli zero,zero,e16,mf2,ta,mu
subw a5,a5,t1
vmslt.vi v0,v3,0
vneg.v v3,v3,v0.t
vwadd.wv v2,v2,v3
sraiw t1,a5,31
vsetvli zero,zero,e32,m1,ta,ma
xor a4,a4,t3
vadd.vv v1,v1,v2
vmv.s.x v2,a7
vredsum.vs v1,v1,v2
vmv.x.s a7,v1
addw a2,a7,a2
subw a4,a4,t3
addw a6,a6,a2
xor a2,a5,t1
lbu a3,116(a0)
lbu t4,4(a1)
addw a4,a4,a6
subw a2,a2,t1
lbu a5,5(a1)
subw a3,a3,t4
addw a2,a2,a4
lbu a4,117(a0)
lbu t1,6(a1)
sraiw a7,a3,31
subw a4,a4,a5
lbu a5,118(a0)
sraiw a6,a4,31
subw a5,a5,t1
xor a3,a3,a7
lbu t1,7(a1)
lbu a0,119(a0)
sraiw a1,a5,31
subw a0,a0,t1
subw a3,a3,a7
xor a4,a4,a6
addw a3,a3,a2
subw a4,a4,a6
sraiw a2,a0,31
xor a5,a5,a1
addw a4,a4,a3
subw a5,a5,a1
xor a0,a0,a2
addw a5,a5,a4
subw a0,a0,a2
addw a0,a0,a5
ret
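Each vector block in the listing above (and again in the dynamic listing below) implements the same absolute-difference step. As a scalar C model of that pattern (my sketch, not part of the commit):

static inline int
abs_diff_accumulate (const unsigned char *a, const unsigned char *b,
                     int n, int acc)
{
  for (int i = 0; i < n; i++)
    {
      int d = a[i] - b[i];  /* vwsubu.vv: widening subtract */
      if (d < 0)            /* vmslt.vi: mask of negative lanes */
        d = -d;             /* vneg.v (masked): absolute value */
      acc += d;             /* vwadd.wv: widening accumulate */
    }
  return acc;
}

Note also that the LMUL = 1 code above ends with a long scalar tail (the lbu/subw/sraiw/xor sequences) for part of the block, which the dynamic-LMUL version below avoids entirely.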
LMUL = dynamic
x264_pixel_8x8:
add a7,a1,a2
vsetivli zero,8,e8,mf2,ta,ma
add a6,a7,a2
vle8.v v1,0(a1)
add a3,a6,a2
vle8.v v2,0(a7)
add a4,a3,a2
vle8.v v13,0(a0)
vle8.v v7,0(a4)
vwsubu.vv v4,v13,v1
vle8.v v11,0(a6)
vle8.v v9,0(a3)
add a5,a4,a2
addi t1,a0,16
vle8.v v5,0(a5)
vle8.v v3,0(t1)
addi a7,a0,32
addi a6,a0,48
vle8.v v12,0(a7)
vle8.v v10,0(a6)
addi a3,a0,64
addi a4,a0,80
vle8.v v8,0(a3)
vle8.v v6,0(a4)
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v4,0
vneg.v v4,v4,v0.t
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v3,v2
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vwadd.vv v2,v4,v1
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v12,v11
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vwadd.wv v2,v2,v1
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v10,v9
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vwadd.wv v2,v2,v1
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v8,v7
vsetvli zero,zero,e16,m1,ta,mu
slli a4,a2,3
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vwadd.wv v2,v2,v1
vsetvli zero,zero,e8,mf2,ta,ma
sub a4,a4,a2
vwsubu.vv v1,v6,v5
vsetvli zero,zero,e16,m1,ta,mu
addi a3,a0,96
vmslt.vi v0,v1,0
vle8.v v7,0(a3)
vneg.v v1,v1,v0.t
add a5,a5,a2
vwadd.wv v2,v2,v1
vle8.v v6,0(a5)
addi a0,a0,112
add a1,a1,a4
vle8.v v5,0(a0)
vle8.v v4,0(a1)
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v7,v6
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vwadd.wv v2,v2,v1
vsetvli zero,zero,e32,m2,ta,ma
li a5,0
vmv.s.x v1,a5
vredsum.vs v1,v2,v1
vmv.x.s a0,v1
vsetvli zero,zero,e8,mf2,ta,ma
vwsubu.vv v1,v5,v4
vsetvli zero,zero,e16,m1,ta,mu
vmslt.vi v0,v1,0
vneg.v v1,v1,v0.t
vsetivli zero,1,e32,m1,ta,ma
vmv.s.x v2,a5
vsetivli zero,8,e16,m1,ta,ma
vwredsumu.vs v1,v1,v2
vsetivli zero,0,e32,m1,ta,ma
vmv.x.s a5,v1
addw a0,a0,a5
ret
I noticed we get much better codegen and a performance gain with --param=riscv-autovec-lmul=dynamic,
which is able to pick the best LMUL (M2).
Add a test to keep future changes from accidentally regressing x264 performance.
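For reference, the two listings above can be reproduced by compiling the new test below by hand. The second invocation matches the test's dg-options; the m1 value used to force LMUL = 1 and the cross-compiler name are my assumptions, not taken from the commit:

  riscv64-unknown-elf-gcc -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize \
      --param riscv-autovec-lmul=m1 -S dynamic-lmul2-7.c
  riscv64-unknown-elf-gcc -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize \
      --param riscv-autovec-lmul=dynamic -S dynamic-lmul2-7.c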
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: New test.
Diff:
---
.../vect/costmodel/riscv/rvv/dynamic-lmul2-7.c | 24 ++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
new file mode 100644
index 00000000000..87e963edc47
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic" } */
+
+int
+x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2, int i_stride_pix2)
+{
+ int i_sum = 0;
+ for (int y = 0; y < 8; y++)
+ {
+ i_sum += __builtin_abs (pix1[0] - pix2[0]);
+ i_sum += __builtin_abs (pix1[1] - pix2[1]);
+ i_sum += __builtin_abs (pix1[2] - pix2[2]);
+ i_sum += __builtin_abs (pix1[3] - pix2[3]);
+ i_sum += __builtin_abs (pix1[4] - pix2[4]);
+ i_sum += __builtin_abs (pix1[5] - pix2[5]);
+ i_sum += __builtin_abs (pix1[6] - pix2[6]);
+ i_sum += __builtin_abs (pix1[7] - pix2[7]);
+ pix1 += 16;
+ pix2 += i_stride_pix2;
+ }
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler {e32,m2} } } */
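For anyone who wants to run the function rather than just inspect the assembly, here is a minimal standalone driver (my sketch, not part of the commit). It assumes the fixed pix1 stride of 16 that is hard-coded in the test's loop:

#include <stdio.h>
#include <string.h>

int x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2,
                    int i_stride_pix2);

int
main (void)
{
  unsigned char pix1[8 * 16];
  unsigned char pix2[8 * 16];

  memset (pix1, 0, sizeof pix1);
  memset (pix2, 0, sizeof pix2);
  pix1[0] = 10;
  pix2[0] = 3;

  /* Only the first pixel pair differs, so the SAD is |10 - 3| = 7.  */
  printf ("SAD = %d\n", x264_pixel_8x8 (pix1, pix2, 16));
  return 0;
}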