[PATCH] RISC-V: Fix inferior codegen for vse intrinsics.
Kito Cheng
kito.cheng@gmail.com
Thu Jan 26 19:12:45 GMT 2023
committed, thanks.
On Thu, Dec 29, 2022 at 11:34 PM <juzhe.zhong@rivai.ai> wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Currently we use pred_mov to do the codegen for vse intrinsics.
> However, it
> generates inferior codegen when I am testing AVL model of VSETVL PASS
> using vse intrinsics.
>
> Consider this following code:
> void f2 (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> {
> vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> vbool64_t mask = *(vbool64_t*)mask_in;
> for (int i = 0; i < n; i++)
> {
> vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
>
> vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
>
> vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
>
> vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i +
> 300), 11);
> __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
>
> vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in
> + i + 500), 11);
> __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
>
> vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
>
> vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> }
> }
>
> Before this patch:
> csrr t2,vlenb
> srli t2,t2,1
> slli s0,t2,2
> vsetvli zero,19,e16,mf2,ta,ma
> sub s0,s0,t2
> csrr t2,vlenb
> vle16.v v24,0(a3)
> mv a4,a3
> vse16.v v24,0(a1)
> srli t2,t2,1
> add a2,a3,t6
> add s0,s0,sp
> vsetvli zero,19,e32,mf2,ta,ma
> addi a3,a3,4
> vle32.v v24,0(a3)
> vsetvli zero,t0,e32,mf2,ta,ma
> vse32.v v24,0(s0)
> slli s0,t2,2
> sub s0,s0,t2
> add s0,s0,sp
> vsetvli t0,zero,e32,mf2,ta,ma
> vle32.v v24,0(s0)
> mv s0,t2
> slli t2,t2,2
> mv a5,a1
> vsetvli zero,19,e32,mf2,ta,ma
> addi a1,a1,4
> sub t2,t2,s0
> vse32.v v24,0(a1)
> add t2,t2,sp
> vsetvli t0,zero,e32,mf2,ta,ma
> addi t1,a5,796
> vle32.v v24,0(t2)
> addi t5,a4,1196
> addi a7,a5,1196
> addi t4,a4,1996
> addi a6,a5,1996
> vsetvli zero,13,e32,mf2,ta,ma
> add a4,a4,t3
> vse32.v v24,0(t1)
> add a5,a5,t3
> vsetvli zero,11,e64,m1,tu,mu
> vle64.v v24,0(t5),v0.t
> vse64.v v24,0(a7)
> vle64.v v24,0(t4),v0.t
> vse64.v v24,0(a6)
> vle64.v v24,0(a4),v0.t
> vse64.v v24,0(a5),v0.t
> vsetvli zero,11,e8,mf4,ta,ma
> vle8.v v24,0(a2)
> vse8.v v24,0(a2)
> bne a0,a3,.L8
> csrr t0,vlenb
> slli t1,t0,1
> add sp,sp,t1
> lw s0,12(sp)
> addi sp,sp,16
> jr ra
>
> We are generating redundant spilling codes.
> Here we introduce a dedicated pred_store pattern for vse intrinsics like
> maskstore in ARM SVE.
>
> After this patch:
> vsetvli zero,19,e16,mf2,ta,ma
> mv a5,a4
> vle16.v v24,0(a0)
> mv a3,a0
> vse16.v 19,0(a4)
> addi t1,a4,796
> vsetvli zero,19,e32,mf2,ta,ma
> addi a0,a0,4
> addi a4,a4,4
> vle32.v v24,0(a0)
> addi t0,a3,1196
> vse32.v 19,0(a4)
> addi a7,a5,1196
> addi t6,a3,1996
> addi a6,a5,1996
> add t5,a3,t4
> vsetvli zero,13,e32,mf2,ta,ma
> add a2,a5,t4
> vse32.v 13,0(t1)
> add a3,a3,t3
> vsetvli zero,11,e64,m1,tu,mu
> add a5,a5,t3
> vle64.v v24,0(t0),v0.t
> vse64.v 11,0(a7)
> vle64.v v24,0(t6),v0.t
> vse64.v 11,0(a6)
> vle64.v v24,0(t5),v0.t
> vse64.v 11,0(a2),v0.t
> vsetvli zero,11,e8,mf4,ta,ma
> vle8.v v24,0(a3)
> vse8.v 11,0(a5)
> bne a1,a4,.L8
> .L6:
> ret
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-builtins-bases.cc (class loadstore):
> use pred_store for vse.
> * config/riscv/riscv-vector-builtins.cc
> (function_expander::add_mem_operand): Refine function.
> (function_expander::use_contiguous_load_insn): Adjust new
> implementation.
> (function_expander::use_contiguous_store_insn): Ditto.
> * config/riscv/riscv-vector-builtins.h: Refine function.
> * config/riscv/vector.md (@pred_store<mode>): New pattern.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.
>
> ---
> .../riscv/riscv-vector-builtins-bases.cc | 2 +-
> gcc/config/riscv/riscv-vector-builtins.cc | 22 +----
> gcc/config/riscv/riscv-vector-builtins.h | 8 +-
> gcc/config/riscv/vector.md | 23 ++++-
> .../riscv/rvv/base/vse-constraint-1.c | 97 +++++++++++++++++++
> 5 files changed, 128 insertions(+), 24 deletions(-)
> create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> index 10373e5ccf2..af66b016b49 100644
> --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> @@ -106,7 +106,7 @@ class loadstore : public function_base
> rtx expand (function_expander &e) const override
> {
> if (STORE_P)
> - return e.use_contiguous_store_insn (code_for_pred_mov
> (e.vector_mode ()));
> + return e.use_contiguous_store_insn (code_for_pred_store
> (e.vector_mode ()));
> else
> return e.use_contiguous_load_insn (code_for_pred_mov (e.vector_mode
> ()));
> }
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc
> b/gcc/config/riscv/riscv-vector-builtins.cc
> index e39bfea9636..47e01b647f8 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -845,15 +845,15 @@ function_expander::add_vundef_operand (machine_mode
> mode)
> }
>
> /* Add a memory operand with mode MODE and address ADDR. */
> -rtx
> -function_expander::add_mem_operand (machine_mode mode, rtx addr)
> +void
> +function_expander::add_mem_operand (machine_mode mode, unsigned argno)
> {
> gcc_assert (VECTOR_MODE_P (mode));
> + rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
> rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
> /* The memory is only guaranteed to be element-aligned. */
> set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
> add_fixed_operand (mem);
> - return mem;
> }
>
> /* Use contiguous load INSN. */
> @@ -878,9 +878,7 @@ function_expander::use_contiguous_load_insn (insn_code
> icode)
> else
> add_vundef_operand (mode);
>
> - tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
> - rtx addr = expand_normal (addr_arg);
> - add_mem_operand (mode, addr);
> + add_mem_operand (mode, arg_offset++);
>
> for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
> add_input_operand (argno);
> @@ -904,27 +902,17 @@ function_expander::use_contiguous_store_insn
> (insn_code icode)
> /* Record the offset to get the argument. */
> int arg_offset = 0;
>
> - int addr_loc = use_real_mask_p (pred) ? 1 : 0;
> - tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
> - rtx addr = expand_normal (addr_arg);
> - rtx mem = add_mem_operand (mode, addr);
> + add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
>
> if (use_real_mask_p (pred))
> add_input_operand (arg_offset++);
> else
> add_all_one_mask_operand (mask_mode);
>
> - /* To model "+m" constraint, we include memory operand into input. */
> - add_input_operand (mode, mem);
> -
> arg_offset++;
> for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
> add_input_operand (argno);
>
> - add_input_operand (Pmode, get_tail_policy_for_pred (pred));
> - add_input_operand (Pmode, get_mask_policy_for_pred (pred));
> - add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
> -
> return generate_insn (icode);
> }
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins.h
> b/gcc/config/riscv/riscv-vector-builtins.h
> index c13df99cb5b..58d8d78043c 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.h
> +++ b/gcc/config/riscv/riscv-vector-builtins.h
> @@ -317,12 +317,12 @@ public:
> rtx expand ();
>
> void add_input_operand (machine_mode, rtx);
> - void add_input_operand (unsigned argno);
> + void add_input_operand (unsigned);
> void add_output_operand (machine_mode, rtx);
> - void add_all_one_mask_operand (machine_mode mode);
> - void add_vundef_operand (machine_mode mode);
> + void add_all_one_mask_operand (machine_mode);
> + void add_vundef_operand (machine_mode);
> void add_fixed_operand (rtx);
> - rtx add_mem_operand (machine_mode, rtx);
> + void add_mem_operand (machine_mode, unsigned);
>
> machine_mode vector_mode (void) const;
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 89810b183fc..3d0174f98a2 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -209,7 +209,7 @@
>
> ;; The index of operand[] to get the merge op.
> (define_attr "merge_op_idx" ""
> - (cond [(eq_attr "type"
> "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
> + (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
> (const_int 2)]
> (const_int INVALID_ATTRIBUTE)))
>
> @@ -647,7 +647,7 @@
> (reg:SI VL_REGNUM)
> (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> (match_operand:V 3 "vector_move_operand" " m, m,
> vr, vr, viWc0")
> - (match_operand:V 2 "vector_merge_operand" " 0, vu,
> vu0, vu0, vu0")))]
> + (match_operand:V 2 "vector_merge_operand" " 0, vu,
> vu, vu0, vu0")))]
> "TARGET_VECTOR"
> "@
> vle<sew>.v\t%0,%3%p1
> @@ -663,6 +663,25 @@
> [(set_attr "type" "vlde,vlde,vste,vimov,vimov")
> (set_attr "mode" "<MODE>")])
>
> +;; Dedicated pattern for vse.v instruction since we can't reuse pred_mov
> pattern to include
> +;; memory operand as input which will produce inferior codegen.
> +(define_insn "@pred_store<mode>"
> + [(set (match_operand:V 0 "memory_operand" "+m")
> + (if_then_else:V
> + (unspec:<VM>
> + [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
> + (match_operand 3 "vector_length_operand" " rK")
> + (reg:SI VL_REGNUM)
> + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> + (match_operand:V 2 "register_operand" " vr")
> + (match_dup 0)))]
> + "TARGET_VECTOR"
> + "vse<sew>.v\t%2,%0%p1"
> + [(set_attr "type" "vste")
> + (set_attr "mode" "<MODE>")
> + (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
> + (set_attr "vl_op_idx" "3")])
> +
> ;; vlm.v/vsm.v/vmclr.m/vmset.m.
> ;; constraint alternative 0 match vlm.v.
> ;; constraint alternative 1 match vsm.v.
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> new file mode 100644
> index 00000000000..5b8b9b41c7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +void f (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> + vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> + __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> + vbool64_t mask = *(vbool64_t*)mask_in;
> + for (int i = 0; i < n; i++)
> + {
> + vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> + __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> + vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> + __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> + vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> + __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
> +
> + vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> + vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> + vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> + __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> + vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> + __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> + }
> +}
> +
> +void f2 (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> + vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> + __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> + vbool64_t mask = *(vbool64_t*)mask_in;
> + for (int i = 0; i < n; i++)
> + {
> + vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> + __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> + vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> + __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> + vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> + __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
> +
> + vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> + vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> + vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> + __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> + vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> + __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> + }
> +}
> +
> +void f3 (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> + vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> + __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> + vbool64_t mask = *(vbool64_t*)mask_in;
> + for (int i = 0; i < n; i++)
> + {
> + vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> + __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> + vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> + __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> + vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> + *(vint32mf2_t*)(out + i + 200) = v3;
> +
> + vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> + vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> + __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> + vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> + __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> + vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> + __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> + }
> +}
> +
> +/* It should not have redundant vector register spills which produce csrr
> vlenb instructions allocate stack. */
> +/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */
> --
> 2.36.1
>
>
More information about the Gcc-patches
mailing list