[PATCH] RISC-V: Fix inferior codegen for vse intrinsics.

Kito Cheng kito.cheng@gmail.com
Thu Jan 26 19:12:45 GMT 2023


Committed, thanks.

On Thu, Dec 29, 2022 at 11:34 PM <juzhe.zhong@rivai.ai> wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Currently we use pred_mov to do the codegen for vse intrinsics.  However,
> it generates inferior codegen when testing the AVL model of the VSETVL PASS
> using vse intrinsics.
>
> Consider the following code:
> void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
> {
>   vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
>   __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
>   vbool64_t mask = *(vbool64_t*)mask_in;
>   for (int i = 0; i < n; i++)
>     {
>       vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
>       __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
>
>       vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
>       __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
>
>       vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
>       __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
>
>       vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
>       __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
>
>       vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
>       __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
>
>       vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
>       __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
>
>       vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
>       __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
>     }
> }
>
> Before this patch:
>         csrr    t2,vlenb
>         srli    t2,t2,1
>         slli    s0,t2,2
>         vsetvli zero,19,e16,mf2,ta,ma
>         sub     s0,s0,t2
>         csrr    t2,vlenb
>         vle16.v v24,0(a3)
>         mv      a4,a3
>         vse16.v v24,0(a1)
>         srli    t2,t2,1
>         add     a2,a3,t6
>         add     s0,s0,sp
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a3,a3,4
>         vle32.v v24,0(a3)
>         vsetvli zero,t0,e32,mf2,ta,ma
>         vse32.v v24,0(s0)
>         slli    s0,t2,2
>         sub     s0,s0,t2
>         add     s0,s0,sp
>         vsetvli t0,zero,e32,mf2,ta,ma
>         vle32.v v24,0(s0)
>         mv      s0,t2
>         slli    t2,t2,2
>         mv      a5,a1
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a1,a1,4
>         sub     t2,t2,s0
>         vse32.v v24,0(a1)
>         add     t2,t2,sp
>         vsetvli t0,zero,e32,mf2,ta,ma
>         addi    t1,a5,796
>         vle32.v v24,0(t2)
>         addi    t5,a4,1196
>         addi    a7,a5,1196
>         addi    t4,a4,1996
>         addi    a6,a5,1996
>         vsetvli zero,13,e32,mf2,ta,ma
>         add     a4,a4,t3
>         vse32.v v24,0(t1)
>         add     a5,a5,t3
>         vsetvli zero,11,e64,m1,tu,mu
>         vle64.v v24,0(t5),v0.t
>         vse64.v v24,0(a7)
>         vle64.v v24,0(t4),v0.t
>         vse64.v v24,0(a6)
>         vle64.v v24,0(a4),v0.t
>         vse64.v v24,0(a5),v0.t
>         vsetvli zero,11,e8,mf4,ta,ma
>         vle8.v  v24,0(a2)
>         vse8.v  v24,0(a2)
>         bne     a0,a3,.L8
>         csrr    t0,vlenb
>         slli    t1,t0,1
>         add     sp,sp,t1
>         lw      s0,12(sp)
>         addi    sp,sp,16
>         jr      ra
>
> We are generating redundant spill code.
> Here we introduce a dedicated pred_store pattern for vse intrinsics, like
> maskstore in ARM SVE.
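>
> As a rough scalar model (not part of the patch itself), the semantics that
> the new @pred_store<mode> pattern below describes are those of a masked
> store: active elements are written and inactive elements leave the
> destination memory untouched, which is what the (match_dup 0) else-arm
> expresses at the RTL level.  The function and parameter names here are made
> up purely for illustration:
>
> #include <stddef.h>
>
> /* Hypothetical scalar model of a masked vse-style store of length VL:
>    active elements are written, everything else in memory stays as it
>    was before the store.  */
> void masked_store_model (float *dst, const float *src,
>                          const unsigned char *mask, size_t vl)
> {
>   for (size_t i = 0; i < vl; i++)
>     if (mask[i])
>       dst[i] = src[i];  /* active element: stored.  */
>     /* inactive elements and elements past VL: dst unchanged.  */
> }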
>
> After this patch:
>         vsetvli zero,19,e16,mf2,ta,ma
>         mv      a5,a4
>         vle16.v v24,0(a0)
>         mv      a3,a0
>         vse16.v 19,0(a4)
>         addi    t1,a4,796
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a0,a0,4
>         addi    a4,a4,4
>         vle32.v v24,0(a0)
>         addi    t0,a3,1196
>         vse32.v 19,0(a4)
>         addi    a7,a5,1196
>         addi    t6,a3,1996
>         addi    a6,a5,1996
>         add     t5,a3,t4
>         vsetvli zero,13,e32,mf2,ta,ma
>         add     a2,a5,t4
>         vse32.v 13,0(t1)
>         add     a3,a3,t3
>         vsetvli zero,11,e64,m1,tu,mu
>         add     a5,a5,t3
>         vle64.v v24,0(t0),v0.t
>         vse64.v 11,0(a7)
>         vle64.v v24,0(t6),v0.t
>         vse64.v 11,0(a6)
>         vle64.v v24,0(t5),v0.t
>         vse64.v 11,0(a2),v0.t
>         vsetvli zero,11,e8,mf4,ta,ma
>         vle8.v  v24,0(a3)
>         vse8.v  11,0(a5)
>         bne     a1,a4,.L8
> .L6:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-builtins-bases.cc (class loadstore):
> use pred_store for vse.
>         * config/riscv/riscv-vector-builtins.cc
> (function_expander::add_mem_operand): Refine function.
>         (function_expander::use_contiguous_load_insn): Adjust new
> implementation.
>         (function_expander::use_contiguous_store_insn): Ditto.
>         * config/riscv/riscv-vector-builtins.h: Refine function.
>         * config/riscv/vector.md (@pred_store<mode>): New pattern.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.
>
> ---
>  .../riscv/riscv-vector-builtins-bases.cc      |  2 +-
>  gcc/config/riscv/riscv-vector-builtins.cc     | 22 +----
>  gcc/config/riscv/riscv-vector-builtins.h      |  8 +-
>  gcc/config/riscv/vector.md                    | 23 ++++-
>  .../riscv/rvv/base/vse-constraint-1.c         | 97 +++++++++++++++++++
>  5 files changed, 128 insertions(+), 24 deletions(-)
>  create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> index 10373e5ccf2..af66b016b49 100644
> --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> @@ -106,7 +106,7 @@ class loadstore : public function_base
>    rtx expand (function_expander &e) const override
>    {
>      if (STORE_P)
> -      return e.use_contiguous_store_insn (code_for_pred_mov
> (e.vector_mode ()));
> +      return e.use_contiguous_store_insn (code_for_pred_store
> (e.vector_mode ()));
>      else
>        return e.use_contiguous_load_insn (code_for_pred_mov (e.vector_mode
> ()));
>    }
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc
> b/gcc/config/riscv/riscv-vector-builtins.cc
> index e39bfea9636..47e01b647f8 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -845,15 +845,15 @@ function_expander::add_vundef_operand (machine_mode
> mode)
>  }
>
>  /* Add a memory operand with mode MODE and address ADDR.  */
> -rtx
> -function_expander::add_mem_operand (machine_mode mode, rtx addr)
> +void
> +function_expander::add_mem_operand (machine_mode mode, unsigned argno)
>  {
>    gcc_assert (VECTOR_MODE_P (mode));
> +  rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
>    rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
>    /* The memory is only guaranteed to be element-aligned.  */
>    set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
>    add_fixed_operand (mem);
> -  return mem;
>  }
>
>  /* Use contiguous load INSN.  */
> @@ -878,9 +878,7 @@ function_expander::use_contiguous_load_insn (insn_code
> icode)
>    else
>      add_vundef_operand (mode);
>
> -  tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
> -  rtx addr = expand_normal (addr_arg);
> -  add_mem_operand (mode, addr);
> +  add_mem_operand (mode, arg_offset++);
>
>    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
>      add_input_operand (argno);
> @@ -904,27 +902,17 @@ function_expander::use_contiguous_store_insn
> (insn_code icode)
>    /* Record the offset to get the argument.  */
>    int arg_offset = 0;
>
> -  int addr_loc = use_real_mask_p (pred) ? 1 : 0;
> -  tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
> -  rtx addr = expand_normal (addr_arg);
> -  rtx mem = add_mem_operand (mode, addr);
> +  add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
>
>    if (use_real_mask_p (pred))
>      add_input_operand (arg_offset++);
>    else
>      add_all_one_mask_operand (mask_mode);
>
> -  /* To model "+m" constraint, we include memory operand into input.  */
> -  add_input_operand (mode, mem);
> -
>    arg_offset++;
>    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
>      add_input_operand (argno);
>
> -  add_input_operand (Pmode, get_tail_policy_for_pred (pred));
> -  add_input_operand (Pmode, get_mask_policy_for_pred (pred));
> -  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
> -
>    return generate_insn (icode);
>  }
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins.h
> b/gcc/config/riscv/riscv-vector-builtins.h
> index c13df99cb5b..58d8d78043c 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.h
> +++ b/gcc/config/riscv/riscv-vector-builtins.h
> @@ -317,12 +317,12 @@ public:
>    rtx expand ();
>
>    void add_input_operand (machine_mode, rtx);
> -  void add_input_operand (unsigned argno);
> +  void add_input_operand (unsigned);
>    void add_output_operand (machine_mode, rtx);
> -  void add_all_one_mask_operand (machine_mode mode);
> -  void add_vundef_operand (machine_mode mode);
> +  void add_all_one_mask_operand (machine_mode);
> +  void add_vundef_operand (machine_mode);
>    void add_fixed_operand (rtx);
> -  rtx add_mem_operand (machine_mode, rtx);
> +  void add_mem_operand (machine_mode, unsigned);
>
>    machine_mode vector_mode (void) const;
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 89810b183fc..3d0174f98a2 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -209,7 +209,7 @@
>
>  ;; The index of operand[] to get the merge op.
>  (define_attr "merge_op_idx" ""
> -       (cond [(eq_attr "type"
> "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
> +       (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
>          (const_int 2)]
>         (const_int INVALID_ATTRIBUTE)))
>
> @@ -647,7 +647,7 @@
>              (reg:SI VL_REGNUM)
>              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>           (match_operand:V 3 "vector_move_operand"       "    m,     m,
> vr,    vr, viWc0")
> -         (match_operand:V 2 "vector_merge_operand"      "    0,    vu,
>  vu0,   vu0,   vu0")))]
> +         (match_operand:V 2 "vector_merge_operand"      "    0,    vu,
> vu,   vu0,   vu0")))]
>    "TARGET_VECTOR"
>    "@
>     vle<sew>.v\t%0,%3%p1
> @@ -663,6 +663,25 @@
>    [(set_attr "type" "vlde,vlde,vste,vimov,vimov")
>     (set_attr "mode" "<MODE>")])
>
> +;; Dedicated pattern for vse.v instruction since we can't reuse pred_mov
> pattern to include
> +;; memory operand as input which will produce inferior codegen.
> +(define_insn "@pred_store<mode>"
> +  [(set (match_operand:V 0 "memory_operand"                 "+m")
> +       (if_then_else:V
> +         (unspec:<VM>
> +           [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
> +            (match_operand 3 "vector_length_operand"    "   rK")
> +            (reg:SI VL_REGNUM)
> +            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> +         (match_operand:V 2 "register_operand"         "    vr")
> +         (match_dup 0)))]
> +  "TARGET_VECTOR"
> +  "vse<sew>.v\t%2,%0%p1"
> +  [(set_attr "type" "vste")
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
> +   (set_attr "vl_op_idx" "3")])
> +
>  ;; vlm.v/vsm.v/vmclr.m/vmset.m.
>  ;; constraint alternative 0 match vlm.v.
>  ;; constraint alternative 1 match vsm.v.
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> new file mode 100644
> index 00000000000..5b8b9b41c7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +void f (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +void f2 (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +void f3 (int * restrict in, int * restrict out, void * restrict mask_in,
> int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1),
> 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2),
> 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t
> *)(in + i + 200), 13);
> +      *(vint32mf2_t*)(out + i + 200) = v3;
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i
> + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double
> *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in
> + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700),
> 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +/* There should be no redundant vector register spills, which would produce
> csrr vlenb instructions to allocate stack space.  */
> +/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */
> --
> 2.36.1
>
>
