[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Use widening shift for scatter/gather if applicable.

Jeff Law law@gcc.gnu.org
Sun Jun 2 19:23:55 GMT 2024


https://gcc.gnu.org/g:5eade133c823d4ef2e226991c6ab5cfb63f2b338

commit 5eade133c823d4ef2e226991c6ab5cfb63f2b338
Author: Robin Dapp <rdapp@ventanamicro.com>
Date:   Fri May 10 13:37:03 2024 +0200

    RISC-V: Use widening shift for scatter/gather if applicable.
    
    With the zvbb extension we can emit a widening shift for scatter/gather
    index preparation in case we need to multiply by 2 and zero extend.
    
    The patch also adds vwsll to the mode_idx attribute and removes the
    mode from shift-count operand of the insn pattern.
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-v.cc (expand_gather_scatter): Use vwsll if
            applicable.
            * config/riscv/vector-crypto.md: Remove mode from vwsll shift
            count operator.
            * config/riscv/vector.md: Add vwsll to mode iterator.
    
    gcc/testsuite/ChangeLog:
    
            * lib/target-supports.exp: Add zvbb.
            * gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c: New test.
    
    (cherry picked from commit 309ee005aa871286c8daccbce7586f82be347440)

Diff:
---
 gcc/config/riscv/riscv-v.cc                        |  42 +++++---
 gcc/config/riscv/vector-crypto.md                  |   4 +-
 gcc/config/riscv/vector.md                         |   4 +-
 .../gather-scatter/gather_load_64-12-zvbb.c        | 113 +++++++++++++++++++++
 gcc/testsuite/lib/target-supports.exp              |  48 ++++++++-
 5 files changed, 193 insertions(+), 18 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index f105f470495..9428beca268 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4016,7 +4016,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
 {
   rtx ptr, vec_offset, vec_reg;
   bool zero_extend_p;
-  int scale_log2;
+  int shift;
   rtx mask = ops[5];
   rtx len = ops[6];
   if (is_load)
@@ -4025,7 +4025,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[1];
       vec_offset = ops[2];
       zero_extend_p = INTVAL (ops[3]);
-      scale_log2 = exact_log2 (INTVAL (ops[4]));
+      shift = exact_log2 (INTVAL (ops[4]));
     }
   else
     {
@@ -4033,7 +4033,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[0];
       vec_offset = ops[1];
       zero_extend_p = INTVAL (ops[2]);
-      scale_log2 = exact_log2 (INTVAL (ops[3]));
+      shift = exact_log2 (INTVAL (ops[3]));
     }
 
   machine_mode vec_mode = GET_MODE (vec_reg);
@@ -4043,9 +4043,12 @@ expand_gather_scatter (rtx *ops, bool is_load)
   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
   bool is_vlmax = is_vlmax_len_p (vec_mode, len);
 
+  bool use_widening_shift = false;
+
   /* Extend the offset element to address width.  */
   if (inner_offsize < BITS_PER_WORD)
     {
+      use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
       /* 7.2. Vector Load/Store Addressing Modes.
 	 If the vector offset elements are narrower than XLEN, they are
 	 zero-extended to XLEN before adding to the ptr effective address. If
@@ -4054,8 +4057,8 @@ expand_gather_scatter (rtx *ops, bool is_load)
 	 raise an illegal instruction exception if the EEW is not supported for
 	 offset elements.
 
-	 RVV spec only refers to the scale_log == 0 case.  */
-      if (!zero_extend_p || scale_log2 != 0)
+	 RVV spec only refers to the shift == 0 case.  */
+      if (!zero_extend_p || shift)
 	{
 	  if (zero_extend_p)
 	    inner_idx_mode
@@ -4064,19 +4067,32 @@ expand_gather_scatter (rtx *ops, bool is_load)
 	    inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
 	  machine_mode new_idx_mode
 	    = get_vector_mode (inner_idx_mode, nunits).require ();
-	  rtx tmp = gen_reg_rtx (new_idx_mode);
-	  emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
-				      zero_extend_p ? true : false));
-	  vec_offset = tmp;
+	  if (!use_widening_shift)
+	    {
+	      rtx tmp = gen_reg_rtx (new_idx_mode);
+	      emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
+					  zero_extend_p ? true : false));
+	      vec_offset = tmp;
+	    }
 	  idx_mode = new_idx_mode;
 	}
     }
 
-  if (scale_log2 != 0)
+  if (shift)
     {
-      rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
-			      gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
-			      OPTAB_DIRECT);
+      rtx tmp;
+      if (!use_widening_shift)
+	tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
+			    gen_int_mode (shift, Pmode), NULL_RTX, 0,
+			    OPTAB_DIRECT);
+      else
+	{
+	  tmp = gen_reg_rtx (idx_mode);
+	  insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
+	  rtx ops[] = {tmp, vec_offset, const1_rtx};
+	  emit_vlmax_insn (icode, BINARY_OP, ops);
+	}
+
       vec_offset = tmp;
     }
 
diff --git a/gcc/config/riscv/vector-crypto.md b/gcc/config/riscv/vector-crypto.md
index 24822e2712c..0ddc2f3f3c6 100755
--- a/gcc/config/riscv/vector-crypto.md
+++ b/gcc/config/riscv/vector-crypto.md
@@ -295,7 +295,7 @@
        (ashift:VWEXTI
          (zero_extend:VWEXTI
            (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "vr"))
-         (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand"  "vr"))
+         (match_operand:<V_DOUBLE_TRUNC> 4 "vector_shift_operand"  "vrvk"))
        (match_operand:VWEXTI 2 "vector_merge_operand" "0vu")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"
@@ -316,7 +316,7 @@
        (ashift:VWEXTI
          (zero_extend:VWEXTI
            (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,    vr"))
-         (match_operand:<VSUBEL> 4 "pmode_reg_or_uimm5_operand" "   rK,    rK"))
+         (match_operand 4 "pmode_reg_or_uimm5_operand"		"   rK,    rK"))
        (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index dccf76f0003..69423be6917 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -750,10 +750,10 @@
 	       (const_int 1)
 
 	       (eq_attr "type" "vssegte,vmpop,vmffs")
-	       (const_int 2)       
+	       (const_int 2)
 
 	       (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vfcvtftoi,vfwcvtitof,vfwcvtftoi,
-				vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo")
+				vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vwsll")
 	       (const_int 3)
 
 	       (eq_attr "type" "viwalu,viwmul,viwmuladd,vfwalu,vfwmul,vfwmuladd")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
new file mode 100644
index 00000000000..11a4031f47b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-add-options "riscv_v" } */
+/* { dg-add-options "riscv_zvbb" } */
+/* { dg-additional-options "-fno-vect-cost-model -fdump-tree-vect-details -mrvv-max-lmul=m4" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_LOOP(DATA_TYPE, INDEX_TYPE)                                       \
+  void __attribute__ ((noinline, noclone))                                     \
+  f_##DATA_TYPE##_##INDEX_TYPE (DATA_TYPE *restrict y, DATA_TYPE *restrict x,  \
+				INDEX_TYPE *restrict index)                    \
+  {                                                                            \
+    for (int i = 0; i < 100; ++i)                                              \
+      {                                                                        \
+	y[i * 2] = x[index[i * 2]] + 1;                                        \
+	y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;                                \
+      }                                                                        \
+  }
+
+TEST_LOOP (int8_t, int8_t)
+TEST_LOOP (uint8_t, int8_t)
+TEST_LOOP (int16_t, int8_t)
+TEST_LOOP (uint16_t, int8_t)
+TEST_LOOP (int32_t, int8_t)
+TEST_LOOP (uint32_t, int8_t)
+TEST_LOOP (int64_t, int8_t)
+TEST_LOOP (uint64_t, int8_t)
+TEST_LOOP (_Float16, int8_t)
+TEST_LOOP (float, int8_t)
+TEST_LOOP (double, int8_t)
+TEST_LOOP (int8_t, int16_t)
+TEST_LOOP (uint8_t, int16_t)
+TEST_LOOP (int16_t, int16_t)
+TEST_LOOP (uint16_t, int16_t)
+TEST_LOOP (int32_t, int16_t)
+TEST_LOOP (uint32_t, int16_t)
+TEST_LOOP (int64_t, int16_t)
+TEST_LOOP (uint64_t, int16_t)
+TEST_LOOP (_Float16, int16_t)
+TEST_LOOP (float, int16_t)
+TEST_LOOP (double, int16_t)
+TEST_LOOP (int8_t, int32_t)
+TEST_LOOP (uint8_t, int32_t)
+TEST_LOOP (int16_t, int32_t)
+TEST_LOOP (uint16_t, int32_t)
+TEST_LOOP (int32_t, int32_t)
+TEST_LOOP (uint32_t, int32_t)
+TEST_LOOP (int64_t, int32_t)
+TEST_LOOP (uint64_t, int32_t)
+TEST_LOOP (_Float16, int32_t)
+TEST_LOOP (float, int32_t)
+TEST_LOOP (double, int32_t)
+TEST_LOOP (int8_t, int64_t)
+TEST_LOOP (uint8_t, int64_t)
+TEST_LOOP (int16_t, int64_t)
+TEST_LOOP (uint16_t, int64_t)
+TEST_LOOP (int32_t, int64_t)
+TEST_LOOP (uint32_t, int64_t)
+TEST_LOOP (int64_t, int64_t)
+TEST_LOOP (uint64_t, int64_t)
+TEST_LOOP (_Float16, int64_t)
+TEST_LOOP (float, int64_t)
+TEST_LOOP (double, int64_t)
+TEST_LOOP (int8_t, uint8_t)
+TEST_LOOP (uint8_t, uint8_t)
+TEST_LOOP (int16_t, uint8_t)
+TEST_LOOP (uint16_t, uint8_t)
+TEST_LOOP (int32_t, uint8_t)
+TEST_LOOP (uint32_t, uint8_t)
+TEST_LOOP (int64_t, uint8_t)
+TEST_LOOP (uint64_t, uint8_t)
+TEST_LOOP (_Float16, uint8_t)
+TEST_LOOP (float, uint8_t)
+TEST_LOOP (double, uint8_t)
+TEST_LOOP (int8_t, uint16_t)
+TEST_LOOP (uint8_t, uint16_t)
+TEST_LOOP (int16_t, uint16_t)
+TEST_LOOP (uint16_t, uint16_t)
+TEST_LOOP (int32_t, uint16_t)
+TEST_LOOP (uint32_t, uint16_t)
+TEST_LOOP (int64_t, uint16_t)
+TEST_LOOP (uint64_t, uint16_t)
+TEST_LOOP (_Float16, uint16_t)
+TEST_LOOP (float, uint16_t)
+TEST_LOOP (double, uint16_t)
+TEST_LOOP (int8_t, uint32_t)
+TEST_LOOP (uint8_t, uint32_t)
+TEST_LOOP (int16_t, uint32_t)
+TEST_LOOP (uint16_t, uint32_t)
+TEST_LOOP (int32_t, uint32_t)
+TEST_LOOP (uint32_t, uint32_t)
+TEST_LOOP (int64_t, uint32_t)
+TEST_LOOP (uint64_t, uint32_t)
+TEST_LOOP (_Float16, uint32_t)
+TEST_LOOP (float, uint32_t)
+TEST_LOOP (double, uint32_t)
+TEST_LOOP (int8_t, uint64_t)
+TEST_LOOP (uint8_t, uint64_t)
+TEST_LOOP (int16_t, uint64_t)
+TEST_LOOP (uint16_t, uint64_t)
+TEST_LOOP (int32_t, uint64_t)
+TEST_LOOP (uint32_t, uint64_t)
+TEST_LOOP (int64_t, uint64_t)
+TEST_LOOP (uint64_t, uint64_t)
+TEST_LOOP (_Float16, uint64_t)
+TEST_LOOP (float, uint64_t)
+TEST_LOOP (double, uint64_t)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 88 "vect" } } */
+/* { dg-final { scan-tree-dump " \.MASK_LEN_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-assembler "vwsll.vi" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 6c828b73ded..75ce5b5c896 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1965,6 +1965,17 @@ proc check_effective_target_riscv_zbb { } {
     }]
 }
 
+# Return 1 if the target arch supports the Zbb extension, 0 otherwise.
+# Cache the result.
+
+proc check_effective_target_riscv_zvbb { } {
+    return [check_no_compiler_messages riscv_ext_zvbb assembly {
+       #ifndef __riscv_zvbb
+       #error "Not __riscv_zvbb"
+       #endif
+    }]
+}
+
 # Return 1 if the target arch supports the XTheadVector extension, 0 otherwise.
 # Cache the result.
 
@@ -2053,10 +2064,33 @@ proc check_effective_target_riscv_zvfh_ok { } {
     return 0
 }
 
+proc check_effective_target_riscv_zvbb_ok { } {
+    # If the target already supports v without any added options,
+    # we may assume we can execute just fine.
+    if { [check_effective_target_riscv_zvbb] } {
+	return 1
+    }
+
+    # check if we can execute vector insns with the given hardware or
+    # simulator
+    set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &zvbb]
+    if { [check_runtime ${gcc_march}_zvbb_exec {
+	int main()
+	{
+	    asm ("vsetivli zero,8,e16,m1,ta,ma");
+	    asm ("vwsll.vi v8,v16,2" : : : "v8");
+	    return 0;
+	} } "-march=${gcc_march}"] } {
+	    return 1
+	}
+
+    return 0
+}
+
 proc riscv_get_arch { } {
     set gcc_march ""
     # ??? do we neeed to add more extensions to the list below?
-    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvfh ztso } {
+    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvbb zvfh ztso } {
 	if { [check_no_compiler_messages  riscv_ext_$ext assembly [string map [list DEF __riscv_$ext] {
 		#ifndef DEF
 		#error "Not DEF"
@@ -2151,6 +2185,18 @@ proc add_options_for_riscv_zvfh { flags } {
     return "$flags -march=[riscv_get_arch]_zvfh"
 }
 
+proc add_options_for_riscv_zvbb { flags } {
+    if { [lsearch $flags -march=*] >= 0 } {
+	# If there are multiple -march flags, we have to adjust all of them.
+	set flags [regsub -all -- {(?:^|[[:space:]])-march=[[:alnum:]_.]*} $flags &_zvbb ]
+	return [regsub -all -- {((?:^|[[:space:]])-march=[[:alnum:]_.]*_zvbb[[:alnum:]_.]*)_zvbb} $flags \\1 ]
+    }
+    if { [check_effective_target_riscv_zvbb] } {
+	return "$flags"
+    }
+    return "$flags -march=[riscv_get_arch]_zvbb"
+}
+
 # Return 1 if the target OS supports running SSE executables, 0
 # otherwise.  Cache the result.


More information about the Gcc-cvs mailing list