[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Use widening shift for scatter/gather if applicable.
Jeff Law
law@gcc.gnu.org
Sun Jun 2 19:23:55 GMT 2024
https://gcc.gnu.org/g:5eade133c823d4ef2e226991c6ab5cfb63f2b338
commit 5eade133c823d4ef2e226991c6ab5cfb63f2b338
Author: Robin Dapp <rdapp@ventanamicro.com>
Date: Fri May 10 13:37:03 2024 +0200
RISC-V: Use widening shift for scatter/gather if applicable.
With the zvbb extension we can emit a widening shift for scatter/gather
index preparation when the offset needs to be multiplied by 2 and zero
extended.
The patch also adds vwsll to the mode_idx attribute and removes the mode
from the shift-count operand of the insn pattern.
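As an illustrative sketch (not part of the patch; it assumes an rv64
target with zvbb, and the register numbers below are made up), a gather
whose unsigned 32-bit indices must be zero-extended to XLEN and scaled
by 2 takes the new path:

    #include <stdint.h>

    void
    gather_u16 (uint16_t *restrict dst, uint16_t *restrict src,
                uint32_t *restrict idx, int n)
    {
      /* Each idx[i] is zero-extended to 64 bits and multiplied by
         sizeof (uint16_t) == 2, i.e. shifted left by 1.  */
      for (int i = 0; i < n; ++i)
        dst[i] = src[idx[i]];
    }

Without zvbb the index preparation needs a zero extension followed by a
shift, roughly

    vzext.vf2  v8,v16
    vsll.vi    v8,v8,1

whereas with zvbb both operations fold into a single widening shift:

    vwsll.vi   v8,v16,1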
gcc/ChangeLog:
* config/riscv/riscv-v.cc (expand_gather_scatter): Use vwsll if
applicable.
* config/riscv/vector-crypto.md: Remove mode from vwsll shift
count operator.
* config/riscv/vector.md: Add vwsll to mode iterator.
gcc/testsuite/ChangeLog:
* lib/target-supports.exp: Add zvbb.
* gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c: New test.
(cherry picked from commit 309ee005aa871286c8daccbce7586f82be347440)
Diff:
---
gcc/config/riscv/riscv-v.cc | 42 +++++---
gcc/config/riscv/vector-crypto.md | 4 +-
gcc/config/riscv/vector.md | 4 +-
.../gather-scatter/gather_load_64-12-zvbb.c | 113 +++++++++++++++++++++
gcc/testsuite/lib/target-supports.exp | 48 ++++++++-
5 files changed, 193 insertions(+), 18 deletions(-)
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index f105f470495..9428beca268 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4016,7 +4016,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
{
rtx ptr, vec_offset, vec_reg;
bool zero_extend_p;
- int scale_log2;
+ int shift;
rtx mask = ops[5];
rtx len = ops[6];
if (is_load)
@@ -4025,7 +4025,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
ptr = ops[1];
vec_offset = ops[2];
zero_extend_p = INTVAL (ops[3]);
- scale_log2 = exact_log2 (INTVAL (ops[4]));
+ shift = exact_log2 (INTVAL (ops[4]));
}
else
{
@@ -4033,7 +4033,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
ptr = ops[0];
vec_offset = ops[1];
zero_extend_p = INTVAL (ops[2]);
- scale_log2 = exact_log2 (INTVAL (ops[3]));
+ shift = exact_log2 (INTVAL (ops[3]));
}
machine_mode vec_mode = GET_MODE (vec_reg);
@@ -4043,9 +4043,12 @@ expand_gather_scatter (rtx *ops, bool is_load)
poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
bool is_vlmax = is_vlmax_len_p (vec_mode, len);
+ bool use_widening_shift = false;
+
/* Extend the offset element to address width. */
if (inner_offsize < BITS_PER_WORD)
{
+ use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
/* 7.2. Vector Load/Store Addressing Modes.
If the vector offset elements are narrower than XLEN, they are
zero-extended to XLEN before adding to the ptr effective address. If
@@ -4054,8 +4057,8 @@ expand_gather_scatter (rtx *ops, bool is_load)
raise an illegal instruction exception if the EEW is not supported for
offset elements.
- RVV spec only refers to the scale_log == 0 case. */
- if (!zero_extend_p || scale_log2 != 0)
+ RVV spec only refers to the shift == 0 case. */
+ if (!zero_extend_p || shift)
{
if (zero_extend_p)
inner_idx_mode
@@ -4064,19 +4067,32 @@ expand_gather_scatter (rtx *ops, bool is_load)
inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
machine_mode new_idx_mode
= get_vector_mode (inner_idx_mode, nunits).require ();
- rtx tmp = gen_reg_rtx (new_idx_mode);
- emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
- zero_extend_p ? true : false));
- vec_offset = tmp;
+ if (!use_widening_shift)
+ {
+ rtx tmp = gen_reg_rtx (new_idx_mode);
+ emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
+ zero_extend_p ? true : false));
+ vec_offset = tmp;
+ }
idx_mode = new_idx_mode;
}
}
- if (scale_log2 != 0)
+ if (shift)
{
- rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
- gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
- OPTAB_DIRECT);
+ rtx tmp;
+ if (!use_widening_shift)
+ tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
+ gen_int_mode (shift, Pmode), NULL_RTX, 0,
+ OPTAB_DIRECT);
+ else
+ {
+ tmp = gen_reg_rtx (idx_mode);
+ insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
+ rtx ops[] = {tmp, vec_offset, const1_rtx};
+ emit_vlmax_insn (icode, BINARY_OP, ops);
+ }
+
vec_offset = tmp;
}
diff --git a/gcc/config/riscv/vector-crypto.md b/gcc/config/riscv/vector-crypto.md
index 24822e2712c..0ddc2f3f3c6 100755
--- a/gcc/config/riscv/vector-crypto.md
+++ b/gcc/config/riscv/vector-crypto.md
@@ -295,7 +295,7 @@
(ashift:VWEXTI
(zero_extend:VWEXTI
(match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "vr"))
- (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "vr"))
+ (match_operand:<V_DOUBLE_TRUNC> 4 "vector_shift_operand" "vrvk"))
(match_operand:VWEXTI 2 "vector_merge_operand" "0vu")))]
"TARGET_ZVBB"
"vwsll.v%o4\t%0,%3,%4%p1"
@@ -316,7 +316,7 @@
(ashift:VWEXTI
(zero_extend:VWEXTI
(match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" " vr, vr"))
- (match_operand:<VSUBEL> 4 "pmode_reg_or_uimm5_operand" " rK, rK"))
+ (match_operand 4 "pmode_reg_or_uimm5_operand" " rK, rK"))
(match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
"TARGET_ZVBB"
"vwsll.v%o4\t%0,%3,%4%p1"
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index dccf76f0003..69423be6917 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -750,10 +750,10 @@
(const_int 1)
(eq_attr "type" "vssegte,vmpop,vmffs")
- (const_int 2)
+ (const_int 2)
(eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vfcvtftoi,vfwcvtitof,vfwcvtftoi,
- vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo")
+ vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vwsll")
(const_int 3)
(eq_attr "type" "viwalu,viwmul,viwmuladd,vfwalu,vfwmul,vfwmuladd")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
new file mode 100644
index 00000000000..11a4031f47b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-add-options "riscv_v" } */
+/* { dg-add-options "riscv_zvbb" } */
+/* { dg-additional-options "-fno-vect-cost-model -fdump-tree-vect-details -mrvv-max-lmul=m4" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_LOOP(DATA_TYPE, INDEX_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##INDEX_TYPE (DATA_TYPE *restrict y, DATA_TYPE *restrict x, \
+ INDEX_TYPE *restrict index) \
+ { \
+ for (int i = 0; i < 100; ++i) \
+ { \
+ y[i * 2] = x[index[i * 2]] + 1; \
+ y[i * 2 + 1] = x[index[i * 2 + 1]] + 2; \
+ } \
+ }
+
+TEST_LOOP (int8_t, int8_t)
+TEST_LOOP (uint8_t, int8_t)
+TEST_LOOP (int16_t, int8_t)
+TEST_LOOP (uint16_t, int8_t)
+TEST_LOOP (int32_t, int8_t)
+TEST_LOOP (uint32_t, int8_t)
+TEST_LOOP (int64_t, int8_t)
+TEST_LOOP (uint64_t, int8_t)
+TEST_LOOP (_Float16, int8_t)
+TEST_LOOP (float, int8_t)
+TEST_LOOP (double, int8_t)
+TEST_LOOP (int8_t, int16_t)
+TEST_LOOP (uint8_t, int16_t)
+TEST_LOOP (int16_t, int16_t)
+TEST_LOOP (uint16_t, int16_t)
+TEST_LOOP (int32_t, int16_t)
+TEST_LOOP (uint32_t, int16_t)
+TEST_LOOP (int64_t, int16_t)
+TEST_LOOP (uint64_t, int16_t)
+TEST_LOOP (_Float16, int16_t)
+TEST_LOOP (float, int16_t)
+TEST_LOOP (double, int16_t)
+TEST_LOOP (int8_t, int32_t)
+TEST_LOOP (uint8_t, int32_t)
+TEST_LOOP (int16_t, int32_t)
+TEST_LOOP (uint16_t, int32_t)
+TEST_LOOP (int32_t, int32_t)
+TEST_LOOP (uint32_t, int32_t)
+TEST_LOOP (int64_t, int32_t)
+TEST_LOOP (uint64_t, int32_t)
+TEST_LOOP (_Float16, int32_t)
+TEST_LOOP (float, int32_t)
+TEST_LOOP (double, int32_t)
+TEST_LOOP (int8_t, int64_t)
+TEST_LOOP (uint8_t, int64_t)
+TEST_LOOP (int16_t, int64_t)
+TEST_LOOP (uint16_t, int64_t)
+TEST_LOOP (int32_t, int64_t)
+TEST_LOOP (uint32_t, int64_t)
+TEST_LOOP (int64_t, int64_t)
+TEST_LOOP (uint64_t, int64_t)
+TEST_LOOP (_Float16, int64_t)
+TEST_LOOP (float, int64_t)
+TEST_LOOP (double, int64_t)
+TEST_LOOP (int8_t, uint8_t)
+TEST_LOOP (uint8_t, uint8_t)
+TEST_LOOP (int16_t, uint8_t)
+TEST_LOOP (uint16_t, uint8_t)
+TEST_LOOP (int32_t, uint8_t)
+TEST_LOOP (uint32_t, uint8_t)
+TEST_LOOP (int64_t, uint8_t)
+TEST_LOOP (uint64_t, uint8_t)
+TEST_LOOP (_Float16, uint8_t)
+TEST_LOOP (float, uint8_t)
+TEST_LOOP (double, uint8_t)
+TEST_LOOP (int8_t, uint16_t)
+TEST_LOOP (uint8_t, uint16_t)
+TEST_LOOP (int16_t, uint16_t)
+TEST_LOOP (uint16_t, uint16_t)
+TEST_LOOP (int32_t, uint16_t)
+TEST_LOOP (uint32_t, uint16_t)
+TEST_LOOP (int64_t, uint16_t)
+TEST_LOOP (uint64_t, uint16_t)
+TEST_LOOP (_Float16, uint16_t)
+TEST_LOOP (float, uint16_t)
+TEST_LOOP (double, uint16_t)
+TEST_LOOP (int8_t, uint32_t)
+TEST_LOOP (uint8_t, uint32_t)
+TEST_LOOP (int16_t, uint32_t)
+TEST_LOOP (uint16_t, uint32_t)
+TEST_LOOP (int32_t, uint32_t)
+TEST_LOOP (uint32_t, uint32_t)
+TEST_LOOP (int64_t, uint32_t)
+TEST_LOOP (uint64_t, uint32_t)
+TEST_LOOP (_Float16, uint32_t)
+TEST_LOOP (float, uint32_t)
+TEST_LOOP (double, uint32_t)
+TEST_LOOP (int8_t, uint64_t)
+TEST_LOOP (uint8_t, uint64_t)
+TEST_LOOP (int16_t, uint64_t)
+TEST_LOOP (uint16_t, uint64_t)
+TEST_LOOP (int32_t, uint64_t)
+TEST_LOOP (uint32_t, uint64_t)
+TEST_LOOP (int64_t, uint64_t)
+TEST_LOOP (uint64_t, uint64_t)
+TEST_LOOP (_Float16, uint64_t)
+TEST_LOOP (float, uint64_t)
+TEST_LOOP (double, uint64_t)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 88 "vect" } } */
+/* { dg-final { scan-tree-dump " \.MASK_LEN_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-assembler "vwsll.vi" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 6c828b73ded..75ce5b5c896 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1965,6 +1965,17 @@ proc check_effective_target_riscv_zbb { } {
}]
}
+# Return 1 if the target arch supports the Zvbb extension, 0 otherwise.
+# Cache the result.
+
+proc check_effective_target_riscv_zvbb { } {
+ return [check_no_compiler_messages riscv_ext_zvbb assembly {
+ #ifndef __riscv_zvbb
+ #error "Not __riscv_zvbb"
+ #endif
+ }]
+}
+
# Return 1 if the target arch supports the XTheadVector extension, 0 otherwise.
# Cache the result.
@@ -2053,10 +2064,33 @@ proc check_effective_target_riscv_zvfh_ok { } {
return 0
}
+proc check_effective_target_riscv_zvbb_ok { } {
+ # If the target already supports zvbb without any added options,
+ # we may assume we can execute just fine.
+ if { [check_effective_target_riscv_zvbb] } {
+ return 1
+ }
+
+ # check if we can execute vector insns with the given hardware or
+ # simulator
+ set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &zvbb]
+ if { [check_runtime ${gcc_march}_zvbb_exec {
+ int main()
+ {
+ asm ("vsetivli zero,8,e16,m1,ta,ma");
+ asm ("vwsll.vi v8,v16,2" : : : "v8");
+ return 0;
+ } } "-march=${gcc_march}"] } {
+ return 1
+ }
+
+ return 0
+}
+
proc riscv_get_arch { } {
set gcc_march ""
# ??? do we need to add more extensions to the list below?
- foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvfh ztso } {
+ foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvbb zvfh ztso } {
if { [check_no_compiler_messages riscv_ext_$ext assembly [string map [list DEF __riscv_$ext] {
#ifndef DEF
#error "Not DEF"
@@ -2151,6 +2185,18 @@ proc add_options_for_riscv_zvfh { flags } {
return "$flags -march=[riscv_get_arch]_zvfh"
}
+proc add_options_for_riscv_zvbb { flags } {
+ if { [lsearch $flags -march=*] >= 0 } {
+ # If there are multiple -march flags, we have to adjust all of them.
+ set flags [regsub -all -- {(?:^|[[:space:]])-march=[[:alnum:]_.]*} $flags &_zvbb ]
+ return [regsub -all -- {((?:^|[[:space:]])-march=[[:alnum:]_.]*_zvbb[[:alnum:]_.]*)_zvbb} $flags \\1 ]
+ }
+ if { [check_effective_target_riscv_zvbb] } {
+ return "$flags"
+ }
+ return "$flags -march=[riscv_get_arch]_zvbb"
+}
+
# Return 1 if the target OS supports running SSE executables, 0
# otherwise. Cache the result.