[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed] [RISC-V] Use Zbkb for general 64 bit constants when profitable
Jeff Law
law@gcc.gnu.org
Sun Jun 2 19:24:15 GMT 2024
https://gcc.gnu.org/g:e26b14182c2c18deab641b4b81fc53c456573818
commit e26b14182c2c18deab641b4b81fc53c456573818
Author: Jeff Law <jlaw@ventanamicro.com>
Date: Fri May 31 21:45:01 2024 -0600
[to-be-committed] [RISC-V] Use Zbkb for general 64 bit constants when profitable
Basically this adds the ability to generate two independent constants during
synthesis, then bring them together with a pack instruction. Thus we never need
to go out to the constant pool when zbkb is enabled. The worst sequence we ever
generate is
lui+addi+lui+addi+pack
Obviously if either half can be synthesized with just a lui or just an addi,
then we'll DTRT automagically. So for example:
unsigned long foo_0x1425000028000000(void) {
return 0x1425000028000000;
}
The high and low halves are just a lui. So the final synthesis is:
> li a5,671088640 # 15 [c=4 l=4] *movdi_64bit/1
> li a0,337969152 # 16 [c=4 l=4] *movdi_64bit/1
> pack a0,a5,a0 # 17 [c=12 l=4] riscv_xpack_di_si_2
On the implementation side, I think the bits I've put in here likely can be
used to handle the repeating constant case for !zbkb. I think it likely could
be used to help capture cases where the upper half can be derived from the
lower half (say by turning a bit on or off, shifting or something similar).
The key in both of these cases is we need a temporary register holding an
intermediate value.
Ventana's internal tester enables zbkb, but I don't think any of the other
testers currently exercise zbkb. We'll probably want to change that at some
point, but I don't think it's super-critical yet.
While I can envision a few more cases where we could improve constant
synthesis, I have no immediate plans to work in this space. If someone is
interested, some thoughts are recorded here:
> https://wiki.riseproject.dev/display/HOME/CT_00_031+--+Additional+Constant+Synthesis+Improvements
gcc/
* config/riscv/riscv.cc (riscv_integer_op): Add new field.
(riscv_build_integer_1): Initialize the new field.
(riscv_build_integer): Recognize more cases where Zbkb's
pack instruction is profitable.
(riscv_move_integer): Loop over all the codes. If requested,
save the current constant into a temporary. Generate pack
for more cases using the saved constant.
gcc/testsuite
* gcc.target/riscv/synthesis-10.c: New test.
(cherry picked from commit c0ded050cd29cc73f78cb4ab23674c7bc024969e)
Diff:
---
gcc/config/riscv/riscv.cc | 108 ++++++++++++++++++++++----
gcc/testsuite/gcc.target/riscv/synthesis-10.c | 18 +++++
2 files changed, 110 insertions(+), 16 deletions(-)
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 91fefacee80..10af38a5a81 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -250,6 +250,7 @@ struct riscv_arg_info {
and each VALUE[i] is a constant integer. CODE[0] is undefined. */
struct riscv_integer_op {
bool use_uw;
+ bool save_temporary;
enum rtx_code code;
unsigned HOST_WIDE_INT value;
};
@@ -759,6 +760,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
codes[0].code = UNKNOWN;
codes[0].value = value;
codes[0].use_uw = false;
+ codes[0].save_temporary = false;
return 1;
}
if (TARGET_ZBS && SINGLE_BIT_MASK_OPERAND (value))
@@ -767,6 +769,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
codes[0].code = UNKNOWN;
codes[0].value = value;
codes[0].use_uw = false;
+ codes[0].save_temporary = false;
/* RISC-V sign-extends all 32bit values that live in a 32bit
register. To avoid paradoxes, we thus need to use the
@@ -796,6 +799,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost-1].code = PLUS;
alt_codes[alt_cost-1].value = low_part;
alt_codes[alt_cost-1].use_uw = false;
+ alt_codes[alt_cost-1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -810,6 +814,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost-1].code = XOR;
alt_codes[alt_cost-1].value = low_part;
alt_codes[alt_cost-1].use_uw = false;
+ alt_codes[alt_cost-1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -852,6 +857,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost-1].code = ASHIFT;
alt_codes[alt_cost-1].value = shift;
alt_codes[alt_cost-1].use_uw = use_uw;
+ alt_codes[alt_cost-1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -873,9 +879,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
codes[0].value = (((unsigned HOST_WIDE_INT) value >> trailing_ones)
| (value << (64 - trailing_ones)));
codes[0].use_uw = false;
+ codes[0].save_temporary = false;
codes[1].code = ROTATERT;
codes[1].value = 64 - trailing_ones;
codes[1].use_uw = false;
+ codes[1].save_temporary = false;
cost = 2;
}
/* Handle the case where the 11 bit range of zero bits wraps around. */
@@ -888,9 +896,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
| ((unsigned HOST_WIDE_INT) value
>> (32 + upper_trailing_ones)));
codes[0].use_uw = false;
+ codes[0].save_temporary = false;
codes[1].code = ROTATERT;
codes[1].value = 32 - upper_trailing_ones;
codes[1].use_uw = false;
+ codes[1].save_temporary = false;
cost = 2;
}
@@ -917,6 +927,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost].code = AND;
alt_codes[alt_cost].value = ~(1UL << bit);
alt_codes[alt_cost].use_uw = false;
+ alt_codes[alt_cost].save_temporary = false;
alt_cost++;
nval &= ~(1UL << bit);
}
@@ -938,6 +949,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost - 1].code = FMA;
alt_codes[alt_cost - 1].value = 9;
alt_codes[alt_cost - 1].use_uw = false;
+ alt_codes[alt_cost - 1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -948,6 +960,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost - 1].code = FMA;
alt_codes[alt_cost - 1].value = 5;
alt_codes[alt_cost - 1].use_uw = false;
+ alt_codes[alt_cost - 1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -958,6 +971,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost - 1].code = FMA;
alt_codes[alt_cost - 1].value = 3;
alt_codes[alt_cost - 1].use_uw = false;
+ alt_codes[alt_cost - 1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -978,6 +992,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[alt_cost - 1].code = PLUS;
alt_codes[alt_cost - 1].value = adjustment;
alt_codes[alt_cost - 1].use_uw = false;
+ alt_codes[alt_cost - 1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -995,6 +1010,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
alt_codes[i].value = value & 0x7ffff000;
alt_codes[i].use_uw = false;
+ alt_codes[i].save_temporary = false;
value &= ~0x7ffff000;
i++;
}
@@ -1005,6 +1021,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
alt_codes[i].value = value & 0x7ff;
alt_codes[i].use_uw = false;
+ alt_codes[i].save_temporary = false;
value &= ~0x7ff;
i++;
}
@@ -1016,6 +1033,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
alt_codes[i].value = 1UL << bit;
alt_codes[i].use_uw = false;
+ alt_codes[i].save_temporary = false;
value &= ~(1ULL << bit);
i++;
}
@@ -1057,6 +1075,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
alt_codes[alt_cost-1].code = LSHIFTRT;
alt_codes[alt_cost-1].value = shift;
alt_codes[alt_cost-1].use_uw = false;
+ alt_codes[alt_cost-1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -1069,6 +1088,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
alt_codes[alt_cost-1].code = LSHIFTRT;
alt_codes[alt_cost-1].value = shift;
alt_codes[alt_cost-1].use_uw = false;
+ alt_codes[alt_cost-1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -1093,6 +1113,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
alt_codes[alt_cost - 1].code = XOR;
alt_codes[alt_cost - 1].value = -1;
alt_codes[alt_cost - 1].use_uw = false;
+ alt_codes[alt_cost - 1].save_temporary = false;
memcpy (codes, alt_codes, sizeof (alt_codes));
cost = alt_cost;
}
@@ -1128,13 +1149,55 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
if (cost > 3 && TARGET_64BIT && TARGET_ZBKB)
{
unsigned HOST_WIDE_INT loval = value & 0xffffffff;
- unsigned HOST_WIDE_INT hival = value & ~loval;
- if (hival >> 32 == loval)
+ unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+ if (hival == loval)
{
cost = 1 + riscv_build_integer_1 (codes, sext_hwi (loval, 32), mode);
codes[cost - 1].code = CONCAT;
codes[cost - 1].value = 0;
codes[cost - 1].use_uw = false;
+ codes[cost - 1].save_temporary = false;
+ }
+
+ /* An arbitrary 64 bit constant can be synthesized in 5 instructions
+ using zbkb. We may do better than that if the upper or lower half
+ can be synthesized with a single LUI, ADDI or BSET. Regardless, the
+ basic steps are the same. */
+ if (cost > 3 && can_create_pseudo_p ())
+ {
+ struct riscv_integer_op hi_codes[RISCV_MAX_INTEGER_OPS];
+ struct riscv_integer_op lo_codes[RISCV_MAX_INTEGER_OPS];
+ int hi_cost, lo_cost;
+
+ /* Synthesize and get cost for each half. */
+ lo_cost
+ = riscv_build_integer_1 (lo_codes, sext_hwi (loval, 32), mode);
+ hi_cost
+ = riscv_build_integer_1 (hi_codes, sext_hwi (hival, 32), mode);
+
+ /* If profitable, finish synthesis using zbkb. */
+ if (cost > hi_cost + lo_cost + 1)
+ {
+ /* We need the low half independent of the high half. So
+ mark it as creating a temporary we'll use later. */
+ memcpy (codes, lo_codes,
+ lo_cost * sizeof (struct riscv_integer_op));
+ codes[lo_cost - 1].save_temporary = true;
+
+ /* Now the high half synthesis. */
+ memcpy (codes + lo_cost, hi_codes,
+ hi_cost * sizeof (struct riscv_integer_op));
+
+ /* Adjust the cost. */
+ cost = hi_cost + lo_cost + 1;
+
+ /* And finally (ab)use VEC_MERGE to indicate we want to
+ merge the two parts together. */
+ codes[cost - 1].code = VEC_MERGE;
+ codes[cost - 1].value = 0;
+ codes[cost - 1].use_uw = false;
+ codes[cost - 1].save_temporary = false;
+ }
}
}
@@ -2656,23 +2719,25 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
x = riscv_split_integer (value, mode);
else
{
- codes[0].value = trunc_int_for_mode (codes[0].value, mode);
- /* Apply each binary operation to X. */
- x = GEN_INT (codes[0].value);
-
- for (i = 1; i < num_ops; i++)
+ rtx old_value = NULL_RTX;
+ for (i = 0; i < num_ops; i++)
{
- if (!can_create_pseudo_p ())
+ if (i != 0 && !can_create_pseudo_p ())
x = riscv_emit_set (temp, x);
- else
+ else if (i != 0)
x = force_reg (mode, x);
codes[i].value = trunc_int_for_mode (codes[i].value, mode);
- /* If the sequence requires using a "uw" form of an insn, we're
- going to have to construct the RTL ourselves and put it in
- a register to avoid force_reg/force_operand from mucking things
- up. */
- if (codes[i].use_uw)
+ if (codes[i].code == UNKNOWN)
{
+ /* UNKNOWN means load the constant value into X. */
+ x = GEN_INT (codes[i].value);
+ }
+ else if (codes[i].use_uw)
+ {
+ /* If the sequence requires using a "uw" form of an insn, we're
+ going to have to construct the RTL ourselves and put it in
+ a register to avoid force_reg/force_operand from mucking
+ things up. */
gcc_assert (TARGET_64BIT || TARGET_ZBA);
rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
@@ -2695,16 +2760,27 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
x = riscv_emit_set (t, x);
}
- else if (codes[i].code == CONCAT)
+ else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
{
rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
- rtx t2 = gen_lowpart (SImode, x);
+ rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+ gcc_assert (t2);
+ t2 = gen_lowpart (SImode, t2);
emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
x = t;
}
else
x = gen_rtx_fmt_ee (codes[i].code, mode,
x, GEN_INT (codes[i].value));
+
+ /* If this entry in the code table indicates we should save away
+ the temporary holding the current value of X, then do so. */
+ if (codes[i].save_temporary)
+ {
+ gcc_assert (old_value == NULL_RTX);
+ x = force_reg (mode, x);
+ old_value = x;
+ }
}
}
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-10.c b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
new file mode 100644
index 00000000000..0838ec9af47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+ which shouldn't vary based on the optimization level. -O1 seems to work
+ and eliminates the usual sources of extraneous dead code that would throw
+ off the counts. */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbkb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+ having thousands of tests each testing one variant, we just test the
+ total number of instructions.
+
+ This isn't expected to change much and any change is worthy of a look. */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori)" 10 } } */
+
+unsigned long foo_0x1425000028000000(void) { return 0x1425000028000000UL; }
+unsigned long foo_0xf057f2def857f2de(void) { return 0xf057f2def857f2deUL; }
More information about the Gcc-cvs
mailing list