[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] [to-be-committed] [RISC-V] Use Zbkb for general 64 bit constants when profitable

Jeff Law law@gcc.gnu.org
Sun Jun 2 19:24:15 GMT 2024


https://gcc.gnu.org/g:e26b14182c2c18deab641b4b81fc53c456573818

commit e26b14182c2c18deab641b4b81fc53c456573818
Author: Jeff Law <jlaw@ventanamicro.com>
Date:   Fri May 31 21:45:01 2024 -0600

    [to-be-committed] [RISC-V] Use Zbkb for general 64 bit constants when profitable
    
    Basically this adds the ability to generate two independent constants during
    synthesis, then bring them together with a pack instruction. Thus we never need
    to go out to the constant pool when zbkb is enabled. The worst sequence we ever
    generate is
    
    lui+addi+lui+addi+pack
    
    Obviously if either half can be synthesized with just a lui or just an addi,
    then we'll DTRT automagically.  So for example:
    
    unsigned long foo_0x1425000028000000(void) {
        return 0x1425000028000000;
    }
    
    The high and low halves can each be loaded with a single lui.  So the
    final synthesis is:
    
    >         li      a5,671088640            # 15    [c=4 l=4]  *movdi_64bit/1
    >         li      a0,337969152            # 16    [c=4 l=4]  *movdi_64bit/1
    >         pack    a0,a5,a0        # 17    [c=12 l=4]  riscv_xpack_di_si_2
    
    On the implementation side, I think the bits I've put in here can likely
    be used to handle the repeating constant case for !zbkb.  They could also
    help capture cases where the upper half can be derived from the lower
    half (say by turning a bit on or off, shifting, or something similar).
    The key in both of these cases is that we need a temporary register
    holding an intermediate value.
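
    To make that last point concrete, here's a rough, purely illustrative C
    sketch of the repeating-constant idea for the !zbkb case (not something
    this patch implements): synthesize the 32-bit half once, save it in a
    temporary, then build the full value from a shifted copy combined with
    the saved copy.  On the real machine the saved register holds a
    sign-extended value, so the final combine needs some care, which is
    exactly what makes pack convenient.

    /* Hypothetical illustration only.  */
    #include <stdint.h>

    static uint64_t repeat_half (uint32_t half)
    {
      uint64_t t = half;      /* the 32-bit half, synthesized once and saved */
      return (t << 32) | t;   /* shifted copy combined with the saved copy */
    }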
    
    Ventana's internal tester enables zbkb, but I don't think any of the other
    testers currently exercise zbkb.  We'll probably want to change that at some
    point, but I don't think it's super-critical yet.
    
    While I can envision a few more cases where we could improve constant
    synthesis, I have no immediate plans to work in this space.  If someone is
    interested, some thoughts are recorded here:
    
    > https://wiki.riseproject.dev/display/HOME/CT_00_031+--+Additional+Constant+Synthesis+Improvements
    
    gcc/
            * config/riscv/riscv.cc (riscv_integer_op): Add new field.
            (riscv_build_integer_1): Initialize the new field.
            (riscv_build_integer): Recognize more cases where Zbkb's
            pack instruction is profitable.
            (riscv_move_integer): Loop over all the codes.  If requested,
            save the current constant into a temporary.  Generate pack
            for more cases using the saved constant.
    
    gcc/testsuite/
            * gcc.target/riscv/synthesis-10.c: New test.
    
    (cherry picked from commit c0ded050cd29cc73f78cb4ab23674c7bc024969e)

Diff:
---
 gcc/config/riscv/riscv.cc                     | 108 ++++++++++++++++++++++----
 gcc/testsuite/gcc.target/riscv/synthesis-10.c |  18 +++++
 2 files changed, 110 insertions(+), 16 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 91fefacee80..10af38a5a81 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -250,6 +250,7 @@ struct riscv_arg_info {
    and each VALUE[i] is a constant integer.  CODE[0] is undefined.  */
 struct riscv_integer_op {
   bool use_uw;
+  bool save_temporary;
   enum rtx_code code;
   unsigned HOST_WIDE_INT value;
 };
@@ -759,6 +760,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
       return 1;
     }
   if (TARGET_ZBS && SINGLE_BIT_MASK_OPERAND (value))
@@ -767,6 +769,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
 
       /* RISC-V sign-extends all 32bit values that live in a 32bit
 	 register.  To avoid paradoxes, we thus need to use the
@@ -796,6 +799,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = PLUS;
 	  alt_codes[alt_cost-1].value = low_part;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -810,6 +814,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = XOR;
 	  alt_codes[alt_cost-1].value = low_part;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -852,6 +857,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = ASHIFT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = use_uw;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -873,9 +879,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  codes[0].value = (((unsigned HOST_WIDE_INT) value >> trailing_ones)
 			    | (value << (64 - trailing_ones)));
 	  codes[0].use_uw = false;
+	  codes[0].save_temporary = false;
 	  codes[1].code = ROTATERT;
 	  codes[1].value = 64 - trailing_ones;
 	  codes[1].use_uw = false;
+	  codes[1].save_temporary = false;
 	  cost = 2;
 	}
       /* Handle the case where the 11 bit range of zero bits wraps around.  */
@@ -888,9 +896,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 			    | ((unsigned HOST_WIDE_INT) value
 			       >> (32 + upper_trailing_ones)));
 	  codes[0].use_uw = false;
+	  codes[0].save_temporary = false;
 	  codes[1].code = ROTATERT;
 	  codes[1].value = 32 - upper_trailing_ones;
 	  codes[1].use_uw = false;
+	  codes[1].save_temporary = false;
 	  cost = 2;
 	}
 
@@ -917,6 +927,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	      alt_codes[alt_cost].code = AND;
 	      alt_codes[alt_cost].value = ~(1UL << bit);
 	      alt_codes[alt_cost].use_uw = false;
+	      alt_codes[alt_cost].save_temporary = false;
 	      alt_cost++;
 	      nval &= ~(1UL << bit);
 	    }
@@ -938,6 +949,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 9;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -948,6 +960,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 5;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -958,6 +971,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 3;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -978,6 +992,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost - 1].code = PLUS;
 	  alt_codes[alt_cost - 1].value = adjustment;
 	  alt_codes[alt_cost - 1].use_uw = false;
+	  alt_codes[alt_cost - 1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -995,6 +1010,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
 	  alt_codes[i].value = value & 0x7ffff000;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~0x7ffff000;
 	   i++;
 	}
@@ -1005,6 +1021,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
 	  alt_codes[i].value = value & 0x7ff;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~0x7ff;
 	  i++;
 	}
@@ -1016,6 +1033,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
 	  alt_codes[i].value = 1UL << bit;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~(1ULL << bit);
 	  i++;
 	}
@@ -1057,6 +1075,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost-1].code = LSHIFTRT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1069,6 +1088,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost-1].code = LSHIFTRT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1093,6 +1113,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost - 1].code = XOR;
 	  alt_codes[alt_cost - 1].value = -1;
 	  alt_codes[alt_cost - 1].use_uw = false;
+	  alt_codes[alt_cost - 1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1128,13 +1149,55 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
   if (cost > 3 && TARGET_64BIT && TARGET_ZBKB)
     {
       unsigned HOST_WIDE_INT loval = value & 0xffffffff;
-      unsigned HOST_WIDE_INT hival = value & ~loval;
-      if (hival >> 32 == loval)
+      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+      if (hival == loval)
 	{
 	  cost = 1 + riscv_build_integer_1 (codes, sext_hwi (loval, 32), mode);
 	  codes[cost - 1].code = CONCAT;
 	  codes[cost - 1].value = 0;
 	  codes[cost - 1].use_uw = false;
+	  codes[cost - 1].save_temporary = false;
+	}
+
+      /* An arbitrary 64 bit constant can be synthesized in 5 instructions
+	 using zbkb.  We may do better than that if the upper or lower half
+	 can be synthesized with a single LUI, ADDI or BSET.  Regardless, the
+	 basic steps are the same.  */
+      if (cost > 3 && can_create_pseudo_p ())
+	{
+	  struct riscv_integer_op hi_codes[RISCV_MAX_INTEGER_OPS];
+	  struct riscv_integer_op lo_codes[RISCV_MAX_INTEGER_OPS];
+	  int hi_cost, lo_cost;
+
+	  /* Synthesize and get cost for each half.  */
+	  lo_cost
+	    = riscv_build_integer_1 (lo_codes, sext_hwi (loval, 32), mode);
+	  hi_cost
+	    = riscv_build_integer_1 (hi_codes, sext_hwi (hival, 32), mode);
+
+	  /* If profitable, finish synthesis using zbkb.  */
+	  if (cost > hi_cost + lo_cost + 1)
+	    {
+	      /* We need the low half independent of the high half.  So
+		 mark it as creating a temporary we'll use later.  */
+	      memcpy (codes, lo_codes,
+		      lo_cost * sizeof (struct riscv_integer_op));
+	      codes[lo_cost - 1].save_temporary = true;
+
+	      /* Now the high half synthesis.  */
+	      memcpy (codes + lo_cost, hi_codes,
+		      hi_cost * sizeof (struct riscv_integer_op));
+
+	      /* Adjust the cost.  */
+	      cost = hi_cost + lo_cost + 1;
+
+	      /* And finally (ab)use VEC_MERGE to indicate we want to
+		 merge the two parts together.  */
+	      codes[cost - 1].code = VEC_MERGE;
+	      codes[cost - 1].value = 0;
+	      codes[cost - 1].use_uw = false;
+	      codes[cost - 1].save_temporary = false;
+	    }
 	}
 
     }
@@ -2656,23 +2719,25 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
     x = riscv_split_integer (value, mode);
   else
     {
-      codes[0].value = trunc_int_for_mode (codes[0].value, mode);
-      /* Apply each binary operation to X. */
-      x = GEN_INT (codes[0].value);
-
-      for (i = 1; i < num_ops; i++)
+      rtx old_value = NULL_RTX;
+      for (i = 0; i < num_ops; i++)
 	{
-	  if (!can_create_pseudo_p ())
+	  if (i != 0 && !can_create_pseudo_p ())
 	    x = riscv_emit_set (temp, x);
-	  else
+	  else if (i != 0)
 	    x = force_reg (mode, x);
 	  codes[i].value = trunc_int_for_mode (codes[i].value, mode);
-	  /* If the sequence requires using a "uw" form of an insn, we're
-	     going to have to construct the RTL ourselves and put it in
-	     a register to avoid force_reg/force_operand from mucking things
-	     up.  */
-	  if (codes[i].use_uw)
+	  if (codes[i].code == UNKNOWN)
 	    {
+	      /* UNKNOWN means load the constant value into X.  */
+	      x = GEN_INT (codes[i].value);
+	    }
+	  else if (codes[i].use_uw)
+	    {
+	      /* If the sequence requires using a "uw" form of an insn, we're
+		 going to have to construct the RTL ourselves and put it in
+		 a register to avoid force_reg/force_operand from mucking
+		 things up.  */
 	      gcc_assert (TARGET_64BIT || TARGET_ZBA);
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
 
@@ -2695,16 +2760,27 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
 	      x = riscv_emit_set (t, x);
 	    }
-	  else if (codes[i].code == CONCAT)
+	  else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
 	    {
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
-	      rtx t2 = gen_lowpart (SImode, x);
+	      rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+	      gcc_assert (t2);
+	      t2 = gen_lowpart (SImode, t2);
 	      emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
 	      x = t;
 	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
 				x, GEN_INT (codes[i].value));
+
+	  /* If this entry in the code table indicates we should save away
+	     the temporary holding the current value of X, then do so.  */
+	  if (codes[i].save_temporary)
+	    {
+	      gcc_assert (old_value == NULL_RTX);
+	      x = force_reg (mode, x);
+	      old_value = x;
+	    }
 	}
     }
 
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-10.c b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
new file mode 100644
index 00000000000..0838ec9af47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbkb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori)" 10 } } */
+
+unsigned long foo_0x1425000028000000(void) { return 0x1425000028000000UL; }
+unsigned long foo_0xf057f2def857f2de(void) { return 0xf057f2def857f2deUL; }

