[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: avoid LUI based const materialization ... [part of PR/106265]

Jeff Law law@gcc.gnu.org
Tue May 14 18:40:57 GMT 2024


https://gcc.gnu.org/g:de257cc78146b0e518b272de5afc3faa9bbf3669

commit de257cc78146b0e518b272de5afc3faa9bbf3669
Author: Vineet Gupta <vineetg@rivosinc.com>
Date:   Mon May 13 11:45:55 2024 -0700

    RISC-V: avoid LUI based const materialization ... [part of PR/106265]
    
    ... if the constant can be represented as sum of two S12 values.
    The two S12 values could instead be fused with subsequent ADD insn.
    The helps
     - avoid an additional LUI insn
     - side benefits of not clobbering a reg
    
    e.g.
                                w/o patch             w/ patch
    long                  |                     |
    plus(unsigned long i) | li      a5,4096     |
    {                     | addi    a5,a5,-2032 | addi a0, a0, 2047
       return i + 2064;   | add     a0,a0,a5    | addi a0, a0, 17
    }                     |         ret         | ret
    
    NOTE: In theory not having const in a standalone reg might seem less
          CSE friendly, but for workloads in consideration these mat are
          from very late LRA reloads and follow on GCSE is not doing much
          currently.
    
    The real benefit however is seen in base+offset computation for array
    accesses and especially for stack accesses which are finalized late in
    optim pipeline, during LRA register allocation. Often the finalized
    offsets trigger LRA reloads resulting in mind boggling repetition of
    exact same insn sequence including LUI based constant materialization.
    
    This shaves off 290 billion dynamic instrustions (QEMU icounts) in
    SPEC 2017 Cactu benchmark which is over 10% of workload. In the rest of
    suite, there additional 10 billion shaved, with both gains and losses
    in indiv workloads as is usual with compiler changes.
    
     500.perlbench_r-0 |  1,214,534,029,025 | 1,212,887,959,387 |
     500.perlbench_r-1 |    740,383,419,739 |   739,280,308,163 |
     500.perlbench_r-2 |    692,074,638,817 |   691,118,734,547 |
     502.gcc_r-0       |    190,820,141,435 |   190,857,065,988 |
     502.gcc_r-1       |    225,747,660,839 |   225,809,444,357 | <- -0.02%
     502.gcc_r-2       |    220,370,089,641 |   220,406,367,876 | <- -0.03%
     502.gcc_r-3       |    179,111,460,458 |   179,135,609,723 | <- -0.02%
     502.gcc_r-4       |    219,301,546,340 |   219,320,416,956 | <- -0.01%
     503.bwaves_r-0    |    278,733,324,691 |   278,733,323,575 | <- -0.01%
     503.bwaves_r-1    |    442,397,521,282 |   442,397,519,616 |
     503.bwaves_r-2    |    344,112,218,206 |   344,112,216,760 |
     503.bwaves_r-3    |    417,561,469,153 |   417,561,467,597 |
     505.mcf_r         |    669,319,257,525 |   669,318,763,084 |
     507.cactuBSSN_r   |  2,852,767,394,456 | 2,564,736,063,742 | <+ 10.10%
     508.namd_r        |  1,855,884,342,110 | 1,855,881,110,934 |
     510.parest_r      |  1,654,525,521,053 | 1,654,402,859,174 |
     511.povray_r      |  2,990,146,655,619 | 2,990,060,324,589 |
     519.lbm_r         |  1,158,337,294,525 | 1,158,337,294,529 |
     520.omnetpp_r     |  1,021,765,791,283 | 1,026,165,661,394 |
     521.wrf_r         |  1,715,955,652,503 | 1,714,352,737,385 |
     523.xalancbmk_r   |    849,846,008,075 |   849,836,851,752 |
     525.x264_r-0      |    277,801,762,763 |   277,488,776,427 |
     525.x264_r-1      |    927,281,789,540 |   926,751,516,742 |
     525.x264_r-2      |    915,352,631,375 |   914,667,785,953 |
     526.blender_r     |  1,652,839,180,887 | 1,653,260,825,512 |
     527.cam4_r        |  1,487,053,494,925 | 1,484,526,670,770 |
     531.deepsjeng_r   |  1,641,969,526,837 | 1,642,126,598,866 |
     538.imagick_r     |  2,098,016,546,691 | 2,097,997,929,125 |
     541.leela_r       |  1,983,557,323,877 | 1,983,531,314,526 |
     544.nab_r         |  1,516,061,611,233 | 1,516,061,407,715 |
     548.exchange2_r   |  2,072,594,330,215 | 2,072,591,648,318 |
     549.fotonik3d_r   |  1,001,499,307,366 | 1,001,478,944,189 |
     554.roms_r        |  1,028,799,739,111 | 1,028,780,904,061 |
     557.xz_r-0        |    363,827,039,684 |   363,057,014,260 |
     557.xz_r-1        |    906,649,112,601 |   905,928,888,732 |
     557.xz_r-2        |    509,023,898,187 |   508,140,356,932 |
     997.specrand_fr   |        402,535,577 |       403,052,561 |
     999.specrand_ir   |        402,535,577 |       403,052,561 |
    
    This should still be considered damage control as the real/deeper fix
    would be to reduce number of LRA reloads or CSE/anchor those during
    LRA constraint sub-pass (re)runs (thats a different PR/114729.
    
    Implementation Details (for posterity)
    --------------------------------------
     - basic idea is to have a splitter selected via a new predicate for constant
       being possible sum of two S12 and provide the transform.
       This is however a 2 -> 2 transform which combine can't handle.
       So we specify it using a define_insn_and_split.
    
     - the initial loose "i" constraint caused LRA to accept invalid insns thus
       needing a tighter new constraint as well.
    
     - An additional fallback alternate with catch-all "r" register
       constraint also needed to allow any "reloads" that LRA might
       require for ADDI with const larger than S12.
    
    Testing
    --------
    This is testsuite clean (rv64 only).
    I'll rely on post-commit CI multlib run for any possible fallout for
    other setups such as rv32.
    
    |                                               |         gcc |          g++ |     gfortran |
    | rv64imafdc_zba_zbb_zbs_zicond/  lp64d/ medlow |  41 /    17 |    8 /     3 |    7 /     2 |
    | rv64imafdc_zba_zbb_zbs_zicond/  lp64d/ medlow |  41 /    17 |    8 /     3 |    7 /     2 |
    
    I also threw this into a buildroot run, it obviously boots Linux to
    userspace. bloat-o-meter on glibc and kernel show overall decrease in
    staic instruction counts with some minor spot increases.
    These are generally in the category of
    
     - LUI + ADDI are 2 byte each vs. two ADD being 4 byte each.
     - Knock on effects due to inlining changes.
     - Sometimes the slightly shorter 2-insn seq in a mult-exit function
       can cause in-place epilogue duplication (vs. a jump back).
       This is slightly larger but more efficient in execution.
    In summary nothing to fret about.
    
    | linux/scripts/bloat-o-meter build-gcc-240131/target/lib/libc.so.6 \
             build-gcc-240131-new-splitter-1-variant/target/lib/libc.so.6
    |
    | add/remove: 0/0 grow/shrink: 21/49 up/down: 520/-3056 (-2536)
    | Function                                     old     new   delta
    | getnameinfo                                 2756    2892    +136
    ...
    | tempnam                                      136     144      +8
    | padzero                                      276     284      +8
    ...
    | __GI___register_printf_specifier             284     280      -4
    | __EI_xdr_array                               468     464      -4
    | try_file_lock                                268     260      -8
    | pthread_create@GLIBC_2                      3520    3508     -12
    | __pthread_create_2_1                        3520    3508     -12
    ...
    | _nss_files_setnetgrent                       932     904     -28
    | _nss_dns_gethostbyaddr2_r                   1524    1480     -44
    | build_trtable                               3312    3208    -104
    | printf_positional                          25000   22580   -2420
    | Total: Before=2107999, After=2105463, chg -0.12%
    
    Caveat:
    ------
    Jeff noted during v2 review that the operand0 constraint !riscv_reg_frame_related
    could potentially cause issues with hard reg cprop in future. If that
    trips things up we will have to loosen the constraint while dialing down
    the const range to (-2048 to 2032) as opposed to fll S12 range of
    (-2048 to 2047) to keep stack regs aligned.
    
    gcc/ChangeLog:
            * config/riscv/riscv.h: New macros to check for sum of two S12
            range.
            * config/riscv/constraints.md: New constraint.
            * config/riscv/predicates.md: New Predicate.
            * config/riscv/riscv.md: New splitter.
            * config/riscv/riscv.cc (riscv_reg_frame_related): New helper.
            * config/riscv/riscv-protos.h: New helper prototype.
    
    gcc/testsuite/ChangeLog:
            * gcc.target/riscv/sum-of-two-s12-const-1.c: New test: checks
            for new patterns output.
            * gcc.target/riscv/sum-of-two-s12-const-2.c: Ditto.
            * gcc.target/riscv/sum-of-two-s12-const-3.c: New test: should not
            ICE.
    
    Tested-by: Edwin Lu <ewlu@rivosinc.com> # pre-commit-CI #1520
    Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
    (cherry picked from commit 4bfc4585c9935fbde75ccf04e44a15d24f42cde9)

Diff:
---
 gcc/config/riscv/constraints.md                    |  6 +++
 gcc/config/riscv/predicates.md                     |  6 +++
 gcc/config/riscv/riscv-protos.h                    |  1 +
 gcc/config/riscv/riscv.cc                          | 11 ++++++
 gcc/config/riscv/riscv.h                           | 15 ++++++++
 gcc/config/riscv/riscv.md                          | 40 +++++++++++++++++++
 .../gcc.target/riscv/sum-of-two-s12-const-1.c      | 45 ++++++++++++++++++++++
 .../gcc.target/riscv/sum-of-two-s12-const-2.c      | 15 ++++++++
 .../gcc.target/riscv/sum-of-two-s12-const-3.c      | 22 +++++++++++
 9 files changed, 161 insertions(+)

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index a590df545d7d..a9ee346af6f0 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -80,6 +80,12 @@
   (and (match_code "const_int")
        (match_test "LUI_OPERAND (ival)")))
 
+(define_constraint "MiG"
+  "const can be represented as sum of any S12 values."
+  (and (match_code "const_int")
+       (ior (match_test "IN_RANGE (ival,  2048,  4094)")
+	    (match_test "IN_RANGE (ival, -4096, -2049)"))))
+
 (define_constraint "Ds3"
   "@internal
    1, 2 or 3 immediate"
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index e7d797d4dbf0..8948fbfc3631 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -428,6 +428,12 @@
 	return true;
 })
 
+(define_predicate "const_two_s12"
+  (match_code "const_int")
+{
+  return SUM_OF_TWO_S12 (INTVAL (op));
+})
+
 ;; CORE-V Predicates:
 (define_predicate "immediate_register_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 255fd6a0de97..5c8a52b78a22 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -165,6 +165,7 @@ extern bool riscv_shamt_matches_mask_p (int, HOST_WIDE_INT);
 extern void riscv_subword_address (rtx, rtx *, rtx *, rtx *, rtx *);
 extern void riscv_lshift_subword (machine_mode, rtx, rtx, rtx *);
 extern enum memmodel riscv_union_memmodels (enum memmodel, enum memmodel);
+extern bool riscv_reg_frame_related (rtx);
 
 /* Routines implemented in riscv-c.cc.  */
 void riscv_cpu_cpp_builtins (cpp_reader *);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a1e5a014bedf..4067505270e1 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7145,6 +7145,17 @@ riscv_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
   return (to == HARD_FRAME_POINTER_REGNUM || to == STACK_POINTER_REGNUM);
 }
 
+/* Helper to determine if reg X pertains to stack.  */
+bool
+riscv_reg_frame_related (rtx x)
+{
+  return REG_P (x)
+	 && (REGNO (x) == FRAME_POINTER_REGNUM
+	     || REGNO (x) == HARD_FRAME_POINTER_REGNUM
+	     || REGNO (x) == ARG_POINTER_REGNUM
+	     || REGNO (x) == VIRTUAL_STACK_VARS_REGNUM);
+}
+
 /* Implement INITIAL_ELIMINATION_OFFSET.  FROM is either the frame pointer
    or argument pointer.  TO is either the stack pointer or hard frame
    pointer.  */
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 58d0b09bf7d9..0d27c0d378df 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -626,6 +626,21 @@ enum reg_class
   (((VALUE) | ((1UL<<31) - IMM_REACH)) == ((1UL<<31) - IMM_REACH)	\
    || ((VALUE) | ((1UL<<31) - IMM_REACH)) + IMM_REACH == 0)
 
+/* True if a VALUE (constant) can be expressed as sum of two S12 constants
+   (in range -2048 to 2047).
+   Range check logic:
+     from: min S12 + 1 (or -1 depending on what side of zero)
+       to: two times the min S12 value (to max out S12 bits).  */
+
+#define SUM_OF_TWO_S12_N(VALUE)						\
+  (((VALUE) >= (-2048 * 2)) && ((VALUE) <= (-2048 - 1)))
+
+#define SUM_OF_TWO_S12_P(VALUE)						\
+  (((VALUE) >= (2047 + 1)) && ((VALUE) <= (2047 * 2)))
+
+#define SUM_OF_TWO_S12(VALUE)						\
+  (SUM_OF_TWO_S12_N (VALUE) || SUM_OF_TWO_S12_P (VALUE))
+
 /* If this is a single bit mask, then we can load it with bseti.  Special
    handling of SImode 0x80000000 on RV64 is done in riscv_build_integer_1. */
 #define SINGLE_BIT_MASK_OPERAND(VALUE)					\
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index c45b1129b0a0..893040f28541 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -703,6 +703,46 @@
   [(set_attr "type" "arith")
    (set_attr "mode" "DI")])
 
+;; Special case of adding a reg and constant if latter is sum of two S12
+;; values (in range -2048 to 2047). Avoid materialized the const and fuse
+;; into the add (with an additional add for 2nd value). Makes a 3 insn
+;; sequence into 2 insn.
+
+(define_insn_and_split "*add<mode>3_const_sum_of_two_s12"
+  [(set (match_operand:P	 0 "register_operand" "=r,r")
+	(plus:P (match_operand:P 1 "register_operand" " r,r")
+		(match_operand:P 2 "const_two_s12"    " MiG,r")))]
+  "!riscv_reg_frame_related (operands[0])"
+{
+  /* operand matching MiG constraint is always meant to be split.  */
+  if (which_alternative == 0)
+    return "#";
+  else
+    return "add %0,%1,%2";
+}
+  ""
+  [(set (match_dup 0)
+	(plus:P (match_dup 1) (match_dup 3)))
+   (set (match_dup 0)
+	(plus:P (match_dup 0) (match_dup 4)))]
+{
+  int val = INTVAL (operands[2]);
+  if (SUM_OF_TWO_S12_P (val))
+    {
+       operands[3] = GEN_INT (2047);
+       operands[4] = GEN_INT (val - 2047);
+    }
+  else if (SUM_OF_TWO_S12_N (val))
+    {
+       operands[3] = GEN_INT (-2048);
+       operands[4] = GEN_INT (val + 2048);
+    }
+  else
+      gcc_unreachable ();
+}
+  [(set_attr "type" "arith")
+   (set_attr "mode" "<P:MODE>")])
+
 (define_expand "addv<mode>4"
   [(set (match_operand:GPR           0 "register_operand" "=r,r")
 	(plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
diff --git a/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
new file mode 100644
index 000000000000..4d6d135de5f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
@@ -0,0 +1,45 @@
+// TBD: This doesn't quite work for rv32 yet
+/* { dg-do compile } */
+/* { dg-options { -march=rv64gcv -mabi=lp64d } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-Os" "-Oz" } } */
+
+/* Ensure that gcc doesn't generate standlone li reg, 4096.  */
+long
+plus1(unsigned long i)
+{
+   return i + 2048;
+}
+
+long
+plus2(unsigned long i)
+{
+   return i + 4094;
+}
+
+long
+plus3(unsigned long i)
+{
+   return i + 2064;
+}
+
+/* Ensure that gcc doesn't generate standlone li reg, -4096.  */
+long
+minus1(unsigned long i)
+{
+   return i - 4096;
+}
+
+long
+minus2(unsigned long i)
+{
+   return i - 2049;
+}
+
+long
+minus3(unsigned long i)
+{
+   return i - 2064;
+}
+
+/* { dg-final { scan-assembler-not {li\t[a-x0-9]+,-4096} } } */
+/* { dg-final { scan-assembler-not {li\t[a-x0-9]+,4096} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c
new file mode 100644
index 000000000000..9343b43c3106
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c
@@ -0,0 +1,15 @@
+/* Reduced from glibc/stdio-common/tempnam.c.
+   Can't have invalid insn in final output:
+      add s0, sp, 2048    */
+
+/* { dg-do compile } */
+/* { dg-options { -march=rv64gcv -mabi=lp64d -O2 } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "O1" "-Og" "-Os" "-Oz" } } */
+
+int a() {
+  char b[4096];
+  if (a(b))
+    a(b);
+}
+
+/* { dg-final { scan-assembler-not {add\t[a-x0-9]+,sp,2048} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-3.c b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-3.c
new file mode 100644
index 000000000000..5dcab52c2610
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-3.c
@@ -0,0 +1,22 @@
+/* Reduced version of c-c++-common/torture/builtin-convertvector-1.c.  */
+/* This should NOT ICE */
+
+/* { dg-do compile } */
+
+typedef long b __attribute__((vector_size(256 * sizeof(long))));
+typedef double c __attribute__((vector_size(256 * sizeof(double))));
+int d;
+void e(b *f, c *g) { *g = __builtin_convertvector(*f, c); }
+void h() {
+  struct {
+    b i;
+  } j;
+  union {
+    c i;
+    double a[6];
+  } k;
+  e(&j.i, &k.i);
+  if (k.a[d])
+    for (;;)
+      ;
+}


More information about the Gcc-cvs mailing list