[PATCH] Add a heuristic for eliminate redundant load and store in inline pass.

Cui,Lili lili.cui@intel.com
Wed Jul 6 10:50:43 GMT 2022


From: Lili <lili.cui@intel.com>


Hi Hubicka,

This patch is to add a heuristic inline hint to eliminate redundant load and store.

Bootstrap and regtest pending on x86_64-unknown-linux-gnu.
OK for trunk?

Thanks,
Lili.

Add a INLINE_HINT_eliminate_load_and_store hint in to inline pass.
We accumulate the insn number of redundant load and store that can be
reduced by these three cases, when the count size is greater than the
threshold, we will enable the hint. with the hint, inlining_insns_auto
will enlarge the bound.

1. Caller's store is same with callee's load
2. Caller's load is same with callee's load
3. Callee's load is same with caller's local memory access

With the patch applied
Icelake server: 538.imagic_r get 14.10% improvement for multicopy and 38.90%
improvement for single copy with no measurable changes for other benchmarks.

Casecadelake: 538.imagic_r get 12.5% improvement for multicopy with and code
size increased by 0.2%. With no measurable changes for other benchmarks.

Znver3 server: 538.imagic_r get 14.20% improvement for multicopy with and codei
size increased by 0.3%. With no measurable changes for other benchmarks.

CPU2017 single copy performance data for Icelake server
BenchMarks           Score   Build time  Code size
500.perlbench_r      1.50%   -0.20%      0.00%
502.gcc_r            0.10%   -0.10%      0.00%
505.mcf_r            0.00%   1.70%       0.00%
520.omnetpp_r        -0.60%  -0.30%      0.00%
523.xalancbmk_r      0.60%   0.00%       0.00%
525.x264_r           0.00%   -0.20%      0.00%
531.deepsjeng_r      0.40%   -1.10%      -0.10%
541.leela_r          0.00%   0.00%       0.00%
548.exchange2_r      0.00%   -0.90%      0.00%
557.xz_r             0.00%   0.00%       0.00%
503.bwaves_r         0.00%   1.40%       0.00%
507.cactuBSSN_r      0.00%   1.00%       0.00%
508.namd_r           0.00%   0.30%       0.00%
510.parest_r         0.00%   -0.40%      0.00%
511.povray_r         0.70%   -0.60%      0.00%
519.lbm_r            0.00%   0.00%       0.00%
521.wrf_r            0.00%   0.60%       0.00%
526.blender_r        0.00%   0.00%       0.00%
527.cam4_r           -0.30%  -0.50%      0.00%
538.imagick_r        38.90%  0.50%       0.20%
544.nab_r            0.00%   1.10%       0.00%
549.fotonik3d_r      0.00%   0.90%       0.00%
554.roms_r           2.30%   -0.10%      0.00%
Geomean-int          0.00%   -0.30%      0.00%
Geomean-fp           3.80%   0.30%       0.00%

gcc/ChangeLog:

	* ipa-fnsummary.cc (ipa_dump_hints): Add print for hint "eliminate_load_and_store"
	* ipa-fnsummary.h (enum ipa_hints_vals): Add INLINE_HINT_eliminate_load_and_store.
	* ipa-inline-analysis.cc (do_estimate_edge_time): Add judgment for INLINE_HINT_eliminate_load_and_store.
	* ipa-inline.cc (want_inline_small_function_p): Add "INLINE_HINT_eliminate_load_and_store" for hints flag.
	* ipa-modref-tree.h (struct modref_access_node): Move function contains to public..
	(struct modref_tree): Add new function "same" and "local_vector_memory_accesse"
	* ipa-modref.cc (eliminate_load_and_store): New.
	(ipa_merge_modref_summary_after_inlining): Change the input value of useful_p.
	* ipa-modref.h (eliminate_load_and_store): New.
	* opts.cc: Add param "min_inline_hint_eliminate_loads_num"
	* params.opt: Ditto.

gcc/testsuite/ChangeLog:

	* gcc.dg/ipa/inlinehint-6.c: New test.
---
 gcc/ipa-fnsummary.cc                    |   5 ++
 gcc/ipa-fnsummary.h                     |   4 +-
 gcc/ipa-inline-analysis.cc              |   7 ++
 gcc/ipa-inline.cc                       |   3 +-
 gcc/ipa-modref-tree.h                   | 109 +++++++++++++++++++++++-
 gcc/ipa-modref.cc                       |  46 +++++++++-
 gcc/ipa-modref.h                        |   1 +
 gcc/opts.cc                             |   1 +
 gcc/params.opt                          |   4 +
 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c |  54 ++++++++++++
 10 files changed, 229 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index e2a86680a21..0a962f62490 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -150,6 +150,11 @@ ipa_dump_hints (FILE *f, ipa_hints hints)
       hints &= ~INLINE_HINT_builtin_constant_p;
       fprintf (f, " builtin_constant_p");
     }
+  if (hints & INLINE_HINT_eliminate_load_and_store)
+    {
+      hints &= ~INLINE_HINT_eliminate_load_and_store;
+      fprintf (f, " eliminate_load_and_store");
+    }
   gcc_assert (!hints);
 }
 
diff --git a/gcc/ipa-fnsummary.h b/gcc/ipa-fnsummary.h
index 941fea6de0d..5f589e5ea0d 100644
--- a/gcc/ipa-fnsummary.h
+++ b/gcc/ipa-fnsummary.h
@@ -52,7 +52,9 @@ enum ipa_hints_vals {
   INLINE_HINT_known_hot = 128,
   /* There is builtin_constant_p dependent on parameter which is usually
      a strong hint to inline.  */
-  INLINE_HINT_builtin_constant_p = 256
+  INLINE_HINT_builtin_constant_p = 256,
+  /* Inlining can eliminate redundant load and store.  */
+  INLINE_HINT_eliminate_load_and_store = 512
 };
 
 typedef int ipa_hints;
diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
index 1ca685d1b0e..c10f6f5c71e 100644
--- a/gcc/ipa-inline-analysis.cc
+++ b/gcc/ipa-inline-analysis.cc
@@ -43,6 +43,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-prop.h"
 #include "ipa-fnsummary.h"
 #include "ipa-inline.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
 #include "cfgloop.h"
 #include "tree-scalar-evolution.h"
 #include "ipa-utils.h"
@@ -260,6 +262,11 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
 	     : edge->caller->count.ipa ())))
     hints |= INLINE_HINT_known_hot;
 
+  if (edge->caller->frequency == NODE_FREQUENCY_HOT
+      && edge->callee->frequency == NODE_FREQUENCY_HOT
+      && eliminate_load_and_store (edge))
+    hints |= INLINE_HINT_eliminate_load_and_store;
+
   gcc_checking_assert (size >= 0);
   gcc_checking_assert (time >= 0);
 
diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 14969198cde..04715560d97 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -910,7 +910,8 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report)
       bool apply_hints = (hints & (INLINE_HINT_indirect_call
 				   | INLINE_HINT_known_hot
 				   | INLINE_HINT_loop_iterations
-				   | INLINE_HINT_loop_stride));
+				   | INLINE_HINT_loop_stride
+				   | INLINE_HINT_eliminate_load_and_store));
       bool apply_hints2 = (hints & INLINE_HINT_builtin_constant_p);
 
       if (growth <= opt_for_fn (to->decl,
diff --git a/gcc/ipa-modref-tree.h b/gcc/ipa-modref-tree.h
index b788beed477..c859c81dbbd 100644
--- a/gcc/ipa-modref-tree.h
+++ b/gcc/ipa-modref-tree.h
@@ -117,8 +117,8 @@ struct GTY(()) modref_access_node
      around: if information is lost, the kill is lost.  */
   static bool insert_kill (vec<modref_access_node> &kills,
 			   modref_access_node &a, bool record_adjustments);
-private:
   bool contains (const modref_access_node &) const;
+private:
   bool contains_for_kills (const modref_access_node &) const;
   void update (poly_int64, poly_int64, poly_int64, poly_int64, bool);
   bool update_for_kills (poly_int64, poly_int64, poly_int64,
@@ -474,6 +474,113 @@ struct GTY((user)) modref_tree
 		    base, ref, a, record_adjustments);
   }
 
+  /* Try to find if caller and callee have same accesses, except for the
+     caller's local memory.  */
+  int same (modref_tree <T> *from, vec <modref_parm_map> *parm_map)
+    {
+      size_t i, j, k, l;
+      int count = 0;
+      modref_base_node <T> *base_node, *my_base_node;
+      modref_ref_node <T> *ref_node, *my_ref_node;
+      modref_access_node *access_node, *my_access_node;
+
+      if (from->every_base || every_base)
+	return count;
+      FOR_EACH_VEC_SAFE_ELT (from->bases, i, base_node)
+	{
+	  if (base_node->every_ref
+	      || !(my_base_node = search (base_node->base)))
+	    continue;
+	  FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
+	    {
+	      if (ref_node->every_access
+		  || !(my_ref_node = my_base_node->search (ref_node->ref))
+		  || my_ref_node->every_access)
+		continue;
+	      FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
+		{
+		  modref_access_node a = *access_node;
+		  if (a.parm_index != MODREF_UNKNOWN_PARM
+		      && a.parm_index != MODREF_GLOBAL_MEMORY_PARM
+		      && parm_map)
+		    {
+		      if (a.parm_index >= (int)parm_map->length ())
+			a.parm_index = MODREF_UNKNOWN_PARM;
+		      else
+			{
+			  if (a.parm_index == MODREF_STATIC_CHAIN_PARM)
+			    continue;
+			  modref_parm_map &m = (*parm_map) [a.parm_index];
+			  if (m.parm_index == MODREF_LOCAL_MEMORY_PARM)
+			    continue;
+			  a.parm_offset += m.parm_offset;
+			  a.parm_offset_known &= m.parm_offset_known;
+			  a.parm_index = m.parm_index;
+			}
+		    }
+		  FOR_EACH_VEC_SAFE_ELT (my_ref_node->accesses, l,
+					 my_access_node)
+		    {
+		      if (my_access_node->contains (a))
+			{
+			  int size = a.size.coeffs[0] / BITS_PER_UNIT;
+			  count += (size + MOVE_MAX_PIECES - 1)
+				   / MOVE_MAX_PIECES;
+			}
+		    }
+		}
+	    }
+	}
+      return count;
+    }
+
+  /* Try to find if callee has same accesses with caller's local memory.  */
+  int local_memory_accesses (vec <modref_parm_map> *parm_map)
+    {
+      size_t i, j, k;
+      modref_base_node <T> *base_node;
+      modref_ref_node <T> *ref_node;
+      modref_access_node *access_node;
+      int count = 0;
+
+      if (every_base || !parm_map)
+	return count;
+
+      FOR_EACH_VEC_SAFE_ELT (bases, i, base_node)
+	{
+	  if (base_node->every_ref)
+	    continue;
+	  FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
+	    {
+	      if (ref_node->every_access)
+		continue;
+	      FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
+		{
+		  if (access_node->parm_index < (int) parm_map->length ()
+		      && access_node->parm_index != MODREF_UNKNOWN_PARM
+		      && access_node->parm_index
+		      != MODREF_GLOBAL_MEMORY_PARM
+		      && access_node->parm_index
+		      != MODREF_STATIC_CHAIN_PARM)
+		    {
+		      /* If callee's memory access is caller's local
+			 memory, inlining the callee function will
+			 reduce paired of loads and stores.  */
+		      if ((*parm_map)[access_node->parm_index].parm_index
+			  == MODREF_LOCAL_MEMORY_PARM)
+			{
+			  int size = access_node->size.coeffs[0]
+				     / BITS_PER_UNIT;
+			  count += (size + MOVE_MAX_PIECES - 1)
+				   / MOVE_MAX_PIECES;
+			}
+		    }
+		}
+	    }
+	}
+      return count;
+    }
+
  /* Remove tree branches that are not useful (i.e. they will always pass).  */
 
  void cleanup ()
diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index 0d9abacf0a6..a7b68070c13 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -5231,6 +5231,48 @@ modref_propagate_flags_in_scc (cgraph_node *component_node)
 
 }  /* ANON namespace.  */
 
+/* If inlining can eliminate load and store.  */
+bool
+eliminate_load_and_store (cgraph_edge *edge)
+{
+  if (!summaries && !summaries_lto)
+    return false;
+
+  cgraph_node *to = edge->caller->inlined_to
+    ? edge->caller->inlined_to : edge->caller;
+  cgraph_node *from = edge->callee;
+  modref_summary *to_info = summaries ? summaries->get (to) : NULL;
+  modref_summary *from_info = summaries ? summaries->get (from) : NULL;
+  modref_summary_lto *to_info_lto = summaries_lto
+    ? summaries_lto->get (to) : NULL;
+  modref_summary_lto *from_info_lto = summaries_lto
+    ? summaries_lto->get (from) : NULL;
+  int count = 0;
+  auto_vec <modref_parm_map, 32> parm_map;
+  compute_parm_map (edge, &parm_map);
+
+  if (to_info && from_info && to != from)
+    {
+      /* Accumulate the number of insns for redundant loads and stores.
+	 For the following three cases.
+
+	 1. Caller's store is same with callee's load
+	 2. Caller's load is same with callee's load
+	 3. Callee's load is same with caller's local memory access */
+      count += to_info->stores->same (from_info->loads, &parm_map)
+	+ to_info->loads->same (from_info->loads, &parm_map)
+	+ from_info->loads->local_memory_accesses (&parm_map);
+    }
+
+  if (to_info_lto && from_info_lto && (to != from))
+    {
+      count += to_info_lto->stores->same (from_info_lto->loads, &parm_map)
+	+ to_info_lto->loads->same (from_info_lto->loads, &parm_map)
+	+ from_info_lto->loads->local_memory_accesses (&parm_map);
+    }
+  return count >= param_min_inline_hint_eliminate_loads_num;
+}
+
 /* Call EDGE was inlined; merge summary from callee to the caller.  */
 
 void
@@ -5407,7 +5449,7 @@ ipa_merge_modref_summary_after_inlining (cgraph_edge *edge)
 
   if (summaries)
     {
-      if (to_info && !to_info->useful_p (flags))
+      if (to_info && !to_info->useful_p (flags, false))
 	{
 	  if (dump_file)
 	    fprintf (dump_file, "Removed mod-ref summary for %s\n",
@@ -5427,7 +5469,7 @@ ipa_merge_modref_summary_after_inlining (cgraph_edge *edge)
     }
   if (summaries_lto)
     {
-      if (to_info_lto && !to_info_lto->useful_p (flags))
+      if (to_info_lto && !to_info_lto->useful_p (flags, false))
 	{
 	  if (dump_file)
 	    fprintf (dump_file, "Removed mod-ref summary for %s\n",
diff --git a/gcc/ipa-modref.h b/gcc/ipa-modref.h
index 19114bcb429..805f5937493 100644
--- a/gcc/ipa-modref.h
+++ b/gcc/ipa-modref.h
@@ -75,6 +75,7 @@ modref_summary *get_modref_function_summary (cgraph_node *func);
 modref_summary *get_modref_function_summary (gcall *call, bool *interposed);
 void ipa_modref_cc_finalize ();
 void ipa_merge_modref_summary_after_inlining (cgraph_edge *e);
+bool eliminate_load_and_store (cgraph_edge *e);
 
 /* All flags that are implied by the ECF_CONST functions.  */
 static const int implicit_const_eaf_flags
diff --git a/gcc/opts.cc b/gcc/opts.cc
index fe0293e4283..2f14e5c3b1b 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -686,6 +686,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_3_PLUS, OPT__param_inline_heuristics_hint_percent_, NULL, 600 },
     { OPT_LEVELS_3_PLUS, OPT__param_inline_min_speedup_, NULL, 15 },
     { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_single_, NULL, 200 },
+    { OPT_LEVELS_3_PLUS, OPT__param_min_inline_hint_eliminate_loads_num_, NULL, 3 },
 
     /* -Ofast adds optimizations to -O3.  */
     { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
diff --git a/gcc/params.opt b/gcc/params.opt
index 2f9c9cf27dd..461a4fab1ba 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -566,6 +566,10 @@ The maximum depth of recursive inlining for inline functions.
 Common Joined UInteger Var(param_max_inline_recursive_depth_auto) Optimization Init(8) Param
 The maximum depth of recursive inlining for non-inline functions.
 
+-param=min_inline_hint_eliminate_loads_num=
+Common Joined UInteger Var(param_min_inline_hint_eliminate_loads_num) Optimization Init(8) Param
+The minimum of loads that can be eliminated.
+
 -param=max-isl-operations=
 Common Joined UInteger Var(param_max_isl_operations) Init(350000) Param Optimization
 Maximum number of isl operations, 0 means unlimited.
diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
new file mode 100644
index 00000000000..e27f7b912e6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
@@ -0,0 +1,54 @@
+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
+/* { dg-add-options bind_pic_locally } */
+
+#define size_t long long int
+
+struct B
+{
+  size_t f1, f2, f3, f4, f5;
+};
+
+struct B x;
+
+struct A
+{
+  size_t f1, f2, f3, f4;
+};
+struct C
+{
+  struct A a;
+  size_t b;
+};
+
+__attribute__((hot)) size_t callee (struct A *a, struct C *c)
+{
+  /* *a is caller's local memory, caller needs to store it on the stack, then
+     callee loads it. If inline callee, it reduces a pair of load and store. */
+  c->a=(*a);
+
+  /* Caller also has load c->b, If inline callee, this load can be eliminated. */
+  if((c->b + 7) & 17)
+   {
+      x.f1 = c->a.f2 + c->a.f1;
+      x.f2 = c->a.f3 - c->a.f2;
+      x.f3 = c->a.f2 + c->a.f3;
+      x.f4 = c->a.f2 - c->a.f4;
+      return 0;
+    }
+  return 1;
+}
+
+__attribute__((hot)) size_t caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
+{
+  struct A a;
+  a.f1 = d;
+  a.f2 = e;
+  a.f3 = f;
+  a.f4 = g;
+  if (c->b > 0)
+    return callee (&a, c);
+  else
+    return 1;
+}
+
+/* { dg-final { scan-ipa-dump "eliminate_load_and_store"  "inline"  } } */
-- 
2.17.1



More information about the Gcc-patches mailing list