This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

New badness metric for inliner


Hi,
this patch implements new badness metric for inliner.  It is now based on
relative speedup, but not of calee, but of the caller + callee combo.  This
nicely encompasses the edge frequency and other parameters simplifying the
actual badness code. I re-added code considering growth after inlining into
all callees, but this time without any attempt to keep the value up to
date across inlining that has shown to be expensive to do.

The patch also adds two new inline hints: INLINE_HINT_declared_inline and
INLINE_HINT_cross_module. INLINE_HINT_declared_inline decreases badness of the
call so explicitely declared functions are more likely inlined.
INLINE_HINT_cross_module increase badness since programs are usually organized
in a way that cross module inlining is less important than inter-module
inlining still.

While the patch ended up quite simple, it is result of much experimentation and
bugfixing ;)
I have bootstrapped/regtested the patch on x86_64-linux and tested on SPEC2000,
SPEC2006, c++ benchmark suite and Mozilla.  The patch is generally quite
considerable win. Most nice speedup are seen on the C++ benchmark suite where
it leads to nice code size savings and speedups. (i.e. tramp3d is now 30%
smaller and faster than before) http://gcc.opensuse.org/c++bench-frescobaldi/

There are few benchmarks that are not on historically best scores over my
experiments - botan (that regress relative to 4.7), fatigue and c-ray. I will
address these incrementally.  I will also do incrementally tunning with profile
feedback.

Honza

	* ipa-inline.c (compute_uninlined_call_time,
	compute_inlined_call_time): New functions.
	(RELATIVE_TIME_BENEFIT_RANGE): New macro.
	(relative_time_benefit): Rewrite.
	(edge_badness): Rewrite path with guessed profile and estimated profile.
	* ipa-inline.h (INLINE_HINT_declared_inline, INLINE_HINT_cross_module):
	New hints.
	(struct inline_summary): Add GROWTH filed.
	* ipa-inline-analysis.c (dump_inline_hints): Update.
	(reset_inline_summary): Update.
	(dump_inline_summary): Update.
	(will_be_nonconstant_predicate): Cleanup to use gimple_store_p and
	gimple_assign_load_p predicates.
	(estimate_node_size_and_time): Drop INLINE_HINT_declared_inline hint.
	(simple_edge_hints): New function.
	(do_estimate_edge_time): Return time of invocation of callee rather
	than the time scaled by edge frequency; update hints code.
	(do_estimate_edge_hints): Update.
	(do_estimate_growth): Cleanup.
Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 193157)
+++ ipa-inline.c	(working copy)
@@ -456,6 +456,42 @@ want_early_inline_function_p (struct cgr
   return want_inline;
 }
 
+/* Compute time of the edge->caller + edge->callee execution when inlining
+   does not happen.  */
+
+inline int
+compute_uninlined_call_time (struct inline_summary *callee_info,
+			     struct cgraph_edge *edge)
+{
+  int uninlined_call_time =
+    RDIV ((gcov_type)callee_info->time * MAX (edge->frequency, 1),
+	  CGRAPH_FREQ_BASE);
+  int caller_time = inline_summary (edge->caller->global.inlined_to
+				    ? edge->caller->global.inlined_to
+				    : edge->caller)->time;
+  return uninlined_call_time + caller_time;
+}
+
+/* Same as compute_uinlined_call_time but compute time when inlining
+   does happen.  */
+
+inline gcov_type
+compute_inlined_call_time (struct cgraph_edge *edge,
+			   int edge_time)
+{
+  int caller_time = inline_summary (edge->caller->global.inlined_to
+				    ? edge->caller->global.inlined_to
+				    : edge->caller)->time;
+  int time = caller_time + RDIV ((edge_time - inline_edge_summary (edge)->call_stmt_time)
+			         * MAX (edge->frequency, 1),
+				 CGRAPH_FREQ_BASE);
+  /* Possible one roundoff error, but watch for overflows.  */
+  gcc_checking_assert (time >= INT_MIN / 2);
+  if (time < 0)
+    time = 0;
+  return time;
+}
+
 /* Return true if we are interested in inlining small function.
    When REPORT is true, report reason to dump file.  */
 
@@ -724,31 +760,41 @@ want_inline_function_to_all_callers_p (s
    return true;
 }
 
+#define RELATIVE_TIME_BENEFIT_RANGE (INT_MAX / 64)
 
 /* Return relative time improvement for inlining EDGE in range
-   1...2^9.  */
+   1...RELATIVE_TIME_BENEFIT_RANGE  */
 
 static inline int
 relative_time_benefit (struct inline_summary *callee_info,
 		       struct cgraph_edge *edge,
-		       int time_growth)
+		       int edge_time)
 {
   int relbenefit;
-  gcov_type uninlined_call_time;
+  int uninlined_call_time = compute_uninlined_call_time (callee_info, edge);
+  int inlined_call_time = compute_inlined_call_time (edge, edge_time);
+
+  /* Inlining into extern inline function is not a win.  */
+  if (DECL_EXTERNAL (edge->caller->global.inlined_to
+		     ? edge->caller->global.inlined_to->symbol.decl
+		     : edge->caller->symbol.decl))
+    return 1;
+
+  /* Watch overflows.  */
+  gcc_checking_assert (uninlined_call_time >= 0);
+  gcc_checking_assert (inlined_call_time >= 0);
+  gcc_checking_assert (uninlined_call_time >= inlined_call_time);
 
-  uninlined_call_time =
-    ((gcov_type)
-     (callee_info->time
-      + inline_edge_summary (edge)->call_stmt_time) * edge->frequency
-     + CGRAPH_FREQ_BASE / 2) / CGRAPH_FREQ_BASE;
   /* Compute relative time benefit, i.e. how much the call becomes faster.
      ??? perhaps computing how much the caller+calle together become faster
      would lead to more realistic results.  */
   if (!uninlined_call_time)
     uninlined_call_time = 1;
   relbenefit =
-    (uninlined_call_time - time_growth) * 256 / (uninlined_call_time);
-  relbenefit = MIN (relbenefit, 512);
+    RDIV (((gcov_type)uninlined_call_time - inlined_call_time) * RELATIVE_TIME_BENEFIT_RANGE,
+	  uninlined_call_time);
+  relbenefit = MIN (relbenefit, RELATIVE_TIME_BENEFIT_RANGE);
+  gcc_checking_assert (relbenefit >= 0);
   relbenefit = MAX (relbenefit, 1);
   return relbenefit;
 }
@@ -764,7 +810,7 @@ static int
 edge_badness (struct cgraph_edge *edge, bool dump)
 {
   gcov_type badness;
-  int growth, time_growth;
+  int growth, edge_time;
   struct cgraph_node *callee = cgraph_function_or_thunk_node (edge->callee,
 							      NULL);
   struct inline_summary *callee_info = inline_summary (callee);
@@ -774,17 +820,20 @@ edge_badness (struct cgraph_edge *edge, 
     return INT_MIN;
 
   growth = estimate_edge_growth (edge);
-  time_growth = estimate_edge_time (edge);
+  edge_time = estimate_edge_time (edge);
   hints = estimate_edge_hints (edge);
+  gcc_checking_assert (edge_time >= 0);
+  gcc_checking_assert (edge_time <= callee_info->time);
+  gcc_checking_assert (growth <= callee_info->size);
 
   if (dump)
     {
       fprintf (dump_file, "    Badness calculation for %s -> %s\n",
 	       xstrdup (cgraph_node_name (edge->caller)),
 	       xstrdup (cgraph_node_name (callee)));
-      fprintf (dump_file, "      size growth %i, time growth %i ",
+      fprintf (dump_file, "      size growth %i, time %i ",
 	       growth,
-	       time_growth);
+	       edge_time);
       dump_inline_hints (dump_file, hints);
       fprintf (dump_file, "\n");
     }
@@ -802,7 +851,7 @@ edge_badness (struct cgraph_edge *edge, 
 
 	        relative_edge_count * relative_time_benefit
      goodness = -------------------------------------------
-		edge_growth
+		growth_f_caller
      badness = -goodness  
 
     The fraction is upside down, because on edge counts and time beneits
@@ -810,11 +859,11 @@ edge_badness (struct cgraph_edge *edge, 
 
   else if (max_count)
     {
-      int relbenefit = relative_time_benefit (callee_info, edge, time_growth);
+      int relbenefit = relative_time_benefit (callee_info, edge, edge_time);
       badness =
 	((int)
-	 ((double) edge->count * INT_MIN / 2 / max_count / 512) *
-	 relative_time_benefit (callee_info, edge, time_growth)) / growth;
+	 ((double) edge->count * INT_MIN / 2 / max_count / RELATIVE_TIME_BENEFIT_RANGE) *
+	 relbenefit) / growth;
       
       /* Be sure that insanity of the profile won't lead to increasing counts
 	 in the scalling and thus to overflow in the computation above.  */
@@ -826,73 +875,53 @@ edge_badness (struct cgraph_edge *edge, 
 		   " * Relative benefit %f\n",
 		   (int) badness, (double) badness / INT_MIN,
 		   (double) edge->count / max_count,
-		   relbenefit * 100 / 256.0);
+		   relbenefit * 100.0 / RELATIVE_TIME_BENEFIT_RANGE);
 	}
     }
 
   /* When function local profile is available. Compute badness as:
-
      
-               growth_of_callee
-     badness = -------------------------------------- + growth_for-all
-	       relative_time_benefit * edge_frequency
+                 relative_time_benefit
+     goodness =  ---------------------------------
+	         growth_of_caller * overall_growth
 
+     badness = - goodness
+
+     compensated by the inline hints.
   */
   else if (flag_guess_branch_prob)
     {
-      int div = edge->frequency * (1<<10) / CGRAPH_FREQ_MAX;
-
-      div = MAX (div, 1);
-      gcc_checking_assert (edge->frequency <= CGRAPH_FREQ_MAX);
-      div *= relative_time_benefit (callee_info, edge, time_growth);
-
-      /* frequency is normalized in range 1...2^10.
-         relbenefit in range 1...2^9
-	 DIV should be in range 1....2^19.  */
-      gcc_checking_assert (div >= 1 && div <= (1<<19));
-
-      /* Result must be integer in range 0...INT_MAX.
-	 Set the base of fixed point calculation so we don't lose much of
-	 precision for small bandesses (those are interesting) yet we don't
-	 overflow for growths that are still in interesting range.
-
-	 Fixed point arithmetic with point at 6th bit. */
-      badness = ((gcov_type)growth) * (1<<(19+6));
-      badness = (badness + div / 2) / div;
-
-      /* Overall growth of inlining all calls of function matters: we want to
-	 inline so offline copy of function is no longer needed.
-
-	 Additionally functions that can be fully inlined without much of
-	 effort are better inline candidates than functions that can be fully
-	 inlined only after noticeable overall unit growths. The latter
-	 are better in a sense compressing of code size by factoring out common
-	 code into separate function shared by multiple code paths.
-
-	 We might mix the valud into the fraction by taking into account
-	 relative growth of the unit, but for now just add the number
-	 into resulting fraction.  */
-      if (badness > INT_MAX / 8)
-	{
-	  badness = INT_MAX / 8;
-	  if (dump)
-	    fprintf (dump_file, "Badness overflow\n");
-	}
-      if (hints & (INLINE_HINT_indirect_call
-		   | INLINE_HINT_loop_iterations
-		   | INLINE_HINT_loop_stride))
-	badness /= 8;
+      badness = (relative_time_benefit (callee_info, edge, edge_time)
+		 * (INT_MIN / 16 / RELATIVE_TIME_BENEFIT_RANGE));
+      badness /= (growth * MAX (1, callee_info->growth));
+      gcc_checking_assert (badness <=0 && badness >= INT_MIN / 16);
+      if ((hints & (INLINE_HINT_indirect_call
+		    | INLINE_HINT_loop_iterations
+		    | INLINE_HINT_loop_stride))
+	  || callee_info->growth <= 0)
+	badness *= 8;
       if (hints & (INLINE_HINT_same_scc))
-	badness *= 4;
-      if (hints & (INLINE_HINT_in_scc))
-	badness *= 2;
+	badness /= 16;
+      else if (hints & (INLINE_HINT_in_scc))
+	badness /= 8;
+      else if (hints & (INLINE_HINT_cross_module))
+	badness /= 2;
+      gcc_checking_assert (badness <= 0 && badness >= INT_MIN / 2);
+      if ((hints & INLINE_HINT_declared_inline) && badness >= INT_MIN / 32)
+	badness *= 16;
       if (dump)
 	{
 	  fprintf (dump_file,
 		   "      %i: guessed profile. frequency %f,"
-		   " benefit %f%%, divisor %i\n",
+		   " benefit %f%%, time w/o inlining %i, time w inlining %i"
+		   " overall growth %i (current) %i (original)\n",
 		   (int) badness, (double)edge->frequency / CGRAPH_FREQ_BASE,
-		   relative_time_benefit (callee_info, edge, time_growth) * 100 / 256.0, div);
+		   relative_time_benefit (callee_info, edge, edge_time) * 100.0
+		   / RELATIVE_TIME_BENEFIT_RANGE, 
+		   compute_uninlined_call_time (callee_info, edge),
+		   (int)compute_inlined_call_time (edge, edge_time),
+		   estimate_growth (callee),
+		   callee_info->growth);
 	}
     }
   /* When function local profile is not available or it does not give
@@ -1371,6 +1400,7 @@ inline_small_functions (void)
 
 	    if (!DECL_EXTERNAL (node->symbol.decl))
 	      initial_size += info->size;
+	    info->growth = estimate_growth (node);
 	    if (dfs && dfs->next_cycle)
 	      {
 		struct cgraph_node *n2;
Index: ipa-inline.h
===================================================================
--- ipa-inline.h	(revision 193155)
+++ ipa-inline.h	(working copy)
@@ -49,7 +49,9 @@ enum inline_hints_vals {
   INLINE_HINT_loop_iterations = 2,
   INLINE_HINT_loop_stride = 4,
   INLINE_HINT_same_scc = 8,
-  INLINE_HINT_in_scc = 16
+  INLINE_HINT_in_scc = 16,
+  INLINE_HINT_declared_inline = 32,
+  INLINE_HINT_cross_module = 64
 };
 typedef int inline_hints;
 
@@ -129,6 +131,12 @@ struct GTY(()) inline_summary
   /* Predicate on when some loop in the function becomes to have known
      stride.   */
   struct predicate * GTY((skip)) loop_stride;
+  /* Estimated growth for inlining all copies of the function before start
+     of small functions inlining.
+     This value will get out of date as the callers are duplicated, but
+     using up-to-date value in the badness metric mean a lot of extra
+     expenses.  */
+  int growth;
   /* Number of SCC on the beggining of inlining process.  */
   int scc_no;
 };
Index: ipa-inline-analysis.c
===================================================================
--- ipa-inline-analysis.c	(revision 193155)
+++ ipa-inline-analysis.c	(working copy)
@@ -649,6 +649,16 @@ dump_inline_hints (FILE *f, inline_hints
       hints &= ~INLINE_HINT_in_scc;
       fprintf (f, " in_scc");
     }
+  if (hints & INLINE_HINT_cross_module)
+    {
+      hints &= ~INLINE_HINT_cross_module;
+      fprintf (f, " cross_module");
+    }
+  if (hints & INLINE_HINT_declared_inline)
+    {
+      hints &= ~INLINE_HINT_declared_inline;
+      fprintf (f, " declared_inline");
+    }
   gcc_assert (!hints);
 }
 
@@ -983,6 +993,7 @@ reset_inline_summary (struct cgraph_node
   info->stack_frame_offset = 0;
   info->size = 0;
   info->time = 0;
+  info->growth = 0;
   info->scc_no = 0;
   if (info->loop_iterations)
     {
@@ -1375,6 +1386,9 @@ dump_inline_summary (FILE * f, struct cg
 	       (int) s->estimated_self_stack_size);
       fprintf (f, "  global stack:    %i\n",
 	       (int) s->estimated_stack_size);
+      if (s->growth)
+        fprintf (f, "  estimated growth:%i\n",
+	         (int) s->growth);
       if (s->scc_no)
         fprintf (f, "  In SCC:          %i\n",
 	         (int) s->scc_no);
@@ -1977,10 +1991,11 @@ will_be_nonconstant_predicate (struct ip
     return p;
 
   /* Stores will stay anyway.  */
-  if (gimple_vdef (stmt))
+  if (gimple_store_p (stmt))
     return p;
 
-  is_load = gimple_vuse (stmt) != NULL;
+  is_load = gimple_assign_load_p (stmt);
+
   /* Loads can be optimized when the value is known.  */
   if (is_load)
     {
@@ -2857,6 +2872,8 @@ estimate_node_size_and_time (struct cgra
     hints |=INLINE_HINT_loop_stride;
   if (info->scc_no)
     hints |= INLINE_HINT_in_scc;
+  if (DECL_DECLARED_INLINE_P (node->symbol.decl))
+    hints |= INLINE_HINT_declared_inline;
 
   estimate_calls_size_and_time (node, &size, &time, &hints, possible_truths,
 				known_vals, known_binfos, known_aggs);
@@ -2865,7 +2882,6 @@ estimate_node_size_and_time (struct cgra
   time = RDIV (time, INLINE_TIME_SCALE);
   size = RDIV (size, INLINE_SIZE_SCALE);
 
-
   if (dump_file
       && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "\n   size:%i time:%i\n", (int)size, (int)time);
@@ -3315,6 +3331,26 @@ inline_update_overall_summary (struct cg
   info->size = (info->size + INLINE_SIZE_SCALE / 2) / INLINE_SIZE_SCALE;
 }
 
+/* Return hints derrived from EDGE.   */
+int
+simple_edge_hints (struct cgraph_edge *edge)
+{
+  int hints = 0;
+  struct cgraph_node *to = (edge->caller->global.inlined_to
+			    ? edge->caller->global.inlined_to
+			    : edge->caller);
+  if (inline_summary (to)->scc_no
+      && inline_summary (to)->scc_no == inline_summary (edge->callee)->scc_no
+      && !cgraph_edge_recursive_p (edge))
+    hints |= INLINE_HINT_same_scc;
+
+  if (to->symbol.lto_file_data && edge->callee->symbol.lto_file_data
+      && to->symbol.lto_file_data != edge->callee->symbol.lto_file_data)
+    hints |= INLINE_HINT_cross_module;
+
+  return hints;
+}
+
 /* Estimate the time cost for the caller when inlining EDGE.
    Only to be called via estimate_edge_time, that handles the
    caching mechanism.
@@ -3328,7 +3364,6 @@ do_estimate_edge_time (struct cgraph_edg
   int time;
   int size;
   inline_hints hints;
-  gcov_type ret;
   struct cgraph_node *callee;
   clause_t clause;
   VEC (tree, heap) *known_vals;
@@ -3347,33 +3382,26 @@ do_estimate_edge_time (struct cgraph_edg
   VEC_free (tree, heap, known_vals);
   VEC_free (tree, heap, known_binfos);
   VEC_free (ipa_agg_jump_function_p, heap, known_aggs);
-
-  ret = RDIV ((gcov_type)time * edge->frequency,
-	      CGRAPH_FREQ_BASE);
+  gcc_checking_assert (size >= 0);
+  gcc_checking_assert (time >= 0);
 
   /* When caching, update the cache entry.  */
   if (edge_growth_cache)
     {
-      struct cgraph_node *to = (edge->caller->global.inlined_to
-			        ? edge->caller->global.inlined_to
-				: edge->caller);
       if ((int)VEC_length (edge_growth_cache_entry, edge_growth_cache)
 	  <= edge->uid)
 	VEC_safe_grow_cleared (edge_growth_cache_entry, heap, edge_growth_cache,
 			       cgraph_edge_max_uid);
       VEC_index (edge_growth_cache_entry, edge_growth_cache, edge->uid).time
-	= ret + (ret >= 0);
+	= time + (time >= 0);
 
       VEC_index (edge_growth_cache_entry, edge_growth_cache, edge->uid).size
 	= size + (size >= 0);
-      if (inline_summary (to)->scc_no
-	  && inline_summary (to)->scc_no == inline_summary (callee)->scc_no
-	  && !cgraph_edge_recursive_p (edge))
-	hints |= INLINE_HINT_same_scc;
+      hints |= simple_edge_hints (edge);
       VEC_index (edge_growth_cache_entry, edge_growth_cache, edge->uid).hints
 	= hints + 1;
     }
-  return ret;
+  return time;
 }
 
 
@@ -3430,9 +3458,6 @@ do_estimate_edge_hints (struct cgraph_ed
   VEC (tree, heap) *known_vals;
   VEC (tree, heap) *known_binfos;
   VEC (ipa_agg_jump_function_p, heap) *known_aggs;
-  struct cgraph_node *to = (edge->caller->global.inlined_to
-		            ? edge->caller->global.inlined_to
-			    : edge->caller);
 
   /* When we do caching, use do_estimate_edge_time to populate the entry.  */
 
@@ -3458,10 +3483,7 @@ do_estimate_edge_hints (struct cgraph_ed
   VEC_free (tree, heap, known_vals);
   VEC_free (tree, heap, known_binfos);
   VEC_free (ipa_agg_jump_function_p, heap, known_aggs);
-  if (inline_summary (to)->scc_no
-      && inline_summary (to)->scc_no == inline_summary (callee)->scc_no
-      && !cgraph_edge_recursive_p (edge))
-    hints |= INLINE_HINT_same_scc;
+  hints |= simple_edge_hints (edge);
   return hints;
 }
 
@@ -3549,10 +3571,11 @@ do_estimate_growth (struct cgraph_node *
      return zero or negative growths. */
   if (d.self_recursive)
     d.growth = d.growth < info->size ? info->size : d.growth;
+  else if (DECL_EXTERNAL (node->symbol.decl))
+    ;
   else
     {
-      if (!DECL_EXTERNAL (node->symbol.decl)
-	  && cgraph_will_be_removed_from_program_if_no_direct_calls (node))
+      if (cgraph_will_be_removed_from_program_if_no_direct_calls (node))
 	d.growth -= info->size;
       /* COMDAT functions are very often not shared across multiple units
 	 since they come from various template instantiations.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]