This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Inliner badness fixes&cleanups


Hi,
while I plan to deffer the heuristics tunning once rest of IPA infrastructure updates
is on the place, I noticed that there are several obvious nonseces in the
badness computation.  This patch fix that.  The changes are:

 1) inline_call increases overall program size when inlining increase size
    but does not decrease it when inlining decrease size.  This is illogical
    and my recollection is that I handled this way tramp3d where there are tons
    of unit decreasing inlines first. This was before early inliner was introduced
    and at a time we did not update unit growth estimates when unit size shrinks
    by inlining. So removed.
 2) Badness computation used funny log limits on overly large divisors.  This is nonsense
    because it does not matter much if you divide by 10000 or 10001, also the frequencies
    are within known range and it is a lot better to use this knowledge.
 3) relative time benefit computation is factored out into separate functions
 4) all fixed point arithmetic is now power of 2 based instead of power of 10 as previously.
    Given that badness calculation became hot spot for Mozilla, there is no need to waste
    cycles on this.
 5) update_edge_key dumping is fixed.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

Honza

	* ipa-inline-transform.c (inline_call): Account when program size decreases.
	* ipa-inline.c (relative_time_benefit): New function.
	(edge_badness): Reorganize to be power 2 based; fix
	thinko when computing badness for negative growth; update
	comments to match reality; better dumps.
	(update_edge_key): Dump before updating.
Index: ipa-inline-transform.c
===================================================================
*** ipa-inline-transform.c	(revision 173531)
--- ipa-inline-transform.c	(working copy)
*************** inline_call (struct cgraph_edge *e, bool
*** 184,190 ****
    old_size = inline_summary (to)->size;
    inline_merge_summary (e);
    new_size = inline_summary (to)->size;
!   if (overall_size && new_size > old_size)
      *overall_size += new_size - old_size;
    ncalls_inlined++;
  
--- 184,190 ----
    old_size = inline_summary (to)->size;
    inline_merge_summary (e);
    new_size = inline_summary (to)->size;
!   if (overall_size)
      *overall_size += new_size - old_size;
    ncalls_inlined++;
  
Index: ipa-inline.c
===================================================================
*** ipa-inline.c	(revision 173532)
--- ipa-inline.c	(working copy)
*************** want_inline_function_called_once_p (stru
*** 666,671 ****
--- 666,702 ----
     return true;
  }
  
+ 
+ /* Return relative time improvement for inlining EDGE in range
+    1...2^9.  */
+ 
+ static inline int
+ relative_time_benefit (struct inline_summary *callee_info,
+ 		       struct cgraph_edge *edge,
+ 		       int time_growth)
+ {
+   int relbenefit;
+   gcov_type uninlined_call_time;
+ 
+   uninlined_call_time =
+     ((gcov_type)
+      (callee_info->time
+       + inline_edge_summary (edge)->call_stmt_time
+       + CGRAPH_FREQ_BASE / 2) * edge->frequency
+      / CGRAPH_FREQ_BASE);
+   /* Compute relative time benefit, i.e. how much the call becomes faster.
+      ??? perhaps computing how much the caller+calle together become faster
+      would lead to more realistic results.  */
+   if (!uninlined_call_time)
+     uninlined_call_time = 1;
+   relbenefit =
+     (uninlined_call_time - time_growth) * 256 / (uninlined_call_time);
+   relbenefit = MIN (relbenefit, 512);
+   relbenefit = MAX (relbenefit, 1);
+   return relbenefit;
+ }
+ 
+ 
  /* A cost model driving the inlining heuristics in a way so the edges with
     smallest badness are inlined first.  After each inlining is performed
     the costs of all caller edges of nodes affected are recomputed so the
*************** edge_badness (struct cgraph_edge *edge, 
*** 690,696 ****
        fprintf (dump_file, "    Badness calculation for %s -> %s\n",
  	       cgraph_node_name (edge->caller),
  	       cgraph_node_name (edge->callee));
!       fprintf (dump_file, "      growth size %i, time %i\n",
  	       growth,
  	       time_growth);
      }
--- 721,727 ----
        fprintf (dump_file, "    Badness calculation for %s -> %s\n",
  	       cgraph_node_name (edge->caller),
  	       cgraph_node_name (edge->callee));
!       fprintf (dump_file, "      size growth %i, time growth %i\n",
  	       growth,
  	       time_growth);
      }
*************** edge_badness (struct cgraph_edge *edge, 
*** 698,723 ****
    /* Always prefer inlining saving code size.  */
    if (growth <= 0)
      {
!       badness = INT_MIN - growth;
        if (dump)
! 	fprintf (dump_file, "      %i: Growth %i < 0\n", (int) badness,
  		 growth);
      }
  
!   /* When profiling is available, base priorities -(#calls / growth).
!      So we optimize for overall number of "executed" inlined calls.  */
    else if (max_count)
      {
!       int benefitperc;
!       benefitperc = (((gcov_type)callee_info->time
! 		     * edge->frequency / CGRAPH_FREQ_BASE - time_growth) * 100
! 		     / (callee_info->time + 1) + 1);
!       benefitperc = MIN (benefitperc, 100);
!       benefitperc = MAX (benefitperc, 0);
        badness =
  	((int)
! 	 ((double) edge->count * INT_MIN / max_count / 100) *
! 	 benefitperc) / growth;
        
        /* Be sure that insanity of the profile won't lead to increasing counts
  	 in the scalling and thus to overflow in the computation above.  */
--- 729,757 ----
    /* Always prefer inlining saving code size.  */
    if (growth <= 0)
      {
!       badness = INT_MIN / 2 + growth;
        if (dump)
! 	fprintf (dump_file, "      %i: Growth %i <= 0\n", (int) badness,
  		 growth);
      }
  
!   /* When profiling is available, compute badness as:
! 
! 	        relative_edge_count * relative_time_benefit
!      goodness = -------------------------------------------
! 		edge_growth
!      badness = -goodness  
! 
!     The fraction is upside down, becuase on edge counts and time beneits
!     the bounds are known. Edge growth is essentially unlimited.  */
! 
    else if (max_count)
      {
!       int relbenefit = relative_time_benefit (callee_info, edge, time_growth);
        badness =
  	((int)
! 	 ((double) edge->count * INT_MIN / 2 / max_count / 512) *
! 	 relative_time_benefit (callee_info, edge, time_growth)) / growth;
        
        /* Be sure that insanity of the profile won't lead to increasing counts
  	 in the scalling and thus to overflow in the computation above.  */
*************** edge_badness (struct cgraph_edge *edge, 
*** 729,779 ****
  		   " * Relative benefit %f\n",
  		   (int) badness, (double) badness / INT_MIN,
  		   (double) edge->count / max_count,
! 		   (double) benefitperc);
  	}
      }
  
!   /* When function local profile is available, base priorities on
!      growth / frequency, so we optimize for overall frequency of inlined
!      calls.  This is not too accurate since while the call might be frequent
!      within function, the function itself is infrequent.
! 
!      Other objective to optimize for is number of different calls inlined.
!      We add the estimated growth after inlining all functions to bias the
!      priorities slightly in this direction (so fewer times called functions
!      of the same size gets priority).  */
    else if (flag_guess_branch_prob)
      {
!       int div = edge->frequency * 100 / CGRAPH_FREQ_BASE + 1;
!       int benefitperc;
        int growth_for_all;
-       badness = growth * 10000;
-       benefitperc = (((gcov_type)callee_info->time
- 		     * edge->frequency / CGRAPH_FREQ_BASE - time_growth) * 100
- 		     / (callee_info->time + 1) + 1);
-       benefitperc = MIN (benefitperc, 100);
-       benefitperc = MAX (benefitperc, 0);
-       div *= benefitperc;
  
!       /* Decrease badness if call is nested.  */
!       /* Compress the range so we don't overflow.  */
!       if (div > 10000)
! 	div = 10000 + ceil_log2 (div) - 8;
!       if (div < 1)
! 	div = 1;
!       if (badness > 0)
! 	badness /= div;
        growth_for_all = estimate_growth (edge->callee);
        badness += growth_for_all;
!       if (badness > INT_MAX)
! 	badness = INT_MAX;
        if (dump)
  	{
  	  fprintf (dump_file,
! 		   "      %i: guessed profile. frequency %i, overall growth %i,"
! 		   " benefit %i%%, divisor %i\n",
! 		   (int) badness, edge->frequency, growth_for_all,
! 		   benefitperc, div);
  	}
      }
    /* When function local profile is not available or it does not give
--- 763,824 ----
  		   " * Relative benefit %f\n",
  		   (int) badness, (double) badness / INT_MIN,
  		   (double) edge->count / max_count,
! 		   relbenefit * 100 / 256.0);
  	}
      }
  
!   /* When function local profile is available. Compute badness as:
! 
!      
!                growth_of_callee
!      badness = -------------------------------------- + growth_for-all
! 	       relative_time_benefit * edge_frequency
! 
!   */
    else if (flag_guess_branch_prob)
      {
!       int div = edge->frequency * (1<<10) / CGRAPH_FREQ_MAX;
        int growth_for_all;
  
!       div = MAX (div, 1);
!       gcc_checking_assert (edge->frequency <= CGRAPH_FREQ_MAX);
!       div *= relative_time_benefit (callee_info, edge, time_growth);
! 
!       /* frequency is normalized in range 1...2^10.
!          relbenefit in range 1...2^9
! 	 DIV should be in range 1....2^19.  */
!       gcc_checking_assert (div >= 1 && div <= (1<<19));
! 
!       /* Result must be integer in range 0...INT_MAX.
! 	 Set the base of fixed point calculation so we don't lose much of
! 	 precision for small bandesses (those are interesting) yet we don't
! 	 overflow for growths that are still in interesting range.  */
!       badness = ((gcov_type)growth) * (1<<18);
!       badness = (badness + div / 2) / div;
! 
!       /* Overall growth of inlining all calls of function matters: we want to
! 	 inline so offline copy of function is no longer needed.
! 
! 	 Additionally functions that can be fully inlined without much of
! 	 effort are better inline candidates than functions that can be fully
! 	 inlined only after noticeable overall unit growths. The latter
! 	 are better in a sense compressing of code size by factoring out common
! 	 code into separate function shared by multiple code paths.
! 
! 	 We might mix the valud into the fraction by taking into account
! 	 relative growth of the unit, but for now just add the number
! 	 into resulting fraction.  */
        growth_for_all = estimate_growth (edge->callee);
        badness += growth_for_all;
!       if (badness > INT_MAX - 1)
! 	badness = INT_MAX - 1;
        if (dump)
  	{
  	  fprintf (dump_file,
! 		   "      %i: guessed profile. frequency %f, overall growth %i,"
! 		   " benefit %f%%, divisor %i\n",
! 		   (int) badness, (double)edge->frequency / CGRAPH_FREQ_BASE, growth_for_all,
! 		   relative_time_benefit (callee_info, edge, time_growth) * 100 / 256.0, div);
  	}
      }
    /* When function local profile is not available or it does not give
*************** update_edge_key (fibheap_t heap, struct 
*** 823,829 ****
  	 a minimum of heap.  */
        if (badness < n->key)
  	{
- 	  fibheap_replace_key (heap, n, badness);
  	  if (dump_file && (dump_flags & TDF_DETAILS))
  	    {
  	      fprintf (dump_file,
--- 868,873 ----
*************** update_edge_key (fibheap_t heap, struct 
*** 833,838 ****
--- 877,883 ----
  		       (int)n->key,
  		       badness);
  	    }
+ 	  fibheap_replace_key (heap, n, badness);
  	  gcc_checking_assert (n->key == badness);
  	}
      }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]