This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Cleanup and speedup inliner after conversion of heap to sreals
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org
- Date: Tue, 16 Dec 2014 23:08:24 +0100
- Subject: Cleanup and speedup inliner after conversion of heap to sreals
- Authentication-results: sourceware.org; auth=none
Hi,
conversion to sreal makes it possible to compute badness in a more streamlined
manner. Together with the sreal::normalize change this patch finally makes
the fibheap badness calculation drop off the radar in profiling, and I hope it
makes the code more maintainable by eliminating the issues from roundoff errors
and overflows that were quite painful.
Incrementally it is also possible to turn time computation into sreals.
Bootstrapped/regtested x86_64-linux, will wait with the commit for the
previous changes to show up in the benchmark testers.
Thanks to Martin and Trevor for working on the sreal & fibheap changes.
Honza
* ipa-inline.c (cgraph_freq_base_rec, percent_rec): New functions.
(compute_uninlined_call_time): Return sreal.
(compute_inlined_call_time): Return sreal.
(big_speedup_p): Update to be computed with sreals.
(relative_time_benefit): Turn to sreal computation; return
value as numerator/denominator to save division.
(edge_badness): Rewrite to sreals; remove overflow checks
and cleanup.
(ipa_inline): Initialize cgraph_freq_base_rec and percent_rec.
(inline_small_functions): Update dumping; speedup fibheap maintenance.
(update_edge_key): Update dumping.
Index: ipa-inline.c
===================================================================
--- ipa-inline.c (revision 218730)
+++ ipa-inline.c (working copy)
@@ -148,6 +148,10 @@ static gcov_type max_count;
static sreal max_count_real, max_relbenefit_real, half_int_min_real;
static gcov_type spec_rem;
+/* sreal constants representing 1/CGRAPH_FREQ_BASE and
+ 1/100. */
+static sreal cgraph_freq_base_rec, percent_rec;
+
/* Return false when inlining edge E would lead to violating
limits on function unit growth or stack usage growth.
@@ -517,37 +521,34 @@ want_early_inline_function_p (struct cgr
/* Compute time of the edge->caller + edge->callee execution when inlining
does not happen. */
-inline gcov_type
+inline sreal
compute_uninlined_call_time (struct inline_summary *callee_info,
struct cgraph_edge *edge)
{
- gcov_type uninlined_call_time =
- RDIV ((gcov_type)callee_info->time * MAX (edge->frequency, 1),
- CGRAPH_FREQ_BASE);
- gcov_type caller_time = inline_summary (edge->caller->global.inlined_to
- ? edge->caller->global.inlined_to
- : edge->caller)->time;
+ sreal uninlined_call_time = (sreal)callee_info->time
+ * (sreal)MAX (edge->frequency, 1)
+ * cgraph_freq_base_rec;
+ int caller_time = inline_summary (edge->caller->global.inlined_to
+ ? edge->caller->global.inlined_to
+ : edge->caller)->time;
return uninlined_call_time + caller_time;
}
/* Same as compute_uinlined_call_time but compute time when inlining
does happen. */
-inline gcov_type
+inline sreal
compute_inlined_call_time (struct cgraph_edge *edge,
int edge_time)
{
- gcov_type caller_time = inline_summary (edge->caller->global.inlined_to
- ? edge->caller->global.inlined_to
- : edge->caller)->time;
- gcov_type time = (caller_time
- + RDIV (((gcov_type) edge_time
- - inline_edge_summary (edge)->call_stmt_time)
- * MAX (edge->frequency, 1), CGRAPH_FREQ_BASE));
- /* Possible one roundoff error, but watch for overflows. */
- gcc_checking_assert (time >= INT_MIN / 2);
- if (time < 0)
- time = 0;
+ int caller_time = inline_summary (edge->caller->global.inlined_to
+ ? edge->caller->global.inlined_to
+ : edge->caller)->time;
+ sreal time = (sreal)caller_time
+ + ((sreal)(edge_time - inline_edge_summary (edge)->call_stmt_time)
+ * (sreal)MAX (edge->frequency, 1)
+ * cgraph_freq_base_rec);
+ gcc_checking_assert (time >= 0);
return time;
}
@@ -557,12 +558,10 @@ compute_inlined_call_time (struct cgraph
static bool
big_speedup_p (struct cgraph_edge *e)
{
- gcov_type time = compute_uninlined_call_time (inline_summary (e->callee),
- e);
- gcov_type inlined_time = compute_inlined_call_time (e,
- estimate_edge_time (e));
+ sreal time = compute_uninlined_call_time (inline_summary (e->callee), e);
+ sreal inlined_time = compute_inlined_call_time (e, estimate_edge_time (e));
if (time - inlined_time
- > RDIV (time * PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP), 100))
+ > (sreal)time * (sreal)PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP) * percent_rec)
return true;
return false;
}
@@ -860,42 +859,45 @@ want_inline_function_to_all_callers_p (s
#define RELATIVE_TIME_BENEFIT_RANGE (INT_MAX / 64)
/* Return relative time improvement for inlining EDGE in range
- 1...RELATIVE_TIME_BENEFIT_RANGE */
+ as value NUMERATOR/DENOMINATOR. */
-static inline int
+static inline void
relative_time_benefit (struct inline_summary *callee_info,
struct cgraph_edge *edge,
- int edge_time)
+ int edge_time,
+ sreal *numerator,
+ sreal *denominator)
{
- gcov_type relbenefit;
- gcov_type uninlined_call_time = compute_uninlined_call_time (callee_info, edge);
- gcov_type inlined_call_time = compute_inlined_call_time (edge, edge_time);
-
/* Inlining into extern inline function is not a win. */
if (DECL_EXTERNAL (edge->caller->global.inlined_to
? edge->caller->global.inlined_to->decl
: edge->caller->decl))
- return 1;
+ {
+ *numerator = (sreal)1;
+ *denominator = (sreal)1024;
+ return;
+ }
- /* Watch overflows. */
- gcc_checking_assert (uninlined_call_time >= 0);
- gcc_checking_assert (inlined_call_time >= 0);
- gcc_checking_assert (uninlined_call_time >= inlined_call_time);
+ sreal uninlined_call_time = compute_uninlined_call_time (callee_info, edge);
+ sreal inlined_call_time = compute_inlined_call_time (edge, edge_time);
/* Compute relative time benefit, i.e. how much the call becomes faster.
??? perhaps computing how much the caller+calle together become faster
would lead to more realistic results. */
- if (!uninlined_call_time)
+ if (uninlined_call_time == (sreal)0)
uninlined_call_time = 1;
- relbenefit =
- RDIV (((gcov_type)uninlined_call_time - inlined_call_time) * RELATIVE_TIME_BENEFIT_RANGE,
- uninlined_call_time);
- relbenefit = MIN (relbenefit, RELATIVE_TIME_BENEFIT_RANGE);
- gcc_checking_assert (relbenefit >= 0);
- relbenefit = MAX (relbenefit, 1);
- return relbenefit;
-}
+ /* Avoid zeros, these are not useful later in calculations. */
+ if (uninlined_call_time == inlined_call_time)
+ *numerator = ((sreal)1)>>8;
+ else
+ *numerator = uninlined_call_time - inlined_call_time;
+ *denominator = uninlined_call_time;
+#ifdef ENABLE_CHECKING
+ gcc_checking_assert (*numerator >= 0);
+ gcc_checking_assert (*denominator >= 0);
+#endif
+}
/* A cost model driving the inlining heuristics in a way so the edges with
smallest badness are inlined first. After each inlining is performed
@@ -943,54 +945,39 @@ edge_badness (struct cgraph_edge *edge,
{
badness = INT_MIN / 2 + growth;
if (dump)
- fprintf (dump_file, " %"PRId64": Growth %d <= 0\n", badness.to_int (),
+ fprintf (dump_file, " %f: Growth %d <= 0\n", badness.to_double (),
growth);
}
/* When profiling is available, compute badness as:
- relative_edge_count * relative_time_benefit
+ edge_count * relative_time_benefit
goodness = -------------------------------------------
- growth_f_caller
- badness = -goodness
+ growth_of_caller
+ badness = - goodness
The fraction is upside down, because on edge counts and time beneits
the bounds are known. Edge growth is essentially unlimited. */
else if (max_count)
{
- int relbenefit = relative_time_benefit (callee_info, edge, edge_time);
- /* Capping edge->count to max_count. edge->count can be larger than
- max_count if an inline adds new edges which increase max_count
- after max_count is computed. */
- gcov_type edge_count = edge->count > max_count ? max_count : edge->count;
-
- sreal relbenefit_real (relbenefit, 0);
- sreal growth_real (growth, 0);
-
- /* relative_edge_count. */
- sreal tmp (edge_count, 0);
- tmp /= max_count_real;
-
- /* relative_time_benefit. */
- tmp *= relbenefit_real;
- tmp /= max_relbenefit_real;
-
- /* growth_f_caller. */
- tmp *= half_int_min_real;
- tmp /= growth_real;
+ sreal numerator, denumerator;
+ relative_time_benefit (callee_info, edge, edge_time, &numerator, &denumerator);
+
+ numerator *= edge->count;
+ denumerator *= growth;
- badness = -1 * tmp.to_int ();
+ badness = - numerator/denumerator;
if (dump)
{
+ sreal num,den;
+ relative_time_benefit (callee_info, edge, edge_time, &num, &den);
fprintf (dump_file,
- " %"PRId64" (relative %f): profile info. Relative count %f%s"
- " * Relative benefit %f\n",
- badness.to_int (), (double) badness.to_int () / INT_MIN,
- (double) edge_count / max_count,
- edge->count > max_count ? " (capped to max_count)" : "",
- relbenefit * 100.0 / RELATIVE_TIME_BENEFIT_RANGE);
+ " %f: profile info. count %"PRId64
+ " * Relative benefit %f / growth %i\n",
+ badness.to_double (), (int64_t)edge->count,
+ (num / den * 100).to_double (), growth);
}
}
@@ -1008,36 +995,26 @@ edge_badness (struct cgraph_edge *edge,
and some not. */
else if (flag_guess_branch_prob)
{
- badness = (relative_time_benefit (callee_info, edge, edge_time)
- * (INT_MIN / 16 / RELATIVE_TIME_BENEFIT_RANGE));
- badness /= (MIN (65536/2, growth) * MIN (65536/2, MAX (1, callee_info->growth)));
- gcc_checking_assert (badness <=0 && badness >= INT_MIN / 16);
- if ((hints & (INLINE_HINT_indirect_call
- | INLINE_HINT_loop_iterations
- | INLINE_HINT_array_index
- | INLINE_HINT_loop_stride))
- || callee_info->growth <= 0)
- badness *= 8;
- if (hints & (INLINE_HINT_same_scc))
- badness /= 16;
- else if (hints & (INLINE_HINT_in_scc))
- badness /= 8;
- else if (hints & (INLINE_HINT_cross_module))
- badness /= 2;
- gcc_checking_assert (badness <= 0 && badness >= INT_MIN / 2);
- if ((hints & INLINE_HINT_declared_inline) && badness >= INT_MIN / 32)
- badness *= 16;
+ sreal numerator, denumerator;
+ relative_time_benefit (callee_info, edge, edge_time, &numerator, &denumerator);
+ denumerator *= growth;
+ if (callee_info->growth > 0)
+ denumerator *= callee_info->growth;
+
+ badness = - numerator / denumerator;
+
if (dump)
{
+ sreal num,den;
+ relative_time_benefit (callee_info, edge, edge_time, &num, &den);
fprintf (dump_file,
- " %"PRId64": guessed profile. frequency %f,"
- " benefit %f%%, time w/o inlining %i, time w inlining %i"
+ " %f: guessed profile. frequency %f,"
+ " benefit %f%%, time w/o inlining %f, time w inlining %f"
" overall growth %i (current) %i (original)\n",
- badness.to_int (), (double)edge->frequency / CGRAPH_FREQ_BASE,
- relative_time_benefit (callee_info, edge, edge_time) * 100.0
- / RELATIVE_TIME_BENEFIT_RANGE,
- (int)compute_uninlined_call_time (callee_info, edge),
- (int)compute_inlined_call_time (edge, edge_time),
+ badness.to_double (), (double)edge->frequency / CGRAPH_FREQ_BASE,
+ (num/den).to_double () * 100,
+ compute_uninlined_call_time (callee_info, edge).to_double (),
+ compute_inlined_call_time (edge, edge_time).to_double (),
estimate_growth (callee),
callee_info->growth);
}
@@ -1049,28 +1026,38 @@ edge_badness (struct cgraph_edge *edge,
else
{
int nest = MIN (inline_edge_summary (edge)->loop_depth, 8);
- badness = growth * 256;
+ badness = growth;
/* Decrease badness if call is nested. */
if (badness > 0)
badness = badness >> nest;
else
- {
- badness = badness << nest;
- }
+ badness = badness << nest;
if (dump)
- fprintf (dump_file, " %"PRId64": no profile. nest %i\n", badness.to_int (),
+ fprintf (dump_file, " %f: no profile. nest %i\n", badness.to_double (),
nest);
}
+ gcc_checking_assert (badness != 0);
- /* Ensure that we did not overflow in all the fixed point math above. */
- gcc_assert (badness >= INT_MIN);
- gcc_assert (badness <= INT_MAX - 1);
- /* Make recursive inlining happen always after other inlining is done. */
if (edge->recursive_p ())
- return badness + 1;
- else
- return badness;
+ badness = badness.shift (badness > 0 ? 4 : -4);
+ if ((hints & (INLINE_HINT_indirect_call
+ | INLINE_HINT_loop_iterations
+ | INLINE_HINT_array_index
+ | INLINE_HINT_loop_stride))
+ || callee_info->growth <= 0)
+ badness = badness.shift (badness > 0 ? -2 : 2);
+ if (hints & (INLINE_HINT_same_scc))
+ badness = badness.shift (badness > 0 ? 3 : -3);
+ else if (hints & (INLINE_HINT_in_scc))
+ badness = badness.shift (badness > 0 ? 2 : -2);
+ else if (hints & (INLINE_HINT_cross_module))
+ badness = badness.shift (badness > 0 ? 1 : -1);
+ if ((hints & INLINE_HINT_declared_inline))
+ badness = badness.shift (badness > 0 ? -3 : 3);
+ if (dump)
+ fprintf (dump_file, " Adjusted by hints %f\n", badness.to_double ());
+ return badness;
}
/* Recompute badness of EDGE and update its key in HEAP if needed. */
@@ -1083,26 +1070,26 @@ update_edge_key (edge_heap_t *heap, stru
edge_heap_node_t *n = (edge_heap_node_t *) edge->aux;
gcc_checking_assert (n->get_data () == edge);
- /* fibonacci_heap::replace_key only decrease the keys.
- When we increase the key we do not update heap
- and instead re-insert the element once it becomes
- a minimum of heap. */
+ /* fibonacci_heap::replace_key does busy updating of the
+ heap that is unnecessarily expensive.
+ We do lazy increases: after extracting minimum if the key
+ turns out to be out of date, it is re-inserted into heap
+ with correct value. */
if (badness < n->get_key ())
{
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file,
- " decreasing badness %s/%i -> %s/%i, %"PRId64
- " to %"PRId64"\n",
+ " decreasing badness %s/%i -> %s/%i, %f"
+ " to %f\n",
xstrdup_for_dump (edge->caller->name ()),
edge->caller->order,
xstrdup_for_dump (edge->callee->name ()),
edge->callee->order,
- n->get_key ().to_int (),
- badness.to_int ());
+ n->get_key ().to_double (),
+ badness.to_double ());
}
heap->decrease_key (n, badness);
- gcc_checking_assert (n->get_key () == badness);
}
}
else
@@ -1110,12 +1097,12 @@ update_edge_key (edge_heap_t *heap, stru
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file,
- " enqueuing call %s/%i -> %s/%i, badness %"PRId64"\n",
+ " enqueuing call %s/%i -> %s/%i, badness %f\n",
xstrdup_for_dump (edge->caller->name ()),
edge->caller->order,
xstrdup_for_dump (edge->callee->name ()),
edge->callee->order,
- badness.to_int ());
+ badness.to_double ());
}
edge->aux = heap->insert (badness, edge);
}
@@ -1685,7 +1672,6 @@ inline_small_functions (void)
struct cgraph_node *where, *callee;
sreal badness = edge_heap.min_key ();
sreal current_badness;
- sreal cached_badness;
int growth;
edge = edge_heap.extract_min ();
@@ -1694,10 +1680,9 @@ inline_small_functions (void)
if (!edge->inline_failed || !edge->callee->analyzed)
continue;
- /* Be sure that caches are maintained consistent.
- We can not make this ENABLE_CHECKING only because it cause different
- updates of the fibheap queue. */
- cached_badness = edge_badness (edge, false);
+#ifdef ENABLE_CHECKING
+ /* Be sure that caches are maintained consistent. */
+ sreal cached_badness = edge_badness (edge, false);
reset_edge_growth_cache (edge);
reset_node_growth_cache (edge->callee);
@@ -1707,10 +1692,18 @@ inline_small_functions (void)
current_badness = edge_badness (edge, false);
gcc_assert (cached_badness == current_badness);
gcc_assert (current_badness >= badness);
+#else
+ current_badness = edge_badness (edge, false);
+#endif
if (current_badness != badness)
{
- edge->aux = edge_heap.insert (current_badness, edge);
- continue;
+ if (edge_heap.min () && current_badness > edge_heap.min_key ())
+ {
+ edge->aux = edge_heap.insert (current_badness, edge);
+ continue;
+ }
+ else
+ badness = current_badness;
}
if (!can_inline_edge_p (edge, true))
@@ -1729,13 +1722,13 @@ inline_small_functions (void)
inline_summary (callee)->size);
fprintf (dump_file,
" to be inlined into %s/%i in %s:%i\n"
- " Estimated badness is %"PRId64", frequency %.2f.\n",
+ " Estimated badness is %f, frequency %.2f.\n",
edge->caller->name (), edge->caller->order,
- edge->call_stmt ? "unknown"
+ !edge->call_stmt ? "unknown"
: gimple_filename ((const_gimple) edge->call_stmt),
- edge->call_stmt ? -1
+ !edge->call_stmt ? -1
: gimple_lineno ((const_gimple) edge->call_stmt),
- badness.to_int (),
+ badness.to_double (),
edge->frequency / (double)CGRAPH_FREQ_BASE);
if (edge->count)
fprintf (dump_file," Called %"PRId64"x\n",
@@ -2147,6 +2140,9 @@ ipa_inline (void)
if (!optimize)
return 0;
+ cgraph_freq_base_rec = (sreal) 1 / (sreal) CGRAPH_FREQ_BASE;
+ percent_rec = (sreal) 1 / (sreal) 100;
+
order = XCNEWVEC (struct cgraph_node *, symtab->cgraph_count);
if (in_lto_p && optimize)