This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Fix three inliner issues
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org, rguenther at suse dot de, matz at suse dot de
- Date: Tue, 13 Apr 2010 18:09:51 +0200
- Subject: Fix three inliner issues
Hi,
this patch fixes several problems in inliner:
1) Michael and Richi noticed that when we optimize out an offline function, we account
its size twice in the code size computations.
2) update_caller_keys updated the fibheap incorrectly, so sometimes the
key in the heap was not updated to the new cost.
3) small function inlining did not update the cost of the calls to the offline
copy. This is wrong, as the inliner tries to prioritize functions that
get fully inlined by making the cost of every call depend on the estimated
growth after inlining everything. When one of the calls gets inlined, all
the other calls get cheaper.
I also added a dumping facility to cgraph_edge_badness that helps to track down
the reasons for inliner (mis)decisions. This needs an extra recomputation of the
cost since the costs are determined when functions are inserted into heap and
updated several times before function becomes minimum of the heap. Dumping
reasons at that time makes dumps very difficult to parse, so I am dumping at
the time we are removing function from heap only.
Fixing 1) leads to regression in tramp3d and a few other benchmarks, where this
double accounting happens particularly often and thus we now give up before we
inline everything important. I did some testing here last two weeks and this
can be fixed by bumping unit growth from 30% to 50%. Everything except for
tramp3d is fixed by bumping to 40% and code size is still at average smaller.
All these three fixes together makes tramp3d happy (10% smaller, 7% faster)
than unpatched mainline. So I am going to commit the patch once testing on
x86_64-linux is complete, because all the changes are fixes of quite obvious
bugs.
I did some analysis of tramp3d. The main problem here is that the unit size
growth cutoff happens at quite arbitrary place. The badness computation gives
results in a very small range (0-400) that is sort of correct. All the functions
are very small and inlining them is locally a good idea. We just can't inline
them all.
We can account more thoroughly the loop levels (frequencies), but I guess we
also should take into account whether the function being inlined contains other
calls that are candidates for inlining or not thus attempting to produce more
leaf functions (at least leaf in the compilation unit POV. The actual leaf
functions are already prioritized both in early and late inlining). I will
experiment with this on pretty-ipa.
Honza
* ipa-inline.c (cgraph_mark_inline_edge): Avoid double accounting
of optimized out static functions.
(cgraph_edge_badness): Add DUMP parameter and dump reasons for the
cost computation. Also sanity check for overflows.
(update_caller_keys): Update cgraph_edge_badness call; properly
update fibheap and sanity check that it is up to date.
(add_new_edges_to_heap): Update cgraph_edge_badness.
(cgraph_decide_inlining_of_small_function): Likewise;
add sanity checking that badness in heap is up to date;
improve dumping of reason; Update badness of calls to the
offline copy of function currently inlined; dump badness
of functions not inlined because of unit growth limits.
Index: ipa-inline.c
===================================================================
--- ipa-inline.c (revision 158273)
+++ ipa-inline.c (working copy)
@@ -306,8 +306,6 @@ cgraph_mark_inline_edge (struct cgraph_e
struct cgraph_node *to = NULL, *what;
struct cgraph_edge *curr = e;
int freq;
- bool duplicate = false;
- int orig_size = e->callee->global.size;
gcc_assert (e->inline_failed);
e->inline_failed = CIF_OK;
@@ -316,10 +314,6 @@ cgraph_mark_inline_edge (struct cgraph_e
DECL_POSSIBLY_INLINED (e->callee->decl) = true;
e->callee->global.inlined = true;
- if (e->callee->callers->next_caller
- || !cgraph_can_remove_if_no_direct_calls_p (e->callee)
- || e->callee->same_comdat_group)
- duplicate = true;
cgraph_clone_inlined_nodes (e, true, update_original);
what = e->callee;
@@ -337,8 +331,6 @@ cgraph_mark_inline_edge (struct cgraph_e
gcc_assert (what->global.inlined_to == to);
if (new_size > old_size)
overall_size += new_size - old_size;
- if (!duplicate)
- overall_size -= orig_size;
ncalls_inlined++;
if (flag_indirect_inlining)
@@ -544,23 +536,53 @@ cgraph_recursive_inlining_p (struct cgra
of the function or function body size. */
static int
-cgraph_edge_badness (struct cgraph_edge *edge)
+cgraph_edge_badness (struct cgraph_edge *edge, bool dump)
{
gcov_type badness;
int growth =
- cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee);
-
- growth -= edge->caller->global.size;
+ (cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee)
+ - edge->caller->global.size);
+ if (dump)
+ {
+ fprintf (dump_file, " Badness calculcation for %s -> %s\n",
+ cgraph_node_name (edge->caller),
+ cgraph_node_name (edge->callee));
+ fprintf (dump_file, " growth %i, time %i-%i, size %i-%i\n",
+ growth,
+ edge->callee->global.time,
+ inline_summary (edge->callee)->time_inlining_benefit,
+ edge->callee->global.size,
+ inline_summary (edge->callee)->size_inlining_benefit);
+ }
/* Always prefer inlining saving code size. */
if (growth <= 0)
- badness = INT_MIN - growth;
+ {
+ badness = INT_MIN - growth;
+ if (dump)
+ fprintf (dump_file, " %i: Growth %i < 0\n", (int) badness,
+ growth);
+ }
/* When profiling is available, base priorities -(#calls / growth).
So we optimize for overall number of "executed" inlined calls. */
else if (max_count)
- badness = ((int)((double)edge->count * INT_MIN / max_count / (max_benefit + 1))
- * (inline_summary (edge->callee)->time_inlining_benefit + 1)) / growth;
+ {
+ badness =
+ ((int)
+ ((double) edge->count * INT_MIN / max_count / (max_benefit + 1)) *
+ (inline_summary (edge->callee)->time_inlining_benefit + 1)) / growth;
+ if (dump)
+ {
+ fprintf (dump_file,
+ " %i (relative %f): profile info. Relative count %f"
+ " * Relative benefit %f\n",
+ (int) badness, (double) badness / INT_MIN,
+ (double) edge->count / max_count,
+ (double) (inline_summary (edge->callee)->
+ time_inlining_benefit + 1) / (max_benefit + 1));
+ }
+ }
/* When function local profile is available, base priorities on
growth / frequency, so we optimize for overall frequency of inlined
@@ -574,9 +596,13 @@ cgraph_edge_badness (struct cgraph_edge
else if (flag_guess_branch_prob)
{
int div = edge->frequency * 100 / CGRAPH_FREQ_BASE + 1;
+ int benefitperc;
+ int growth_for_all;
badness = growth * 10000;
- div *= MIN (100 * inline_summary (edge->callee)->time_inlining_benefit
- / (edge->callee->global.time + 1) + 1, 100);
+ benefitperc =
+ MIN (100 * inline_summary (edge->callee)->time_inlining_benefit /
+ (edge->callee->global.time + 1) +1, 100);
+ div *= benefitperc;
/* Decrease badness if call is nested. */
@@ -587,9 +613,17 @@ cgraph_edge_badness (struct cgraph_edge
div = 1;
if (badness > 0)
badness /= div;
- badness += cgraph_estimate_growth (edge->callee);
+ growth_for_all = cgraph_estimate_growth (edge->callee);
+ badness += growth_for_all;
if (badness > INT_MAX)
- badness = INT_MAX;
+ badness = INT_MAX;
+ if (dump)
+ {
+ fprintf (dump_file,
+ " %i: guessed profile. frequency %i, overall growth %i,"
+ " benefit %i%%, divisor %i\n",
+ (int) badness, edge->frequency, growth_for_all, benefitperc, div);
+ }
}
/* When function local profile is not available or it does not give
useful information (ie frequency is zero), base the cost on
@@ -604,10 +638,17 @@ cgraph_edge_badness (struct cgraph_edge
if (badness > 0)
badness >>= nest;
else
- {
+ {
badness <<= nest;
- }
+ }
+ if (dump)
+ fprintf (dump_file, " %i: no profile. nest %i\n", (int) badness,
+ nest);
}
+
+ /* Ensure that we did not overflow in all the fixed point math above. */
+ gcc_assert (badness >= INT_MIN);
+ gcc_assert (badness <= INT_MAX - 1);
/* Make recursive inlining happen always after other inlining is done. */
if (cgraph_recursive_inlining_p (edge->caller, edge->callee, NULL))
return badness + 1;
@@ -651,7 +692,7 @@ update_caller_keys (fibheap_t heap, stru
for (edge = node->callers; edge; edge = edge->next_caller)
if (edge->inline_failed)
{
- int badness = cgraph_edge_badness (edge);
+ int badness = cgraph_edge_badness (edge, false);
if (edge->aux)
{
fibnode_t n = (fibnode_t) edge->aux;
@@ -660,8 +701,12 @@ update_caller_keys (fibheap_t heap, stru
continue;
/* fibheap_replace_key only increase the keys. */
- if (fibheap_replace_key (heap, n, badness))
- continue;
+ if (badness < n->key)
+ {
+ fibheap_replace_key (heap, n, badness);
+ gcc_assert (n->key == badness);
+ continue;
+ }
fibheap_delete_node (heap, (fibnode_t) edge->aux);
}
edge->aux = fibheap_insert (heap, badness, edge);
@@ -889,7 +934,7 @@ add_new_edges_to_heap (fibheap_t heap, V
struct cgraph_edge *edge = VEC_pop (cgraph_edge_p, new_edges);
gcc_assert (!edge->aux);
- edge->aux = fibheap_insert (heap, cgraph_edge_badness (edge), edge);
+ edge->aux = fibheap_insert (heap, cgraph_edge_badness (edge, false), edge);
}
}
@@ -938,7 +983,7 @@ cgraph_decide_inlining_of_small_function
if (edge->inline_failed)
{
gcc_assert (!edge->aux);
- edge->aux = fibheap_insert (heap, cgraph_edge_badness (edge), edge);
+ edge->aux = fibheap_insert (heap, cgraph_edge_badness (edge, false), edge);
}
}
@@ -946,18 +991,27 @@ cgraph_decide_inlining_of_small_function
min_size = overall_size;
while (overall_size <= max_size
- && (edge = (struct cgraph_edge *) fibheap_extract_min (heap)))
+ && !fibheap_empty (heap))
{
int old_size = overall_size;
- struct cgraph_node *where;
- int growth =
- cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee);
+ struct cgraph_node *where, *callee;
+ int badness = fibheap_min_key (heap);
+ int growth;
cgraph_inline_failed_t not_good = CIF_OK;
- growth -= edge->caller->global.size;
+ edge = (struct cgraph_edge *) fibheap_extract_min (heap);
+#ifdef ENABLE_CHECKING
+ gcc_assert (cgraph_edge_badness (edge, false) == badness);
+#endif
+ callee = edge->callee;
+
+ growth = (cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee)
+ - edge->caller->global.size);
if (dump_file)
{
+ if (dump_flags & TDF_DETAILS)
+ cgraph_edge_badness (edge, true);
fprintf (dump_file,
"\nConsidering %s with %i size\n",
cgraph_node_name (edge->callee),
@@ -970,7 +1024,7 @@ cgraph_decide_inlining_of_small_function
gimple_filename ((const_gimple) edge->call_stmt),
gimple_lineno ((const_gimple) edge->call_stmt),
cgraph_estimate_growth (edge->callee),
- cgraph_edge_badness (edge),
+ badness,
edge->frequency / (double)CGRAPH_FREQ_BASE);
if (edge->count)
fprintf (dump_file," Called "HOST_WIDEST_INT_PRINT_DEC"x\n", edge->count);
@@ -1096,6 +1150,11 @@ cgraph_decide_inlining_of_small_function
called by function we inlined (since number of it inlinable callers
might change). */
update_caller_keys (heap, where, updated_nodes);
+
+ /* We removed one call of the function we just inlined. If offline
+ copy is still needed, be sure to update the keys. */
+ if (callee != where && !callee->global.inlined_to)
+ update_caller_keys (heap, callee, updated_nodes);
bitmap_clear (updated_nodes);
if (dump_file)
@@ -1117,8 +1176,38 @@ cgraph_decide_inlining_of_small_function
fprintf (dump_file, "New minimal size reached: %i\n", min_size);
}
}
- while ((edge = (struct cgraph_edge *) fibheap_extract_min (heap)) != NULL)
+ while (!fibheap_empty (heap))
{
+ int badness = fibheap_min_key (heap);
+
+ edge = (struct cgraph_edge *) fibheap_extract_min (heap);
+#ifdef ENABLE_CHECKING
+ gcc_assert (cgraph_edge_badness (edge, false) == badness);
+#endif
+ if (dump_file)
+ {
+ if (dump_flags & TDF_DETAILS)
+ {
+ int badness2 = cgraph_edge_badness (edge, true);
+ gcc_assert (badness == badness2);
+ }
+ fprintf (dump_file,
+ "\nSkipping %s with %i size\n",
+ cgraph_node_name (edge->callee),
+ edge->callee->global.size);
+ fprintf (dump_file,
+ " called by %s in %s:%i\n"
+ " Estimated growth after inlined into all callees is %+i insns.\n"
+ " Estimated badness is %i, frequency %.2f.\n",
+ cgraph_node_name (edge->caller),
+ gimple_filename ((const_gimple) edge->call_stmt),
+ gimple_lineno ((const_gimple) edge->call_stmt),
+ cgraph_estimate_growth (edge->callee),
+ badness,
+ edge->frequency / (double)CGRAPH_FREQ_BASE);
+ if (edge->count)
+ fprintf (dump_file," Called "HOST_WIDEST_INT_PRINT_DEC"x\n", edge->count);
+ }
gcc_assert (edge->aux);
edge->aux = NULL;
if (!edge->callee->local.disregard_inline_limits && edge->inline_failed