[RFC, PATCH] LTO: IPA inline speed up for large apps (Chrome)

Martin Liška mliska@suse.cz
Wed Feb 18 14:13:00 GMT 2015


On 02/18/2015 02:58 PM, Martin Liška wrote:
> On 02/17/2015 10:03 PM, Jan Hubicka wrote:
>> Hi,
>> this patch should chase away the expensive thunks and aliases walks from most
>> of analysis code. I think only real use left is local_p predicate that needs to
>> stay because i386 expects the local flag to match between caller and callee when
>> expanding assembler thunk. I at least optimized it by first moving the walk to
>> be conditional for nonlocal functions only and then reorganizing
>> call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and
>> only then work on thunks.  Most likely this will find the non-local thunk/alias
>> faster.  Other cases were leftovers from the conversion of thunks from aliases
>> to functions.
>>
>> I also noticed a bug in ipa-profile that does not disable all the
>> transforms with !ipa_profile_flag used on OPTIMIZATION_NODE and fixed it.
>>
>> Bootstrapped/regtested x86_64-linux, committed.  I would be interested to
>> know if the call_for_symbol_thunks_and_aliases is now off your oprofiles
>> (sorry, easier to type than perf-profiles)
>>
>> Honza
>>
>>     * ipa-visibility.c (function_and_variable_visibility): Only
>>     check locality if node is not already local.
>>     * ipa-inline.c (want_inline_function_to_all_callers_p): Use
>>     call_for_symbol_and_aliases instead of
>>     call_for_symbol_thunks_and_aliases.
>>     (ipa_inline): Likewise.
>>     * cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases):
>>     first walk aliases.
>>     * ipa.c (symbol_table::remove_unreachable_nodes): Use
>>     call_for_symbol_and_aliases.
>>     * ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol.
>>     (ipa_propagate_frequency_1): Use it; use opt_for_fn
>>     (ipa_propagate_frequency): Update.
>>     (ipa_profile): Add opt_for_fn guards.
>> Index: ipa-visibility.c
>> ===================================================================
>> --- ipa-visibility.c    (revision 220741)
>> +++ ipa-visibility.c    (working copy)
>> @@ -595,7 +595,8 @@ function_and_variable_visibility (bool w
>>       }
>>     FOR_EACH_DEFINED_FUNCTION (node)
>>       {
>> -      node->local.local |= node->local_p ();
>> +      if (!node->local.local)
>> +        node->local.local |= node->local_p ();
>>
>>         /* If we know that function can not be overwritten by a different semantics
>>        and moreover its section can not be discarded, replace all direct calls
>> Index: ipa-inline.c
>> ===================================================================
>> --- ipa-inline.c    (revision 220741)
>> +++ ipa-inline.c    (working copy)
>> @@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s
>>     if (node->global.inlined_to)
>>       return false;
>>     /* Does it have callers?  */
>> -  if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
>> +  if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
>>       return false;
>>     /* Inlining into all callers would increase size?  */
>>     if (estimate_growth (node) > 0)
>>       return false;
>>     /* All inlines must be possible.  */
>> -  if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
>> -                        true))
>> +  if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
>> +                     true))
>>       return false;
>>     if (!cold && !has_hot_call)
>>       return false;
>> @@ -2359,9 +2359,9 @@ ipa_inline (void)
>>         if (want_inline_function_to_all_callers_p (node, cold))
>>           {
>>             int num_calls = 0;
>> -          node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
>> -                              true);
>> -          while (node->call_for_symbol_thunks_and_aliases
>> +          node->call_for_symbol_and_aliases (sum_callers, &num_calls,
>> +                         true);
>> +          while (node->call_for_symbol_and_aliases
>>                  (inline_to_all_callers, &num_calls, true))
>>           ;
>>             remove_functions = true;
>> Index: cgraph.c
>> ===================================================================
>> --- cgraph.c    (revision 220741)
>> +++ cgraph.c    (working copy)
>> @@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_
>>
>>     if (callback (this, data))
>>       return true;
>> +  FOR_EACH_ALIAS (this, ref)
>> +    {
>> +      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
>> +      if (include_overwritable
>> +      || alias->get_availability () > AVAIL_INTERPOSABLE)
>> +    if (alias->call_for_symbol_thunks_and_aliases (callback, data,
>> +                             include_overwritable,
>> +                             exclude_virtual_thunks))
>> +      return true;
>> +    }
>>     for (e = callers; e; e = e->next_caller)
>>       if (e->caller->thunk.thunk_p
>>       && (include_overwritable
>> @@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_
>>                                  exclude_virtual_thunks))
>>       return true;
>>
>> -  FOR_EACH_ALIAS (this, ref)
>> -    {
>> -      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
>> -      if (include_overwritable
>> -      || alias->get_availability () > AVAIL_INTERPOSABLE)
>> -    if (alias->call_for_symbol_thunks_and_aliases (callback, data,
>> -                             include_overwritable,
>> -                             exclude_virtual_thunks))
>> -      return true;
>> -    }
>>     return false;
>>   }
>>
>> Index: ipa.c
>> ===================================================================
>> --- ipa.c    (revision 220741)
>> +++ ipa.c    (working copy)
>> @@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes (
>>       if (node->address_taken
>>       && !node->used_from_other_partition)
>>         {
>> -    if (!node->call_for_symbol_thunks_and_aliases
>> +    if (!node->call_for_symbol_and_aliases
>>           (has_addr_references_p, NULL, true)
>>           && (!node->instrumentation_clone
>>           || !node->instrumented_version
>> Index: ipa-profile.c
>> ===================================================================
>> --- ipa-profile.c    (revision 220741)
>> +++ ipa-profile.c    (working copy)
>> @@ -322,6 +322,7 @@ ipa_profile_read_summary (void)
>>
>>   struct ipa_propagate_frequency_data
>>   {
>> +  cgraph_node *function_symbol;
>>     bool maybe_unlikely_executed;
>>     bool maybe_executed_once;
>>     bool only_called_at_startup;
>> @@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph
>>               || d->only_called_at_startup || d->only_called_at_exit);
>>          edge = edge->next_caller)
>>       {
>> -      if (edge->caller != node)
>> +      if (edge->caller != d->function_symbol)
>>       {
>>             d->only_called_at_startup &= edge->caller->only_called_at_startup;
>>         /* It makes sense to put main() together with the static constructors.
>> @@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph
>>        errors can make us to push function into unlikely section even when
>>        it is executed by the train run.  Transfer the function only if all
>>        callers are unlikely executed.  */
>> -      if (profile_info && flag_branch_probabilities
>> +      if (profile_info
>> +      && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
>> +      /* Thunks are not profiled.  This is more or less implementation
>> +         bug.  */
>> +      && !d->function_symbol->thunk.thunk_p
>>         && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
>>             || (edge->caller->global.inlined_to
>>             && edge->caller->global.inlined_to->frequency
>> @@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node
>>   bool
>>   ipa_propagate_frequency (struct cgraph_node *node)
>>   {
>> -  struct ipa_propagate_frequency_data d = {true, true, true, true};
>> +  struct ipa_propagate_frequency_data d = {node, true, true, true, true};
>>     bool changed = false;
>>
>>     /* We can not propagate anything useful about externally visible functions
>> @@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n
>>     if (dump_file && (dump_flags & TDF_DETAILS))
>>       fprintf (dump_file, "Processing frequency %s\n", node->name ());
>>
>> -  node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
>> -                        true);
>> +  node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
>> +                     true);
>>
>>     if ((d.only_called_at_startup && !d.only_called_at_exit)
>>         && !node->only_called_at_startup)
>> @@ -597,6 +602,9 @@ ipa_profile (void)
>>       {
>>         bool update = false;
>>
>> +      if (!opt_for_fn (n->decl, flag_ipa_profile))
>> +    continue;
>> +
>>         for (e = n->indirect_calls; e; e = e->next_callee)
>>       {
>>         if (n->count)
>> @@ -697,7 +705,9 @@ ipa_profile (void)
>>     order_pos = ipa_reverse_postorder (order);
>>     for (i = order_pos - 1; i >= 0; i--)
>>       {
>> -      if (order[i]->local.local && ipa_propagate_frequency (order[i]))
>> +      if (order[i]->local.local
>> +      && opt_for_fn (order[i]->decl, flag_ipa_profile)
>> +      && ipa_propagate_frequency (order[i]))
>>       {
>>         for (e = order[i]->callees; e; e = e->next_callee)
>>           if (e->callee->local.local && !e->callee->aux)
>> @@ -714,7 +724,9 @@ ipa_profile (void)
>>         something_changed = false;
>>         for (i = order_pos - 1; i >= 0; i--)
>>       {
>> -      if (order[i]->aux && ipa_propagate_frequency (order[i]))
>> +      if (order[i]->aux
>> +          && opt_for_fn (order[i]->decl, flag_ipa_profile)
>> +          && ipa_propagate_frequency (order[i]))
>>           {
>>             for (e = order[i]->callees; e; e = e->next_callee)
>>           if (e->callee->local.local && !e->callee->aux)
>>
>
> Hi.
>
> Here are the perf report and the -ftime-report output of the WPA phase.
>
> Martin

Hm, using the same compiler, the Firefox LTO time statistics and perf report are very different.
I'm wondering how that can be possible?

Martin
-------------- next part --------------
Execution times (seconds)
 phase setup             :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall    1988 kB ( 0%) ggc
 phase opt and generate  :  42.32 (70%) usr   0.85 (56%) sys  43.16 (69%) wall 1387464 kB (28%) ggc
 phase stream in         :  18.50 (30%) usr   0.68 (44%) sys  19.17 (31%) wall 3528077 kB (72%) ggc
 garbage collection      :   2.24 ( 4%) usr   0.00 ( 0%) sys   2.24 ( 4%) wall       0 kB ( 0%) ggc
 callgraph optimization  :   0.37 ( 1%) usr   0.00 ( 0%) sys   0.37 ( 1%) wall      38 kB ( 0%) ggc
 ipa dead code removal   :   3.06 ( 5%) usr   0.01 ( 1%) sys   2.88 ( 5%) wall       0 kB ( 0%) ggc
 ipa virtual call target :   5.72 ( 9%) usr   0.06 ( 4%) sys   5.87 ( 9%) wall       0 kB ( 0%) ggc
 ipa devirtualization    :   0.18 ( 0%) usr   0.00 ( 0%) sys   0.23 ( 0%) wall   22382 kB ( 0%) ggc
 ipa cp                  :   2.88 ( 5%) usr   0.09 ( 6%) sys   2.97 ( 5%) wall  515623 kB (10%) ggc
 ipa inlining heuristics :  13.96 (23%) usr   0.13 ( 8%) sys  14.12 (23%) wall  471848 kB (10%) ggc
 ipa comdats             :   0.12 ( 0%) usr   0.00 ( 0%) sys   0.12 ( 0%) wall       0 kB ( 0%) ggc
 ipa lto gimple in       :   2.54 ( 4%) usr   0.48 (31%) sys   3.23 ( 5%) wall  645652 kB (13%) ggc
 ipa lto decl in         :  12.64 (21%) usr   0.37 (24%) sys  13.01 (21%) wall 2592737 kB (53%) ggc
 ipa lto constructors in :   0.17 ( 0%) usr   0.01 ( 1%) sys   0.20 ( 0%) wall   16493 kB ( 0%) ggc
 ipa lto cgraph I/O      :   0.58 ( 1%) usr   0.09 ( 6%) sys   0.67 ( 1%) wall  437504 kB ( 9%) ggc
 ipa lto decl merge      :   1.90 ( 3%) usr   0.00 ( 0%) sys   1.90 ( 3%) wall    8191 kB ( 0%) ggc
 ipa lto cgraph merge    :   1.30 ( 2%) usr   0.00 ( 0%) sys   1.29 ( 2%) wall   14989 kB ( 0%) ggc
 whopr wpa               :   0.91 ( 1%) usr   0.00 ( 0%) sys   0.88 ( 1%) wall       2 kB ( 0%) ggc
 whopr partitioning      :   2.66 ( 4%) usr   0.00 ( 0%) sys   2.67 ( 4%) wall    6081 kB ( 0%) ggc
 ipa reference           :   1.38 ( 2%) usr   0.01 ( 1%) sys   1.40 ( 2%) wall       0 kB ( 0%) ggc
 ipa profile             :   0.21 ( 0%) usr   0.01 ( 1%) sys   0.21 ( 0%) wall       0 kB ( 0%) ggc
 ipa pure const          :   1.61 ( 3%) usr   0.01 ( 1%) sys   1.61 ( 3%) wall       0 kB ( 0%) ggc
 ipa icf                 :   4.99 ( 8%) usr   0.06 ( 4%) sys   5.00 ( 8%) wall    1120 kB ( 0%) ggc
 tree SSA rewrite        :   0.12 ( 0%) usr   0.02 ( 1%) sys   0.12 ( 0%) wall   23170 kB ( 0%) ggc
 tree SSA incremental    :   0.23 ( 0%) usr   0.05 ( 3%) sys   0.21 ( 0%) wall   14434 kB ( 0%) ggc
 tree operand scan       :   0.14 ( 0%) usr   0.03 ( 2%) sys   0.22 ( 0%) wall  145252 kB ( 3%) ggc
 dominance frontiers     :   0.04 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall       0 kB ( 0%) ggc
 dominance computation   :   0.14 ( 0%) usr   0.05 ( 3%) sys   0.11 ( 0%) wall       0 kB ( 0%) ggc
 varconst                :   0.01 ( 0%) usr   0.02 ( 1%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 loop fini               :   0.07 ( 0%) usr   0.00 ( 0%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 unaccounted todo        :   0.62 ( 1%) usr   0.00 ( 0%) sys   0.65 ( 1%) wall       0 kB ( 0%) ggc
 TOTAL                 :  60.82             1.53            62.34            4917531 kB
[ perf record: Woken up 59 times to write data ]
[ perf record: Captured and wrote 14.722 MB perf.data (~643202 samples) ]
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/home/marxin/Programming/bin/gcc2/lib/gcc/x86_64-unknown-linux-gnu/5.0.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-libsanitizer --prefix=/home/marxin/Programming/bin/gcc2 --disable-bootstrap --enable-checking=release
Thread model: posix
gcc version 5.0.0 20150218 (experimental) (GCC) 
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report --stdio | sed 's/\ *$//' | head -n50 
# To display the perf.data header info, please use --header/--header-only options.
#
# Samples: 245K of event 'cycles'
# Event count (approx.): 216467422123
#
# Overhead   Command      Shared Object
# ........  ........  .................  ..................................................................................................................................................................................................................................................................................................
#
     4.97%  lto1-wpa  lto1               [.] inflate_fast
     2.78%  lto1-wpa  lto1               [.] symbol_table::remove_unreachable_nodes(_IO_FILE*)
     2.37%  lto1-wpa  libc-2.19.so       [.] _int_malloc
     1.77%  lto1-wpa  lto1               [.] record_target_from_binfo(vec<cgraph_node*, va_heap, vl_ptr>&, vec<tree_node*, va_heap, vl_ptr>*, tree_node*, tree_node*, vec<tree_node*, va_heap, vl_ptr>&, long, tree_node*, long, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, bool, bool*)
     1.57%  lto1-wpa  lto1               [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option)
     1.56%  lto1-wpa  lto1               [.] streamer_read_uhwi(lto_input_block*)
     1.48%  lto1-wpa  lto1               [.] estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone .isra.129]
     1.48%  lto1-wpa  lto1               [.] unify_scc(streamer_tree_cache_d*, unsigned int, unsigned int, unsigned int, unsigned int)
     1.40%  lto1-wpa  lto1               [.] lto_cgraph_replace_node(cgraph_node*, cgraph_node*)
     1.38%  lto1-wpa  lto1               [.] ggc_set_mark(void const*)
     1.30%  lto1-wpa  libc-2.19.so       [.] malloc_consolidate
     1.28%  lto1-wpa  lto1               [.] htab_hash_string
     1.25%  lto1-wpa  lto1               [.] compare_tree_sccs_1(tree_node*, tree_node*, tree_node***)
     1.23%  lto1-wpa  lto1               [.] fibonacci_heap<sreal, cgraph_edge>::consolidate()
     1.19%  lto1-wpa  lto1               [.] splay_tree_splay
     1.15%  lto1-wpa  lto1               [.] can_inline_edge_p(cgraph_edge*, bool, bool)
     1.14%  lto1-wpa  lto1               [.] cgraph_node::get_availability()
     1.14%  lto1-wpa  lto1               [.] evaluate_properties_for_edge(cgraph_edge*, bool, unsigned int*, vec<tree_node*, va_heap, vl_ptr>*, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>*, vec<ipa_agg_jump_function*, va_heap, vl_ptr>*) [clone .constprop.131]
     1.13%  lto1-wpa  lto1               [.] gimple_get_virt_method_for_vtable(long, tree_node*, unsigned long, bool*)
     1.10%  lto1-wpa  lto1               [.] types_same_for_odr(tree_node const*, tree_node const*)
     1.08%  lto1-wpa  lto1               [.] gt_ggc_mx_lang_tree_node(void*)
     1.05%  lto1-wpa  lto1               [.] streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*)
     0.99%  lto1-wpa  lto1               [.] type_in_anonymous_namespace_p(tree_node const*)
     0.99%  lto1-wpa  lto1               [.] gimple_has_body_p(tree_node*)
     0.95%  lto1-wpa  lto1               [.] decl_assembler_name(tree_node*)
     0.93%  lto1-wpa  lto1               [.] do_per_function(void (*)(function*, void*), void*)
     0.82%  lto1-wpa  libc-2.19.so       [.] _int_free
     0.81%  lto1-wpa  lto1               [.] possible_polymorphic_call_targets_1(vec<cgraph_node*, va_heap, vl_ptr>&, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, tree_node*, odr_type_d*, long, tree_node*, long, bool*, vec<tree_node*, va_heap, vl_ptr>&, bool)
     0.81%  lto1-wpa  lto1               [.] searchc(searchc_env*, cgraph_node*, bool (*)(cgraph_edge*))
     0.80%  lto1-wpa  lto1               [.] streamer_get_pickled_tree(lto_input_block*, data_in*)
     0.78%  lto1-wpa  lto1               [.] edge_badness(cgraph_edge*, bool)
     0.77%  lto1-wpa  lto1               [.] hash_table<asmname_hasher, xcallocator, true>::find_slot_with_hash(tree_node const* const&, unsigned int, insert_option)
     0.77%  lto1-wpa  lto1               [.] update_callee_keys(fibonacci_heap<sreal, cgraph_edge>*, cgraph_node*, bitmap_head*)
     0.76%  lto1-wpa  lto1               [.] ggc_internal_alloc(unsigned long, void (*)(void*), unsigned long, unsigned long)
     0.75%  lto1-wpa  lto1               [.] fibonacci_heap<sreal, cgraph_edge>::extract_minimum_node()
     0.75%  lto1-wpa  lto1               [.] execute_one_pass(opt_pass*)
     0.74%  lto1-wpa  lto1               [.] inflate
     0.71%  lto1-wpa  lto1               [.] contains_polymorphic_type_p(tree_node const*)
     0.67%  lto1-wpa  lto1               [.] get_binfo_at_offset(tree_node*, long, tree_node*)
     0.64%  lto1-wpa  lto1               [.] symbol_table::decl_assembler_name_equal(tree_node*, tree_node const*)
     0.61%  lto1-wpa  lto1               [.] lto_balanced_map(int)
     0.61%  lto1-wpa  lto1               [.] ipa_icf::sem_item_optimizer::do_congruence_step_for_index(ipa_icf::congruence_class*, unsigned int)



More information about the Gcc-patches mailing list