Re: Optimize streaming in of non-prevailing ipa-parm summaries
- From: Richard Biener <richard dot guenther at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org, Jan Hubicka <hubicka at ucw dot cz>
- Date: Sun, 16 Dec 2018 08:21:46 +0100
- Subject: Re: Optimize streaming in of non-prevailing ipa-parm summaries
- References: <20181215211743.b4kzkqfegennl7lz@kam.mff.cuni.cz>
On December 15, 2018 10:17:43 PM GMT+01:00, Jan Hubicka <hubicka@ucw.cz> wrote:
>Hi,
>this patch is motivated by the finding that Firefox IPA summary
>stream-in is fully dominated by streaming edge summaries that are not
>necessary, because they lead to external calls or come from comdat
>functions that are unused.
>
>Reading them, populating hashes and later removing them once streaming
>is finished increases memory use by about 1GB. This patch simply adds
>logic to notice such unnecessary summaries early and not put them into
>memory. This includes special handling of ADDR_EXPR, which seems to be
>the most common constant value and is easy to rebuild.
Don't you want to go the full way and decompose it into sym + offset?
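For illustration only - a rough, untested sketch of what such a
decomposition could look like on the writer side, assuming
get_addr_base_and_unit_offset is applicable to the ADDR_EXPR operand at
this point (the symbol/offset pair would then replace streaming the
whole ADDR_EXPR tree):

  poly_int64 poffset;
  HOST_WIDE_INT offset;
  tree obj = TREE_OPERAND (jump_func->value.constant.value, 0);
  tree base = get_addr_base_and_unit_offset (obj, &poffset);
  if (base && DECL_P (base) && poffset.is_constant (&offset))
    {
      /* Stream the symbol and the byte offset separately instead
         of the whole ADDR_EXPR tree.  */
      stream_write_tree (ob, base, true);
      streamer_write_hwi (ob, offset);
    }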
>Our streams are not organized in a way that would make it easy to skip
>something, so I opted to simply "fast forward": the dead data is still
>read, but no data structures are produced. We may think of a more
>systematic solution next stage1, but there are only two places where
>this matters - ipa-prop, and ipa-fnsummary, which I will treat next.
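To make the "fast forward" concrete: the reader still performs every
streamer read so the stream position stays in sync, but when the node
does not prevail the results go into throwaway locals instead of
GC-allocated summaries. Schematically, mirroring the non-prevailing
path of the new ipa_read_edge_info below:

  /* All reads are still performed; only their effect on the stream
     position matters, the decoded data is discarded.  */
  for (int k = 0; k < count; k++)
    {
      struct ipa_jump_func dummy;
      ipa_read_jump_function (ib, &dummy, e, data_in, /*prevails=*/false);
    }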
>
>Bootstrapped/regtested x86_64-linux, will commit it shortly.
>
>Honza
>
> * cgraph.h (cgraph_node): Add predicate prevailing_p.
> (cgraph_edge): Add predicate possibly_call_in_translation_unit_p.
> * ipa-prop.c (ipa_write_jump_function): Optimize streaming of ADDR_EXPR.
> (ipa_read_jump_function): Add prevails parameter; optimize streaming.
> (ipa_read_edge_info): Break out from ...
> (ipa_read_node_info): ... here; optimize streaming.
> * cgraph.c (cgraph_edge::possibly_call_in_translation_unit_p): New
> predicate.
>
>Index: cgraph.h
>===================================================================
>--- cgraph.h (revision 267122)
>+++ cgraph.h (working copy)
>@@ -308,6 +308,10 @@ public:
> /* Return availability of NODE when referenced from REF. */
> enum availability get_availability (symtab_node *ref = NULL);
>
>+ /* During LTO stream-in this predicate can be used to check whether node
>+ in question prevails in the linking to save some memory usage. */
>+ bool prevailing_p (void);
>+
> /* Return true if NODE binds to current definition in final executable
> when referenced from REF. If REF is NULL return conservative value
> for any reference. */
>@@ -1730,6 +1734,10 @@ struct GTY((chain_next ("%h.next_caller"
> after passes that don't update the cgraph. */
> static void rebuild_references (void);
>
>+ /* During LTO stream in this can be used to check whether call can possibly
>+ be internal to the current translation unit. */
>+ bool possibly_call_in_translation_unit_p (void);
>+
> /* Expected number of executions: calculated in profile.c. */
> profile_count count;
> cgraph_node *caller;
>@@ -3357,6 +3365,15 @@ xstrdup_for_dump (const char *transient_
> return ggc_strdup (transient_str);
> }
>
>+/* During LTO stream-in this predicate can be used to check whether node
>+ in question prevails in the linking to save some memory usage. */
>+inline bool
>+symtab_node::prevailing_p (void)
>+{
>+ return definition && ((!TREE_PUBLIC (decl) && !DECL_EXTERNAL (decl))
>+ || previous_sharing_asm_name == NULL);
>+}
>+
> extern GTY(()) symbol_table *saved_symtab;
>
> #if CHECKING_P
>Index: ipa-prop.c
>===================================================================
>--- ipa-prop.c (revision 267122)
>+++ ipa-prop.c (working copy)
>@@ -4053,8 +4053,15 @@ ipa_write_jump_function (struct output_b
> struct ipa_agg_jf_item *item;
> struct bitpack_d bp;
> int i, count;
>+ int flag = 0;
>
>- streamer_write_uhwi (ob, jump_func->type);
>+ /* ADDR_EXPRs are very common IP invariants; save some streamer data
>+ as well as WPA memory by handling them specially. */
>+ if (jump_func->type == IPA_JF_CONST
>+ && TREE_CODE (jump_func->value.constant.value) == ADDR_EXPR)
>+ flag = 1;
>+
>+ streamer_write_uhwi (ob, jump_func->type * 2 + flag);
> switch (jump_func->type)
> {
> case IPA_JF_UNKNOWN:
>@@ -4062,7 +4069,10 @@ ipa_write_jump_function (struct output_b
> case IPA_JF_CONST:
> gcc_assert (
> EXPR_LOCATION (jump_func->value.constant.value) == UNKNOWN_LOCATION);
>- stream_write_tree (ob, jump_func->value.constant.value, true);
>+ stream_write_tree (ob,
>+ flag
>+ ? TREE_OPERAND (jump_func->value.constant.value, 0)
>+ : jump_func->value.constant.value, true);
> break;
> case IPA_JF_PASS_THROUGH:
> streamer_write_uhwi (ob, jump_func->value.pass_through.operation);
>@@ -4131,20 +4141,28 @@ static void
> ipa_read_jump_function (struct lto_input_block *ib,
> struct ipa_jump_func *jump_func,
> struct cgraph_edge *cs,
>- struct data_in *data_in)
>+ struct data_in *data_in,
>+ bool prevails)
> {
> enum jump_func_type jftype;
> enum tree_code operation;
> int i, count;
>+ int val = streamer_read_uhwi (ib);
>+ bool flag = val & 1;
>
>- jftype = (enum jump_func_type) streamer_read_uhwi (ib);
>+ jftype = (enum jump_func_type) (val / 2);
> switch (jftype)
> {
> case IPA_JF_UNKNOWN:
> ipa_set_jf_unknown (jump_func);
> break;
> case IPA_JF_CONST:
>- ipa_set_jf_constant (jump_func, stream_read_tree (ib, data_in),
>- cs);
>+ {
>+ tree t = stream_read_tree (ib, data_in);
>+ if (flag && prevails)
>+ t = build_fold_addr_expr (t);
>+ ipa_set_jf_constant (jump_func, t, cs);
>+ }
> break;
> case IPA_JF_PASS_THROUGH:
> operation = (enum tree_code) streamer_read_uhwi (ib);
>@@ -4177,10 +4195,13 @@ ipa_read_jump_function (struct lto_input
> ipa_set_ancestor_jf (jump_func, offset, formal_id, agg_preserved);
> break;
> }
>+ default:
>+ fatal_error (UNKNOWN_LOCATION, "invalid jump function in LTO stream");
> }
>
> count = streamer_read_uhwi (ib);
>- vec_alloc (jump_func->agg.items, count);
>+ if (prevails)
>+ vec_alloc (jump_func->agg.items, count);
> if (count)
> {
> struct bitpack_d bp = streamer_read_bitpack (ib);
>@@ -4191,7 +4212,8 @@ ipa_read_jump_function (struct lto_input
> struct ipa_agg_jf_item item;
> item.offset = streamer_read_uhwi (ib);
> item.value = stream_read_tree (ib, data_in);
>- jump_func->agg.items->quick_push (item);
>+ if (prevails)
>+ jump_func->agg.items->quick_push (item);
> }
>
> struct bitpack_d bp = streamer_read_bitpack (ib);
>@@ -4200,7 +4222,8 @@ ipa_read_jump_function (struct lto_input
> {
> widest_int value = streamer_read_widest_int (ib);
> widest_int mask = streamer_read_widest_int (ib);
>- ipa_set_jfunc_bits (jump_func, value, mask);
>+ if (prevails)
>+ ipa_set_jfunc_bits (jump_func, value, mask);
> }
> else
> jump_func->bits = NULL;
>@@ -4213,7 +4236,8 @@ ipa_read_jump_function (struct lto_input
> VR_LAST);
> tree min = stream_read_tree (ib, data_in);
> tree max = stream_read_tree (ib, data_in);
>- ipa_set_jfunc_vr (jump_func, type, min, max);
>+ if (prevails)
>+ ipa_set_jfunc_vr (jump_func, type, min, max);
> }
> else
> jump_func->m_vr = NULL;
>@@ -4345,24 +4369,48 @@ ipa_write_node_info (struct output_block
> }
> }
>
>-/* If jump functions points to node we possibly can propagate into.
>- At this moment symbol table is still not merged, but the prevailing
>- symbol is always first in the list. */
>+/* Stream in edge E from IB. */
>
>-static bool
>-jump_function_useful_p (symtab_node *node)
>+static void
>+ipa_read_edge_info (struct lto_input_block *ib,
>+ struct data_in *data_in,
>+ struct cgraph_edge *e, bool prevails)
> {
>- /* While incremental linking we may end up getting function body later. */
>- if (flag_incremental_link == INCREMENTAL_LINK_LTO)
>- return true;
>- if (!TREE_PUBLIC (node->decl) && !DECL_EXTERNAL (node->decl))
>- return true;
>- for (int n = 10; node->previous_sharing_asm_name && n ; n--)
>- node = node->previous_sharing_asm_name;
>- if (node->previous_sharing_asm_name)
>- node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (node->decl));
>- gcc_assert (TREE_PUBLIC (node->decl));
>- return node->definition;
>+ int count = streamer_read_uhwi (ib);
>+ bool contexts_computed = count & 1;
>+
>+ count /= 2;
>+ if (!count)
>+ return;
>+ if (prevails && e->possibly_call_in_translation_unit_p ())
>+ {
>+ struct ipa_edge_args *args = IPA_EDGE_REF (e);
>+ vec_safe_grow_cleared (args->jump_functions, count);
>+ if (contexts_computed)
>+ vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>+ for (int k = 0; k < count; k++)
>+ {
>+ ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>+ data_in, prevails);
>+ if (contexts_computed)
>+ ipa_get_ith_polymorhic_call_context (args, k)->stream_in
>+ (ib, data_in);
>+ }
>+ }
>+ else
>+ {
>+ for (int k = 0; k < count; k++)
>+ {
>+ struct ipa_jump_func dummy;
>+ ipa_read_jump_function (ib, &dummy, e,
>+ data_in, prevails);
>+ if (contexts_computed)
>+ {
>+ struct ipa_polymorphic_call_context ctx;
>+ ctx.stream_in (ib, data_in);
>+ }
>+ }
>+ }
> }
>
> /* Stream in NODE info from IB. */
>@@ -4371,82 +4419,50 @@ static void
>ipa_read_node_info (struct lto_input_block *ib, struct cgraph_node *node,
> struct data_in *data_in)
> {
>- struct ipa_node_params *info = IPA_NODE_REF (node);
> int k;
> struct cgraph_edge *e;
> struct bitpack_d bp;
>+ bool prevails = node->prevailing_p ();
>+ struct ipa_node_params *info = prevails ? IPA_NODE_REF (node) : NULL;
>
>- ipa_alloc_node_params (node, streamer_read_uhwi (ib));
>-
>- for (k = 0; k < ipa_get_param_count (info); k++)
>- (*info->descriptors)[k].move_cost = streamer_read_uhwi (ib);
>+ int param_count = streamer_read_uhwi (ib);
>+ if (prevails)
>+ {
>+ ipa_alloc_node_params (node, param_count);
>+ for (k = 0; k < param_count; k++)
>+ (*info->descriptors)[k].move_cost = streamer_read_uhwi (ib);
>+ if (ipa_get_param_count (info) != 0)
>+ info->analysis_done = true;
>+ info->node_enqueued = false;
>+ }
>+ else
>+ for (k = 0; k < param_count; k++)
>+ streamer_read_uhwi (ib);
>
> bp = streamer_read_bitpack (ib);
>- if (ipa_get_param_count (info) != 0)
>- info->analysis_done = true;
>- info->node_enqueued = false;
>- for (k = 0; k < ipa_get_param_count (info); k++)
>- ipa_set_param_used (info, k, bp_unpack_value (&bp, 1));
>- for (k = 0; k < ipa_get_param_count (info); k++)
>+ for (k = 0; k < param_count; k++)
> {
>- ipa_set_controlled_uses (info, k, streamer_read_hwi (ib));
>- (*info->descriptors)[k].decl_or_type = stream_read_tree (ib, data_in);
>+ bool used = bp_unpack_value (&bp, 1);
>+
>+ if (prevails)
>+ ipa_set_param_used (info, k, used);
> }
>- for (e = node->callees; e; e = e->next_callee)
>+ for (k = 0; k < param_count; k++)
> {
>- struct ipa_edge_args *args = IPA_EDGE_REF (e);
>- int count = streamer_read_uhwi (ib);
>- bool contexts_computed = count & 1;
>- count /= 2;
>-
>- if (!count)
>- continue;
>- if (!jump_function_useful_p (e->callee))
>- {
>- for (k = 0; k < count; k++)
>- {
>- struct ipa_jump_func dummy;
>- ipa_read_jump_function (ib, &dummy, e, data_in);
>- if (contexts_computed)
>- {
>- struct ipa_polymorphic_call_context ctx;
>- ctx.stream_in (ib, data_in);
>- }
>- }
>- continue;
>- }
>- vec_safe_grow_cleared (args->jump_functions, count);
>- if (contexts_computed)
>- vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>+ int nuses = streamer_read_hwi (ib);
>+ tree type = stream_read_tree (ib, data_in);
>
>- for (k = 0; k < ipa_get_cs_argument_count (args); k++)
>+ if (prevails)
> {
>- ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>- data_in);
>- if (contexts_computed)
>- ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in);
>+ ipa_set_controlled_uses (info, k, nuses);
>+ (*info->descriptors)[k].decl_or_type = type;
> }
> }
>+ for (e = node->callees; e; e = e->next_callee)
>+ ipa_read_edge_info (ib, data_in, e, prevails);
> for (e = node->indirect_calls; e; e = e->next_callee)
> {
>- struct ipa_edge_args *args = IPA_EDGE_REF (e);
>- int count = streamer_read_uhwi (ib);
>- bool contexts_computed = count & 1;
>- count /= 2;
>-
>- if (count)
>- {
>- vec_safe_grow_cleared (args->jump_functions, count);
>- if (contexts_computed)
>- vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>- for (k = 0; k < ipa_get_cs_argument_count (args); k++)
>- {
>- ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>- data_in);
>- if (contexts_computed)
>- ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in);
>- }
>- }
>+ ipa_read_edge_info (ib, data_in, e, prevails);
> ipa_read_indirect_edge_info (ib, data_in, e);
> }
> }
>Index: cgraph.c
>===================================================================
>--- cgraph.c (revision 267122)
>+++ cgraph.c (working copy)
>@@ -3766,6 +3780,41 @@ cgraph_edge::sreal_frequency ()
> : caller->count);
> }
>
>+
>+/* During LTO stream in this can be used to check whether call can possibly
>+ be internal to the current translation unit. */
>+
>+bool
>+cgraph_edge::possibly_call_in_translation_unit_p (void)
>+{
>+ gcc_checking_assert (in_lto_p && caller->prevailing_p ());
>+
>+ /* While incremental linking we may end up getting function body later. */
>+ if (flag_incremental_link == INCREMENTAL_LINK_LTO)
>+ return true;
>+
>+ /* We may be smarter here and avoid streaming in indirect calls we can't
>+ track, but that would require arranging streaming of the indirect call
>+ summary first. */
>+ if (!callee)
>+ return true;
>+
>+ /* If callee is local to the original translation unit, it will be defined. */
>+ if (!TREE_PUBLIC (callee->decl) && !DECL_EXTERNAL (callee->decl))
>+ return true;
>+
>+ /* Otherwise we need to look up the prevailing symbol (the symbol table is
>+ not merged yet) and see if it is a definition. In fact we may also
>+ resolve aliases, but that is probably not too important. */
>+ symtab_node *node = callee;
>+ for (int n = 10; node->previous_sharing_asm_name && n ; n--)
>+ node = node->previous_sharing_asm_name;
>+ if (node->previous_sharing_asm_name)
>+ node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (callee->decl));
>+ gcc_assert (TREE_PUBLIC (node->decl));
>+ return node->get_availability () >= AVAIL_AVAILABLE;
>+}
>+
> /* A stashed copy of "symtab" for use by selftest::symbol_table_test.
> This needs to be a global so that it can be a GC root, and thus
> prevent the stashed copy from being garbage-collected if the GC runs