Reduce memory usage of ipa-cp streaming

Jan Hubicka hubicka@ucw.cz
Tue Dec 4 15:55:00 GMT 2018


Hi,
this patch reduces the memory usage of WPA for the current Firefox LTO+FDO build
from 10GB to 8.5GB by not allocating vectors for jump functions annotating
calls to the libgcov runtime.

Comparing -Q report before and after patch I get:

Time variable                                   usr           sys          wall               GGC
 phase opt and generate             : 190.66 ( 62%)   1.15 (  1%) 194.94 ( 46%) 1038947 kB (  9%)
 phase stream in                    : 100.59 ( 33%)   8.85 ( 10%) 109.87 ( 26%)10838189 kB ( 91%)
 phase stream out                   :   1.44 (  0%)  75.28 ( 87%) 106.10 ( 25%)       1 kB (  0%)
 phase finalize                     :  13.41 (  4%)   1.42 (  2%)  14.95 (  4%)       0 kB (  0%)
 garbage collection                 :  23.40 (  8%)   0.10 (  0%)  23.63 (  6%)       0 kB (  0%)
 callgraph optimization             :  48.66 ( 16%)   0.09 (  0%)  50.02 ( 12%)    1073 kB (  0%)
 ipa function summary               :   3.02 (  1%)   0.82 (  1%)   3.94 (  1%)  387933 kB (  3%)
 ipa dead code removal              :  18.12 (  6%)   0.13 (  0%)  18.37 (  4%)       0 kB (  0%)
 ipa devirtualization               :   0.52 (  0%)   0.00 (  0%)   0.52 (  0%)       0 kB (  0%)
 ipa cp                             :  23.84 (  8%)   2.21 (  3%)  26.15 (  6%) 3645794 kB ( 31%)
 ipa inlining heuristics            :  60.39 ( 20%)   0.38 (  0%)  61.00 ( 14%)  872904 kB (  7%)
 ipa comdats                        :   0.80 (  0%)   0.00 (  0%)   0.80 (  0%)       0 kB (  0%)
 lto stream inflate                 :  14.09 (  5%)   0.83 (  1%)  14.90 (  3%)       0 kB (  0%)
 ipa lto gimple in                  :   0.45 (  0%)   0.09 (  0%)   0.51 (  0%)   42421 kB (  0%)
 ipa lto gimple out                 :   0.25 (  0%)   0.24 (  0%)   0.81 (  0%)       1 kB (  0%)
 ipa lto decl in                    :  36.78 ( 12%)   3.01 (  3%)  39.53 (  9%) 3900047 kB ( 33%)
 ipa lto decl out                   :   0.75 (  0%)   0.09 (  0%)   0.88 (  0%)       0 kB (  0%)
 ipa lto constructors in            :   0.16 (  0%)   0.03 (  0%)   0.14 (  0%)    1820 kB (  0%)
 ipa lto constructors out           :   0.29 (  0%)   0.13 (  0%)   0.66 (  0%)       0 kB (  0%)
 ipa lto cgraph I/O                 :   4.80 (  2%)   1.09 (  1%)   6.09 (  1%) 2739424 kB ( 23%)
 ipa lto decl merge                 :   7.73 (  3%)   0.03 (  0%)   8.18 (  2%)   69395 kB (  1%)
 ipa lto cgraph merge               :   3.91 (  1%)   0.02 (  0%)   3.99 (  1%)  129651 kB (  1%)
 whopr wpa                          :   0.34 (  0%)   0.00 (  0%)   0.33 (  0%)      79 kB (  0%)
 whopr wpa I/O                      :   0.15 (  0%)  74.80 ( 86%) 103.71 ( 24%)       0 kB (  0%)
 whopr partitioning                 :  26.75 (  9%)   0.14 (  0%)  28.04 (  7%)   73797 kB (  1%)
 ipa reference                      :   3.51 (  1%)   0.01 (  0%)   3.64 (  1%)       0 kB (  0%)
 ipa profile                        :   1.86 (  1%)   0.01 (  0%)   1.86 (  0%)       0 kB (  0%)
 ipa pure const                     :   6.89 (  2%)   0.05 (  0%)   7.04 (  2%)       0 kB (  0%)
 ipa icf                            :  16.97 (  6%)   0.50 (  1%)  17.62 (  4%)      49 kB (  0%)
 inline parameters                  :   0.01 (  0%)   0.01 (  0%)   0.03 (  0%)     269 kB (  0%)
 tree PTA                           :   0.02 (  0%)   0.00 (  0%)   0.00 (  0%)       0 kB (  0%)
 tree SSA rewrite                   :   0.02 (  0%)   0.00 (  0%)   0.05 (  0%)    4479 kB (  0%)
 tree SSA other                     :   0.00 (  0%)   0.00 (  0%)   0.01 (  0%)       0 kB (  0%)
 tree SSA incremental               :   0.08 (  0%)   0.01 (  0%)   0.10 (  0%)     572 kB (  0%)
 tree operand scan                  :   0.03 (  0%)   0.00 (  0%)   0.01 (  0%)    6655 kB (  0%)
 dominance frontiers                :   0.01 (  0%)   0.00 (  0%)   0.00 (  0%)       0 kB (  0%)
 dominance computation              :   0.05 (  0%)   0.00 (  0%)   0.02 (  0%)       0 kB (  0%)
 varconst                           :   0.66 (  0%)   0.41 (  0%)   1.00 (  0%)       0 kB (  0%)
 branch prediction                  :   0.00 (  0%)   0.00 (  0%)   0.01 (  0%)       0 kB (  0%)
 TOTAL                              : 306.10         86.70        425.87       11879212 kB

Time variable                                   usr           sys          wall               GGC
 phase setup                        :   0.01 (  0%)   0.00 (  0%)   0.01 (  0%)    2072 kB (  0%)
 phase opt and generate             : 155.97 ( 59%)   1.09 (  2%) 176.62 ( 31%)  756872 kB (  7%)
 phase stream in                    : 105.51 ( 40%)   8.76 ( 15%) 163.22 ( 28%) 9382008 kB ( 93%)
 phase stream out                   :   1.09 (  0%)  48.71 ( 83%) 238.88 ( 41%)       0 kB (  0%)
 garbage collection                 :   9.22 (  4%)   0.00 (  0%)   9.23 (  2%)       0 kB (  0%)
 callgraph construction             :   0.01 (  0%)   0.00 (  0%)   0.02 (  0%)       0 kB (  0%)
 callgraph optimization             :  33.82 ( 13%)   0.05 (  0%)  33.88 (  6%)    1073 kB (  0%)
 ipa function summary               :   3.10 (  1%)   0.94 (  2%)   4.38 (  1%)  387933 kB (  4%)
 ipa dead code removal              :  17.74 (  7%)   0.20 (  0%)  17.90 (  3%)       0 kB (  0%)
 ipa devirtualization               :   0.53 (  0%)   0.00 (  0%)   0.53 (  0%)       0 kB (  0%)
 ipa cp                             :  27.20 ( 10%)   1.50 (  3%)  35.36 (  6%) 2210990 kB ( 22%)
 ipa inlining heuristics            :  50.97 ( 19%)   0.28 (  0%)  51.46 (  9%)  579877 kB (  6%)
 ipa comdats                        :   0.58 (  0%)   0.00 (  0%)   0.59 (  0%)       0 kB (  0%)
 lto stream inflate                 :  14.65 (  6%)   0.93 (  2%)  14.84 (  3%)       0 kB (  0%)
 ipa lto gimple in                  :   0.37 (  0%)   0.05 (  0%)   9.40 (  2%)    1483 kB (  0%)
 ipa lto gimple out                 :   0.23 (  0%)   0.15 (  0%)   6.14 (  1%)       0 kB (  0%)
 ipa lto decl in                    :  38.40 ( 15%)   3.30 (  6%)  68.07 ( 12%) 3900047 kB ( 38%)
 ipa lto decl out                   :   0.54 (  0%)   0.10 (  0%)   0.64 (  0%)       0 kB (  0%)
 ipa lto constructors in            :   0.22 (  0%)   0.05 (  0%)  10.72 (  2%)   34588 kB (  0%)
 ipa lto constructors out           :   0.17 (  0%)   0.20 (  0%)   0.85 (  0%)       0 kB (  0%)
 ipa lto cgraph I/O                 :   5.09 (  2%)   1.24 (  2%)  11.46 (  2%) 2739424 kB ( 27%)
 ipa lto decl merge                 :   8.75 (  3%)   0.01 (  0%)   8.87 (  2%)   69395 kB (  1%)
 ipa lto cgraph merge               :   3.87 (  1%)   0.02 (  0%)   3.90 (  1%)  129463 kB (  1%)
 whopr wpa                          :   0.31 (  0%)   0.00 (  0%)   0.29 (  0%)      83 kB (  0%)
 whopr wpa I/O                      :   0.13 (  0%)  48.25 ( 82%) 231.23 ( 40%)       0 kB (  0%)
 whopr partitioning                 :  18.81 (  7%)   0.08 (  0%)  18.88 (  3%)   73797 kB (  1%)
 ipa reference                      :   2.65 (  1%)   0.00 (  0%)   2.64 (  0%)       0 kB (  0%)
 ipa profile                        :   1.89 (  1%)   0.07 (  0%)  12.71 (  2%)       0 kB (  0%)
 ipa pure const                     :   5.31 (  2%)   0.04 (  0%)   5.37 (  1%)       0 kB (  0%)
 ipa icf                            :  17.01 (  6%)   0.50 (  1%)  17.76 (  3%)      49 kB (  0%)
 alias stmt walking                 :   0.00 (  0%)   0.01 (  0%)   0.00 (  0%)       0 kB (  0%)
 inline parameters                  :   0.02 (  0%)   0.00 (  0%)   0.03 (  0%)     269 kB (  0%)
 tree CFG construction              :   0.00 (  0%)   0.00 (  0%)   0.01 (  0%)       2 kB (  0%)
 tree PTA                           :   0.01 (  0%)   0.00 (  0%)   0.00 (  0%)       0 kB (  0%)
 tree SSA rewrite                   :   0.05 (  0%)   0.01 (  0%)   0.02 (  0%)    3890 kB (  0%)
 tree SSA other                     :   0.01 (  0%)   0.00 (  0%)   0.00 (  0%)       0 kB (  0%)
 tree SSA incremental               :   0.05 (  0%)   0.02 (  0%)   0.04 (  0%)       0 kB (  0%)
 tree operand scan                  :   0.04 (  0%)   0.01 (  0%)   0.04 (  0%)    5745 kB (  0%)
 tree aggressive DCE                :   0.00 (  0%)   0.01 (  0%)   0.01 (  0%)       3 kB (  0%)
 dominance computation              :   0.00 (  0%)   0.00 (  0%)   0.01 (  0%)       0 kB (  0%)
 varconst                           :   0.66 (  0%)   0.35 (  1%)   1.09 (  0%)       0 kB (  0%)
 loop init                          :   0.01 (  0%)   0.00 (  0%)   0.00 (  0%)       2 kB (  0%)
 TOTAL                              : 262.58         58.56        578.73       10140954 kB

Bootstrapped/regtested x86_64-linux, plan to commit it shortly.
Obviously this is not a perfect solution, since linking LTO with libgcov will make
memory usage go back up.  We should work on reducing memory use of the jump functions and
perhaps getting rid of the wrapping ADDR_EXPRs; there are 11 million of them.

 ipa cp                             :  27.20 ( 10%)   1.50 (  3%)  35.36 (  6%) 2210990 kB ( 22%)

ipa-cp still allocates 2GB of data; 450MB of that are dead trees used to represent those jump
functions we do not stream in.  It would be nice to have a way to skip them and not allocate them,
but creating a special section for every call site would be overkill.

I will also look into not allocating VRP and bitfields data.

Bootstrapped/regtested x86_64-linux.

	* ipa-prop.c (jump_function_useful_p): New.
	(ipa_read_node_info): Do not allocate useless jump functions.
Index: ipa-prop.c
===================================================================
--- ipa-prop.c	(revision 266725)
+++ ipa-prop.c	(working copy)
@@ -4344,6 +4344,26 @@ ipa_write_node_info (struct output_block
     }
 }
 
+/* Return true if jump functions for a call to NODE may be useful, i.e. NODE
+   is a node we can possibly propagate into.  At this moment the symbol table
+   is still not merged, but the prevailing symbol is always first in the list.  */
+
+static bool
+jump_function_useful_p (symtab_node *node)
+{
+  /* With incremental linking we may get the function body later.  */
+  if (flag_incremental_link == INCREMENTAL_LINK_LTO)
+    return true;
+  if (!TREE_PUBLIC (node->decl) && !DECL_EXTERNAL (node->decl))
+    return true;
+  for (int n = 10; node->previous_sharing_asm_name && n ; n--)
+    node = node->previous_sharing_asm_name;
+  if (node->previous_sharing_asm_name)
+    node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (node->decl));
+  gcc_assert (TREE_PUBLIC (node->decl));
+  return node->definition;
+}
+
 /* Stream in NODE info from IB.  */
 
 static void
@@ -4380,6 +4400,20 @@ ipa_read_node_info (struct lto_input_blo
 
       if (!count)
 	continue;
+      if (!jump_function_useful_p (e->callee))
+	{
+          for (k = 0; k < count; k++)
+	    {
+	      struct ipa_jump_func dummy;
+	      ipa_read_jump_function (ib, &dummy, e, data_in);
+	      if (contexts_computed)
+		{
+		  struct ipa_polymorphic_call_context ctx;
+		  ctx.stream_in (ib, data_in);
+		}
+	    }
+	  continue;
+	}
       vec_safe_grow_cleared (args->jump_functions, count);
       if (contexts_computed)
 	vec_safe_grow_cleared (args->polymorphic_call_contexts, count);



More information about the Gcc-patches mailing list