Add parameter to limit LTO streaming parallelism

Jan Hubicka hubicka@ucw.cz
Thu Apr 11 12:11:00 GMT 2019


Hi,
the LTO streaming forks for every partition. With the number of
partitions increased to 128 and the relatively large memory usage (around
5GB) needed to WPA Firefox, this causes the kernel to spend a lot of time,
probably copying the page tables.

This patch makes the streamer fork only lto_parallelism times
and stream num_partitions/lto_parallelism partitions in each thread.
I have also added a parameter because currently -flto=jobserver leads
to unlimited parallelism.  This should be fixed by connecting to Make's
jobserver and building our own mini jobserver to distribute partitions
between worker threads, but this seems a bit too involved for a last-minute
change in stage4.  I plan to work on this and hopefully backport it to the .2
release.

I have tested the performance on my 32-CPU, 64-thread box and got the best
wall time with 32 partitions streamed in parallel, so I set that as the default.  I get

--param max-lto-streaming-parallelism=1
Time variable                                   usr           sys          wall               GGC
 phase stream out                   :  50.65 ( 30%)  20.66 ( 61%)  71.38 ( 35%)     921 kB (  0%)
 TOTAL                              : 170.73         33.69        204.64        7459610 kB

--param max-lto-streaming-parallelism=4
 phase stream out                   :  13.79 ( 11%)   6.80 ( 35%)  20.94 ( 14%)     155 kB (  0%)
 TOTAL                              : 130.26         19.68        150.46        7458844 kB

--param max-lto-streaming-parallelism=8
 phase stream out                   :   8.94 (  7%)   5.21 ( 29%)  14.15 ( 10%)      83 kB (  0%)
 TOTAL                              : 125.28         18.09        143.54        7458773 kB

--param max-lto-streaming-parallelism=16
 phase stream out                   :   4.56 (  4%)   4.34 ( 25%)   9.46 (  7%)      35 kB (  0%)
 TOTAL                              : 122.60         17.21        140.56        7458725 kB

--param max-lto-streaming-parallelism=32
 phase stream out                   :   2.34 (  2%)   5.69 ( 31%)   8.03 (  6%)      15 kB (  0%)
 TOTAL                              : 118.53         18.36        137.08        7458705 kB

--param max-lto-streaming-parallelism=64
 phase stream out                   :   1.63 (  1%)  15.76 ( 55%)  17.40 ( 12%)      13 kB (  0%)
 TOTAL                              : 122.17         28.66        151.00        7458702 kB

--param max-lto-streaming-parallelism=256
 phase stream out                   :   1.28 (  1%)   9.24 ( 41%)  10.53 (  8%)      13 kB (  0%)
 TOTAL                              : 116.78         22.56        139.53        7458702 kB

Note that it is a bit odd that 64 leads to worse results than full
parallelism, but it seems to reproduce relatively well. Also the usr/sys
times for streaming are not representative since they do not account for the
sys time of the forked threads. I am not sure where the fork time is
accounted.

Generally it seems that the forking performance is not at all that
bad and scales reasonably, but I still think we should limit the default to
something less than the 128 we do now. Definitely there are diminishing
returns after increasing from 16 or 32 and memory use goes up
noticeably. With current trunk memory use also does not seem terribly
bad (less global stream streaming makes the workers cheaper) and in all
memory traces I collected it is dominated by compilation stage during
the full rebuild.

I did similar tests for the cc1 binary. There the relative time spent in
streaming is lower, so it goes from 17% to 1% (for parallelism 1 and 32,
respectively).

Bootstrapped/regtested x86_64-linux, OK?

	* params.def (PARAM_MAX_LTO_STREAMING_PARALLELISM): New parameter.
	* lto.c (do_stream_out): Rename to ...
	(stream_out): ... this one; move original code to ...
	(stream_out_partitions_1, stream_out_partitions): ... these new
	functions.
	(lto_wpa_write_files): Honor lto_parallelism.
Index: params.def
===================================================================
--- params.def	(revision 270143)
+++ params.def	(working copy)
@@ -1146,6 +1146,11 @@ DEFPARAM (MAX_PARTITION_SIZE,
 	  "Maximal size of a partition for LTO (in estimated instructions).",
 	  1000000, 0, INT_MAX)
 
+DEFPARAM (PARAM_MAX_LTO_STREAMING_PARALLELISM,
+	  "max-lto-streaming-parallelism",
+	  "maximal number of LTO partitions streamed in parallel.",
+	  32, 1, 0)
+
 /* Diagnostic parameters.  */
 
 DEFPARAM (CXX_MAX_NAMESPACES_FOR_DIAGNOSTIC_HELP,
Index: lto/lto.c
===================================================================
--- lto/lto.c	(revision 270143)
+++ lto/lto.c	(working copy)
@@ -2304,7 +2304,7 @@ static lto_file *current_lto_file;
 /* Actually stream out ENCODER into TEMP_FILENAME.  */
 
 static void
-do_stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part)
+stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part)
 {
   lto_file *file = lto_obj_file_open (temp_filename, true);
   if (!file)
@@ -2352,19 +2352,31 @@ wait_for_child ()
 }
 #endif
 
+static void
+stream_out_partitions_1 (char *temp_filename, int blen, int min, int max)
+{
+   /* Write all the nodes in SET.  */
+   for (int p = min; p < max; p ++)
+     {
+       sprintf (temp_filename + blen, "%u.o", p);
+       stream_out (temp_filename, ltrans_partitions[p]->encoder, p);
+       ltrans_partitions[p]->encoder = NULL;
+     }
+}
+
 /* Stream out ENCODER into TEMP_FILENAME
    Fork if that seems to help.  */
 
 static void
-stream_out (char *temp_filename, lto_symtab_encoder_t encoder,
-	    bool ARG_UNUSED (last), int part)
+stream_out_partitions (char *temp_filename, int blen, int min, int max,
+		       bool ARG_UNUSED (last))
 {
 #ifdef HAVE_WORKING_FORK
   static int nruns;
 
   if (lto_parallelism <= 1)
     {
-      do_stream_out (temp_filename, encoder, part);
+      stream_out_partitions_1 (temp_filename, blen, min, max);
       return;
     }
 
@@ -2384,12 +2396,12 @@ stream_out (char *temp_filename, lto_sym
       if (!cpid)
 	{
 	  setproctitle ("lto1-wpa-streaming");
-	  do_stream_out (temp_filename, encoder, part);
+          stream_out_partitions_1 (temp_filename, blen, min, max);
 	  exit (0);
 	}
       /* Fork failed; lets do the job ourseleves.  */
       else if (cpid == -1)
-        do_stream_out (temp_filename, encoder, part);
+        stream_out_partitions_1 (temp_filename, blen, min, max);
       else
 	nruns++;
     }
@@ -2397,13 +2409,13 @@ stream_out (char *temp_filename, lto_sym
   else
     {
       int i;
-      do_stream_out (temp_filename, encoder, part);
+      stream_out_partitions_1 (temp_filename, blen, min, max);
       for (i = 0; i < nruns; i++)
 	wait_for_child ();
     }
   asm_nodes_output = true;
 #else
-  do_stream_out (temp_filename, encoder, part);
+  stream_out_partitions_1 (temp_filename, blen, min, max);
 #endif
 }
 
@@ -2445,6 +2457,13 @@ lto_wpa_write_files (void)
   blen = strlen (temp_filename);
 
   n_sets = ltrans_partitions.length ();
+  unsigned sets_per_worker = n_sets;
+  if (lto_parallelism > 1)
+    {
+      if (lto_parallelism > (int)n_sets)
+	lto_parallelism = n_sets;
+      sets_per_worker = (n_sets + lto_parallelism - 1) / lto_parallelism;
+    }
 
   for (i = 0; i < n_sets; i++)
     {
@@ -2493,13 +2512,17 @@ lto_wpa_write_files (void)
 	}
       gcc_checking_assert (lto_symtab_encoder_size (part->encoder) || !i);
 
-      stream_out (temp_filename, part->encoder, i == n_sets - 1, i);
-
-      part->encoder = NULL;
-
       temp_priority.safe_push (part->insns);
       temp_filenames.safe_push (xstrdup (temp_filename));
     }
+
+  for (int set = 0; set < MAX (lto_parallelism, 1); set++)
+    {
+      stream_out_partitions (temp_filename, blen, set * sets_per_worker,
+			     MIN ((set + 1) * sets_per_worker, n_sets),
+			     set == MAX (lto_parallelism, 1) - 1);
+    }
+
   ltrans_output_list_stream = fopen (ltrans_output_list, "w");
   if (ltrans_output_list_stream == NULL)
     fatal_error (input_location,
@@ -3113,14 +3136,16 @@ do_whole_program_analysis (void)
 
   lto_parallelism = 1;
 
-  /* TODO: jobserver communicatoin is not supported, yet.  */
+  /* TODO: jobserver communication is not supported, yet.  */
   if (!strcmp (flag_wpa, "jobserver"))
-    lto_parallelism = -1;
+    lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM);
   else
     {
       lto_parallelism = atoi (flag_wpa);
       if (lto_parallelism <= 0)
 	lto_parallelism = 0;
+      if (lto_parallelism >= PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM))
+	lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM);
     }
 
   timevar_start (TV_PHASE_OPT_GEN);



More information about the Gcc-patches mailing list