This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC] Old school parallelization of WPA streaming


On Thu, 21 Nov 2013, Jan Hubicka wrote:

> Hi,
> I am not sure where we converged concerning the fork trick.  I am using it in my
> tree for months and it does save my waiting time for WPA compilations, so I am
> re-attaching the patch.
> 
> Does it seem reasonable for mainline?
> 
> As for other plans mentioned on this thread
> > > 
> > > I still have some items on list here
> > >  1) avoid function sections to be decompressed by WPA
> > >     (this won't cause much compile time improvements as decompression is
> > >      well below 10% of runtime)
> > 
> > still low-hanging
> > 
> > finally get a LTO section header!  (with a flag telling whether the
> > section is compressed)
> 
> I have patch for it somewhere (not particularly clean, we need to dig more into
> the basic section handling code in LTO). The benefits however was quite small
> (we get dominated by decls and types still), so perhaps this can wait for next
> stage1 or a development branch.
> > 
> > >  2) put variable initializers into named sections just as function bodies
> > >     are.
> > >     Seeing Martin's systemtaps of firefox/gimp/inkscape, to my surprise the
> > >     initializers are actually about as big as the text segment.  While
> > >     it seems a bit wasteful to put a single integer_cst there (and we can
> > >     special case this), it seems that there is a promise for vtables
> > >     and other stuff.
> > > 
> > >     To make devirt work, we will need to load vtables into memory (or
> > >     invent representation to stream them other way that would be similarly
> > >     big). Still we will avoid need to load them in 5000 copies and merge
> > >     them.
> 
> Did not finish this, unfortunately (devirtualization was more involved and
> I lost track on this one).  I had a prototype working where savings was about
> 15% of WPA memory.  I will try to get cleaner implementation soon.
> 
> > >  3) I think good part of function/partitioning overhead is because abstract
> > >     origin streaming is utterly broken.
> 
> Yep, this is definitely still in longer term plans only.

Why do you need an additional -fparallelism?  Wouldn't
-fwpa=... be a better match, matching -flto=...?  As we already
pass down a -fwpa option to WPA this would make things easier, no?

Thanks,
Richard.

> Honza
> 
> 	* lto-cgraph.c (asm_nodes_output): Make global.
> 	* lto-streamer.h (asm_nodes_output): Declare.
> 	* lto-wrapper.c (parallel, jobserver): Make global.
> 	(run_gcc): Pass down -fparallelism
> 
> 	* lto.c (lto_parallelism): New variable.
> 	(do_stream_out): New function.
> 	(stream_out): New function.
> 	(lto_wpa_write_files): Use it.
> 	* lang.opt (fparallelism): New.
> 	* lto.h (lto_parallelism): Declare.
> 	* lto-lang.c (lto_handle_option): Add fparallelism.
> 
> Index: lto-cgraph.c
> ===================================================================
> --- lto-cgraph.c	(revision 201891)
> +++ lto-cgraph.c	(working copy)
> @@ -50,6 +50,9 @@ along with GCC; see the file COPYING3.
>  #include "context.h"
>  #include "pass_manager.h"
>  
> +/* True when asm nodes has been output.  */
> +bool asm_nodes_output = false;
> +
>  static void output_cgraph_opt_summary (void);
>  static void input_cgraph_opt_summary (vec<symtab_node>  nodes);
>  
> @@ -852,7 +855,6 @@ output_symtab (void)
>    lto_symtab_encoder_iterator lsei;
>    int i, n_nodes;
>    lto_symtab_encoder_t encoder;
> -  static bool asm_nodes_output = false;
>  
>    if (flag_wpa)
>      output_cgraph_opt_summary ();
> Index: lto-streamer.h
> ===================================================================
> --- lto-streamer.h	(revision 201891)
> +++ lto-streamer.h	(working copy)
> @@ -870,6 +870,7 @@ void lto_output_location (struct output_
>  
>  
>  /* In lto-cgraph.c  */
> +extern bool asm_nodes_output;
>  lto_symtab_encoder_t lto_symtab_encoder_new (bool);
>  int lto_symtab_encoder_encode (lto_symtab_encoder_t, symtab_node);
>  void lto_symtab_encoder_delete (lto_symtab_encoder_t);
> Index: lto-wrapper.c
> ===================================================================
> --- lto-wrapper.c	(revision 201891)
> +++ lto-wrapper.c	(working copy)
> @@ -56,6 +56,9 @@ along with GCC; see the file COPYING3.
>  
>  int debug;				/* true if -save-temps.  */
>  int verbose;				/* true if -v.  */
> +int parallel = 0;			/* number of parallel builds specified
> +					   by -flto=N  */
> +int jobserver = 0;			/* true if -flto=jobserver was used.  */
>  
>  enum lto_mode_d {
>    LTO_MODE_NONE,			/* Not doing LTO.  */
> @@ -445,8 +448,6 @@ run_gcc (unsigned argc, char *argv[])
>    char *list_option_full = NULL;
>    const char *linker_output = NULL;
>    const char *collect_gcc, *collect_gcc_options;
> -  int parallel = 0;
> -  int jobserver = 0;
>    bool no_partition = false;
>    struct cl_decoded_option *fdecoded_options = NULL;
>    unsigned int fdecoded_options_count = 0;
> @@ -630,6 +631,16 @@ run_gcc (unsigned argc, char *argv[])
>  	      if (parallel <= 1)
>  		parallel = 0;
>  	    }
> +	  if (jobserver)
> +	    {
> +	      obstack_ptr_grow (&argv_obstack, xstrdup ("-fparallelism=jobserver"));
> +	    }
> +	  else if (parallel > 1)
> +	    {
> +	      char buf[256];
> +	      sprintf (buf, "-fparallelism=%i", parallel);
> +	      obstack_ptr_grow (&argv_obstack, xstrdup (buf));
> +	    }
>  	  /* Fallthru.  */
>  
>  	case OPT_flto:
> Index: lto/lto.c
> ===================================================================
> --- lto/lto.c	(revision 201891)
> +++ lto/lto.c	(working copy)
> @@ -49,6 +49,9 @@ along with GCC; see the file COPYING3.
>  #include "context.h"
>  #include "pass_manager.h"
>  
> +/* Number of parallel tasks to run, -1 if we want to use GNU Make jobserver.  */
> +int lto_parallelism;
> +
>  static GTY(()) tree first_personality_decl;
>  
>  /* Returns a hash code for P.  */
> @@ -3002,6 +3005,98 @@ cmp_partitions_order (const void *a, con
>    return orderb - ordera;
>  }
>  
> +/* Actually stream out ENCODER into TEMP_FILENAME.  */
> +
> +void
> +do_stream_out (char *temp_filename, lto_symtab_encoder_t encoder)
> +{
> +  lto_file *file = lto_obj_file_open (temp_filename, true);
> +  if (!file)
> +    fatal_error ("lto_obj_file_open() failed");
> +  lto_set_current_out_file (file);
> +
> +  ipa_write_optimization_summaries (encoder);
> +
> +  lto_set_current_out_file (NULL);
> +  lto_obj_file_close (file);
> +  free (file);
> +}
> +
> +/* Wait for forked process and signal errors.  */
> +#ifdef HAVE_WORKING_FORK
> +void
> +wait_for_child ()
> +{
> +  int status;
> +  do
> +    {
> +      int w = waitpid(0, &status, WUNTRACED | WCONTINUED);
> +      if (w == -1)
> +	fatal_error ("waitpid failed");
> +
> +      if (WIFEXITED (status) && WEXITSTATUS (status))
> +	fatal_error ("streaming subprocess failed");
> +      else if (WIFSIGNALED (status))
> +	fatal_error ("streaming subprocess was killed by signal");
> +    }
> +  while (!WIFEXITED(status) && !WIFSIGNALED(status));
> +}
> +#endif
> +
> +/* Stream out ENCODER into TEMP_FILENAME
> +   Fork if that seems to help.  */
> +
> +void
> +stream_out (char *temp_filename, lto_symtab_encoder_t encoder, bool last)
> +{
> +#ifdef HAVE_WORKING_FORK
> +  static int nruns;
> +
> +  if (!lto_parallelism || lto_parallelism == 1)
> +    {
> +      do_stream_out (temp_filename, encoder);
> +      return;
> +    }
> +
> +  /* Do not run more than LTO_PARALLELISM streamings
> +     FIXME: we ignore limits on jobserver.  */
> +  if (lto_parallelism > 0 && nruns >= lto_parallelism)
> +    {
> +      wait_for_child ();
> +      nruns --;
> +    }
> +  /* If this is not the last parallel partition, execute new
> +     streaming process.  */
> +  if (!last)
> +    {
> +      pid_t cpid = fork ();
> +
> +      if (!cpid)
> +	{
> +	  setproctitle ("lto1-wpa-streaming");
> +	  do_stream_out (temp_filename, encoder);
> +	  exit (0);
> +	}
> +      /* Fork failed; lets do the job ourseleves.  */
> +      else if (cpid == -1)
> +        do_stream_out (temp_filename, encoder);
> +      else
> +	nruns++;
> +    }
> +  /* Last partition; stream it and wait for all children to die.  */
> +  else
> +    {
> +      int i;
> +      do_stream_out (temp_filename, encoder);
> +      for (i = 0; i < nruns; i++)
> +	wait_for_child ();
> +    }
> +  asm_nodes_output = true;
> +#else
> +  do_stream_out (temp_filename, encoder);
> +#endif
> +}
> +
>  /* Write all output files in WPA mode and the file with the list of
>     LTRANS units.  */
>  
> @@ -3009,18 +3104,15 @@ static void
>  lto_wpa_write_files (void)
>  {
>    unsigned i, n_sets;
> -  lto_file *file;
>    ltrans_partition part;
>    FILE *ltrans_output_list_stream;
>    char *temp_filename;
> +  vec <char *>temp_filenames = vNULL;
>    size_t blen;
>  
>    /* Open the LTRANS output list.  */
>    if (!ltrans_output_list)
>      fatal_error ("no LTRANS output list filename provided");
> -  ltrans_output_list_stream = fopen (ltrans_output_list, "w");
> -  if (ltrans_output_list_stream == NULL)
> -    fatal_error ("opening LTRANS output list %s: %m", ltrans_output_list);
>  
>    timevar_push (TV_WHOPR_WPA);
>  
> @@ -3056,14 +3148,10 @@ lto_wpa_write_files (void)
>  			   : cmp_partitions_order);
>    for (i = 0; i < n_sets; i++)
>      {
> -      size_t len;
>        ltrans_partition part = ltrans_partitions[i];
>  
>        /* Write all the nodes in SET.  */
>        sprintf (temp_filename + blen, "%u.o", i);
> -      file = lto_obj_file_open (temp_filename, true);
> -      if (!file)
> -	fatal_error ("lto_obj_file_open() failed");
>  
>        if (!quiet_flag)
>  	fprintf (stderr, " %s (%s %i insns)", temp_filename, part->name, part->insns);
> @@ -3105,21 +3193,25 @@ lto_wpa_write_files (void)
>  	}
>        gcc_checking_assert (lto_symtab_encoder_size (part->encoder) || !i);
>  
> -      lto_set_current_out_file (file);
> -
> -      ipa_write_optimization_summaries (part->encoder);
> +      stream_out (temp_filename, part->encoder, i == n_sets - 1);
>  
> -      lto_set_current_out_file (NULL);
> -      lto_obj_file_close (file);
> -      free (file);
>        part->encoder = NULL;
>  
> -      len = strlen (temp_filename);
> -      if (fwrite (temp_filename, 1, len, ltrans_output_list_stream) < len
> +      temp_filenames.safe_push (xstrdup (temp_filename));
> +    }
> +  ltrans_output_list_stream = fopen (ltrans_output_list, "w");
> +  if (ltrans_output_list_stream == NULL)
> +    fatal_error ("opening LTRANS output list %s: %m", ltrans_output_list);
> +  for (i = 0; i < n_sets; i++)
> +    {
> +      unsigned int len = strlen (temp_filenames[i]);
> +      if (fwrite (temp_filenames[i], 1, len, ltrans_output_list_stream) < len
>  	  || fwrite ("\n", 1, 1, ltrans_output_list_stream) < 1)
>  	fatal_error ("writing to LTRANS output list %s: %m",
>  		     ltrans_output_list);
> +     free (temp_filenames[i]);
>      }
> +  temp_filenames.release();
>  
>    lto_stats.num_output_files += n_sets;
>  
> Index: lto/lang.opt
> ===================================================================
> --- lto/lang.opt	(revision 201891)
> +++ lto/lang.opt	(working copy)
> @@ -32,6 +32,10 @@ fltrans-output-list=
>  LTO Joined Var(ltrans_output_list)
>  Specify a file to which a list of files output by LTRANS is written.
>  
> +fparallelism=
> +LTO Joined
> +Run the link-time optimizer in whole program analysis (WPA) mode.
> +
>  fwpa
>  LTO Driver Report Var(flag_wpa)
>  Run the link-time optimizer in whole program analysis (WPA) mode.
> Index: lto/lto.h
> ===================================================================
> --- lto/lto.h	(revision 201891)
> +++ lto/lto.h	(working copy)
> @@ -39,6 +39,7 @@ extern const char *resolution_file_name;
>  extern tree lto_eh_personality (void);
>  extern void lto_main (void);
>  extern void lto_read_all_file_options (void);
> +extern int lto_parallelism;
>  
>  /* In lto-elf.c or lto-coff.c  */
>  extern lto_file *lto_obj_file_open (const char *filename, bool writable);
> Index: lto/lto-lang.c
> ===================================================================
> --- lto/lto-lang.c	(revision 201891)
> +++ lto/lto-lang.c	(working copy)
> @@ -735,6 +735,19 @@ lto_handle_option (size_t scode, const c
>        warn_psabi = value;
>        break;
>  
> +    case OPT_fparallelism_:
> +      if (!arg)
> +	lto_parallelism = 1;
> +      else if (!strcmp (arg, "jobserver"))
> +	lto_parallelism = -1;
> +      else
> +	{
> +	  lto_parallelism = atoi (arg);
> +	  if (lto_parallelism <= 0)
> +	    lto_parallelism = 0;
> +	}
> +      break;
> +
>      default:
>        break;
>      }
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]