Allow merging of units with profile feedback run different amount of times

Jan Hubicka hubicka@ucw.cz
Sat Dec 4 19:34:00 GMT 2010


Hi,
this patch adds support for merging compilation units with profile feedbacks where
each unit was run different times.  This is neccesary for LTO profiledbootstrap
since we do have single libbackend linked into multiple binaries. As a result
libbackend summaries are trained more times than each of individual binaries.
It would make more sense to instrument at link time and have profile feedback for
each binary separately but this will require more changes and thus can't be done
in stage3.

This patch simply makes the profiles to be scaled up to the largest number of runs
in the whole program.  This allows profiledbootstrap to pass on x86-64 for C only
compiler.   Resulting compiler seems smaller and faster than one built with LTO
only, so it seems to work as expected.

Merging is done by reading in profile summaries per file basis and storing them in
lto_file_decl_data.  At the end the maximal number of runs is found and
profiles are re-scaled to work out global sum_max, in merge_profile_summaries.
We also need to update callgraph counts that is done in merge_profile_summaries
and BB/edge counts that is done in lto-streamer based on
count_materialization_scale stored in cgraph nodes.

I will look into failures appearing in C++ and Ada build (profile mismatches).

Bootstrapped/regtested x86_64-linux, will commit it shortly.

	PR tree-optimization/46760
	* cgraph.c (cgraph_create_node): Initialize count_materialization_scale.
	* cgraph.h (struct cgraph_node): Add count_materialization_scale.
	* lto-cgraph.c (lto_output_edge): Fix assert.
	(lto_output_node): Output count_materialization_scale.
	(output_profile_summary): Output only runs and sum_max.
	(input_node): Input count_materialization_scale.
	(input_profile_summary): Read data into file specific gcov summary.
	(merge_profile_summaries): New function.
	(input_cgraph): Update call of input_profile_summary;
	call merge_profile_summaries.
	* lto-streamer-in.c (input_cfg): Add count_materialization_scale arg;
	rescale counts at read in.
	(intput_bb): Likewise.
	(input_function): Update call of input_bb.
	(lto_read_body): Update call of input_cfg.
	* lto-streamer.h: Inlclude gcov-io.h
	(lto_file_decl_data): Add gcov_ctr_summary.
Index: cgraph.c
===================================================================
*** cgraph.c	(revision 167457)
--- cgraph.c	(working copy)
*************** cgraph_create_node (void)
*** 478,483 ****
--- 478,484 ----
    node->previous = NULL;
    node->global.estimated_growth = INT_MIN;
    node->frequency = NODE_FREQUENCY_NORMAL;
+   node->count_materialization_scale = REG_BR_PROB_BASE;
    ipa_empty_ref_list (&node->ref_list);
    cgraph_nodes = node;
    cgraph_n_nodes++;
Index: cgraph.h
===================================================================
*** cgraph.h	(revision 167457)
--- cgraph.h	(working copy)
*************** struct GTY((chain_next ("%h.next"), chai
*** 233,238 ****
--- 233,241 ----
  
    /* Expected number of executions: calculated in profile.c.  */
    gcov_type count;
+   /* How to scale counts at materialization time; used to merge
+      LTO units with different number of profile runs.  */
+   int count_materialization_scale;
    /* Unique id of the node.  */
    int uid;
    /* Ordering of all cgraph nodes.  */
Index: lto-cgraph.c
===================================================================
*** lto-cgraph.c	(revision 167457)
--- lto-cgraph.c	(working copy)
*************** lto_output_edge (struct lto_simple_outpu
*** 302,307 ****
--- 302,308 ----
        gcc_assert (!(flags & (ECF_LOOPING_CONST_OR_PURE
  			     | ECF_MAY_BE_ALLOCA
  			     | ECF_SIBCALL
+ 			     | ECF_LEAF
  			     | ECF_NOVOPS)));
      }
    lto_output_bitpack (&bp);
*************** lto_output_node (struct lto_simple_outpu
*** 462,467 ****
--- 463,469 ----
  
    lto_output_fn_decl_index (ob->decl_state, ob->main_stream, node->decl);
    lto_output_sleb128_stream (ob->main_stream, node->count);
+   lto_output_sleb128_stream (ob->main_stream, node->count_materialization_scale);
  
    if (tag == LTO_cgraph_analyzed_node)
      {
*************** output_profile_summary (struct lto_simpl
*** 661,672 ****
  {
    if (profile_info)
      {
!       /* We do not output num, it is not terribly useful.  */
        gcc_assert (profile_info->runs);
        lto_output_uleb128_stream (ob->main_stream, profile_info->runs);
!       lto_output_sleb128_stream (ob->main_stream, profile_info->sum_all);
!       lto_output_sleb128_stream (ob->main_stream, profile_info->run_max);
!       lto_output_sleb128_stream (ob->main_stream, profile_info->sum_max);
      }
    else
      lto_output_uleb128_stream (ob->main_stream, 0);
--- 663,674 ----
  {
    if (profile_info)
      {
!       /* We do not output num, sum_all and run_max, they are not used by
! 	 GCC profile feedback and they are difficult to merge from multiple
! 	 units.  */
        gcc_assert (profile_info->runs);
        lto_output_uleb128_stream (ob->main_stream, profile_info->runs);
!       lto_output_uleb128_stream (ob->main_stream, profile_info->sum_max);
      }
    else
      lto_output_uleb128_stream (ob->main_stream, 0);
*************** input_node (struct lto_file_decl_data *f
*** 1045,1050 ****
--- 1047,1053 ----
      node = cgraph_node (fn_decl);
  
    node->count = lto_input_sleb128 (ib);
+   node->count_materialization_scale = lto_input_sleb128 (ib);
  
    if (tag == LTO_cgraph_analyzed_node)
      {
*************** static struct gcov_ctr_summary lto_gcov_
*** 1424,1455 ****
  
  /* Input profile_info from IB.  */
  static void
! input_profile_summary (struct lto_input_block *ib)
  {
    unsigned int runs = lto_input_uleb128 (ib);
    if (runs)
      {
!       if (!profile_info)
!         {
! 	  profile_info = &lto_gcov_summary;
! 	  lto_gcov_summary.runs = runs;
! 	  lto_gcov_summary.sum_all = lto_input_sleb128 (ib);
! 	  lto_gcov_summary.run_max = lto_input_sleb128 (ib);
! 	  lto_gcov_summary.sum_max = lto_input_sleb128 (ib);
! 	}
!       /* We can support this by scaling all counts to nearest common multiple
!          of all different runs, but it is perhaps not worth the effort.  */
!       else if (profile_info->runs != runs
! 	       || profile_info->sum_all != lto_input_sleb128 (ib)
! 	       || profile_info->run_max != lto_input_sleb128 (ib)
! 	       || profile_info->sum_max != lto_input_sleb128 (ib))
! 	sorry ("combining units with different profiles is not supported");
!       /* We allow some units to have profile and other to not have one.  This will
!          just make unprofiled units to be size optimized that is sane.  */
      }
  
  }
  
  /* Input and merge the cgraph from each of the .o files passed to
     lto1.  */
  
--- 1427,1534 ----
  
  /* Input profile_info from IB.  */
  static void
! input_profile_summary (struct lto_input_block *ib,
! 		       struct lto_file_decl_data *file_data)
  {
    unsigned int runs = lto_input_uleb128 (ib);
    if (runs)
      {
!       file_data->profile_info.runs = runs;
!       file_data->profile_info.sum_max = lto_input_uleb128 (ib);
!       if (runs > file_data->profile_info.sum_max)
! 	fatal_error ("Corrupted profile info in %s: sum_max is smaller than runs",
! 		     file_data->file_name);
      }
  
  }
  
+ /* Rescale profile summaries to the same number of runs in the whole unit.  */
+ 
+ static void
+ merge_profile_summaries (struct lto_file_decl_data **file_data_vec)
+ {
+   struct lto_file_decl_data *file_data;
+   unsigned int j;
+   gcov_unsigned_t max_runs = 0;
+   struct cgraph_node *node;
+   struct cgraph_edge *edge;
+ 
+   /* Find unit with maximal number of runs.  If we ever get serious about
+      roundoff errors, we might also consider computing smallest common
+      multiply.  */
+   for (j = 0; (file_data = file_data_vec[j]) != NULL; j++)
+     if (max_runs < file_data->profile_info.runs)
+       max_runs = file_data->profile_info.runs;
+ 
+   if (!max_runs)
+     return;
+ 
+   /* Simple overflow check.  We probably don't need to support that many train
+      runs. Such a large value probably imply data corruption anyway.  */
+   if (max_runs > INT_MAX / REG_BR_PROB_BASE)
+     {
+       sorry ("At most %i profile runs is supported. Perhaps corrupted profile?",
+ 	     INT_MAX / REG_BR_PROB_BASE);
+       return;
+     }
+ 
+   profile_info = &lto_gcov_summary;
+   lto_gcov_summary.runs = max_runs;
+   lto_gcov_summary.sum_max = 0;
+ 
+   /* Rescale all units to the maximal number of runs.
+      sum_max can not be easily merged, as we have no idea what files come from
+      the same run.  We do not use the info anyway, so leave it 0.  */
+   for (j = 0; (file_data = file_data_vec[j]) != NULL; j++)
+     if (file_data->profile_info.runs)
+       {
+ 	int scale = ((REG_BR_PROB_BASE * max_runs
+ 		      + file_data->profile_info.runs / 2)
+ 		     / file_data->profile_info.runs);
+ 	lto_gcov_summary.sum_max = MAX (lto_gcov_summary.sum_max,
+ 					(file_data->profile_info.sum_max
+ 					 * scale
+ 					 + REG_BR_PROB_BASE / 2)
+ 					/ REG_BR_PROB_BASE);
+       }
+ 
+   /* Watch roundoff errors.  */
+   if (lto_gcov_summary.sum_max < max_runs)
+     lto_gcov_summary.sum_max = max_runs;
+ 
+   /* If merging already happent at WPA time, we are done.  */
+   if (flag_ltrans)
+     return;
+ 
+   /* Now compute count_materialization_scale of each node.
+      During LTRANS we already have values of count_materialization_scale
+      computed, so just update them.  */
+   for (node = cgraph_nodes; node; node = node->next)
+     if (node->local.lto_file_data->profile_info.run_max)
+       {
+ 	int scale;
+ 	if (node->local.lto_file_data->profile_info.runs)
+ 	  scale =
+ 	     ((node->count_materialization_scale * max_runs
+ 	       + node->local.lto_file_data->profile_info.run_max / 2)
+ 	      / node->local.lto_file_data->profile_info.run_max);
+ 	else
+ 	  scale = node->count_materialization_scale;
+ 	node->count_materialization_scale = scale;
+ 	if (scale < 0)
+ 	  fatal_error ("Profile information in %s corrupted",
+ 		       file_data->file_name);
+ 
+ 	if (scale == REG_BR_PROB_BASE)
+ 	  continue;
+ 	for (edge = node->callees; edge; edge = edge->next_callee)
+ 	  edge->count = ((edge->count * scale + REG_BR_PROB_BASE / 2)
+ 			 / REG_BR_PROB_BASE);
+ 	node->count = ((node->count * scale + REG_BR_PROB_BASE / 2)
+ 		       / REG_BR_PROB_BASE);
+       }
+ }
+ 
  /* Input and merge the cgraph from each of the .o files passed to
     lto1.  */
  
*************** input_cgraph (void)
*** 1473,1479 ****
  					  &data, &len);
        if (!ib) 
  	fatal_error ("cannot find LTO cgraph in %s", file_data->file_name);
!       input_profile_summary (ib);
        file_data->cgraph_node_encoder = lto_cgraph_encoder_new ();
        nodes = input_cgraph_1 (file_data, ib);
        lto_destroy_simple_input_block (file_data, LTO_section_cgraph,
--- 1552,1558 ----
  					  &data, &len);
        if (!ib) 
  	fatal_error ("cannot find LTO cgraph in %s", file_data->file_name);
!       input_profile_summary (ib, file_data);
        file_data->cgraph_node_encoder = lto_cgraph_encoder_new ();
        nodes = input_cgraph_1 (file_data, ib);
        lto_destroy_simple_input_block (file_data, LTO_section_cgraph,
*************** input_cgraph (void)
*** 1499,1504 ****
--- 1578,1585 ----
        VEC_free (cgraph_node_ptr, heap, nodes);
        VEC_free (varpool_node_ptr, heap, varpool);
      }
+   merge_profile_summaries (file_data_vec);
+     
  
    /* Clear out the aux field that was used to store enough state to
       tell which nodes should be overwritten.  */
Index: lto-streamer-in.c
===================================================================
*** lto-streamer-in.c	(revision 167457)
--- lto-streamer-in.c	(working copy)
*************** make_new_block (struct function *fn, uns
*** 719,725 ****
  /* Read the CFG for function FN from input block IB.  */
  
  static void
! input_cfg (struct lto_input_block *ib, struct function *fn)
  {
    unsigned int bb_count;
    basic_block p_bb;
--- 719,726 ----
  /* Read the CFG for function FN from input block IB.  */
  
  static void
! input_cfg (struct lto_input_block *ib, struct function *fn,
! 	   int count_materialization_scale)
  {
    unsigned int bb_count;
    basic_block p_bb;
*************** input_cfg (struct lto_input_block *ib, s
*** 752,758 ****
        if (bb == NULL)
  	bb = make_new_block (fn, index);
  
!       edge_count = lto_input_uleb128 (ib);
  
        /* Connect up the CFG.  */
        for (i = 0; i < edge_count; i++)
--- 753,760 ----
        if (bb == NULL)
  	bb = make_new_block (fn, index);
  
!       edge_count = (lto_input_uleb128 (ib) * count_materialization_scale
! 		    + REG_BR_PROB_BASE / 2) / REG_BR_PROB_BASE;
  
        /* Connect up the CFG.  */
        for (i = 0; i < edge_count; i++)
*************** input_gimple_stmt (struct lto_input_bloc
*** 1066,1072 ****
  
  static void
  input_bb (struct lto_input_block *ib, enum LTO_tags tag,
! 	  struct data_in *data_in, struct function *fn)
  {
    unsigned int index;
    basic_block bb;
--- 1068,1075 ----
  
  static void
  input_bb (struct lto_input_block *ib, enum LTO_tags tag,
! 	  struct data_in *data_in, struct function *fn,
! 	  int count_materialization_scale)
  {
    unsigned int index;
    basic_block bb;
*************** input_bb (struct lto_input_block *ib, en
*** 1079,1085 ****
    index = lto_input_uleb128 (ib);
    bb = BASIC_BLOCK_FOR_FUNCTION (fn, index);
  
!   bb->count = lto_input_sleb128 (ib);
    bb->loop_depth = lto_input_sleb128 (ib);
    bb->frequency = lto_input_sleb128 (ib);
    bb->flags = lto_input_sleb128 (ib);
--- 1082,1089 ----
    index = lto_input_uleb128 (ib);
    bb = BASIC_BLOCK_FOR_FUNCTION (fn, index);
  
!   bb->count = (lto_input_sleb128 (ib) * count_materialization_scale
! 	       + REG_BR_PROB_BASE / 2) / REG_BR_PROB_BASE;
    bb->loop_depth = lto_input_sleb128 (ib);
    bb->frequency = lto_input_sleb128 (ib);
    bb->flags = lto_input_sleb128 (ib);
*************** input_function (tree fn_decl, struct dat
*** 1253,1264 ****
    DECL_INITIAL (fn_decl) = lto_input_tree (ib, data_in);
    gcc_assert (DECL_INITIAL (fn_decl));
    DECL_SAVED_TREE (fn_decl) = NULL_TREE;
  
    /* Read all the basic blocks.  */
    tag = input_record_start (ib);
    while (tag)
      {
!       input_bb (ib, tag, data_in, fn);
        tag = input_record_start (ib);
      }
  
--- 1257,1270 ----
    DECL_INITIAL (fn_decl) = lto_input_tree (ib, data_in);
    gcc_assert (DECL_INITIAL (fn_decl));
    DECL_SAVED_TREE (fn_decl) = NULL_TREE;
+   node = cgraph_node (fn_decl);
  
    /* Read all the basic blocks.  */
    tag = input_record_start (ib);
    while (tag)
      {
!       input_bb (ib, tag, data_in, fn,
! 		node->count_materialization_scale);
        tag = input_record_start (ib);
      }
  
*************** input_function (tree fn_decl, struct dat
*** 1300,1306 ****
      gimple_set_body (fn_decl, bb_seq (ei_edge (ei)->dest));
    }
  
-   node = cgraph_node (fn_decl);
    fixup_call_stmt_edges (node, stmts);
    execute_all_ipa_stmt_fixups (node, stmts);
  
--- 1306,1311 ----
*************** lto_read_body (struct lto_file_decl_data
*** 1393,1398 ****
--- 1398,1404 ----
      {
        struct function *fn = DECL_STRUCT_FUNCTION (fn_decl);
        struct lto_in_decl_state *decl_state;
+       struct cgraph_node *node = cgraph_node (fn_decl);
  
        push_cfun (fn);
        init_tree_ssa (fn);
*************** lto_read_body (struct lto_file_decl_data
*** 1402,1408 ****
        gcc_assert (decl_state);
        file_data->current_decl_state = decl_state;
  
!       input_cfg (&ib_cfg, fn);
  
        /* Set up the struct function.  */
        input_function (fn_decl, data_in, &ib_main);
--- 1408,1414 ----
        gcc_assert (decl_state);
        file_data->current_decl_state = decl_state;
  
!       input_cfg (&ib_cfg, fn, node->count_materialization_scale);
  
        /* Set up the struct function.  */
        input_function (fn_decl, data_in, &ib_main);
Index: lto-streamer.h
===================================================================
*** lto-streamer.h	(revision 167457)
--- lto-streamer.h	(working copy)
*************** along with GCC; see the file COPYING3.  
*** 31,36 ****
--- 31,37 ----
  #include "vec.h"
  #include "vecprim.h"
  #include "alloc-pool.h"
+ #include "gcov-io.h"
  
  /* Define when debugging the LTO streamer.  This causes the writer
     to output the numeric value for the memory address of the tree node
*************** struct GTY(()) lto_file_decl_data
*** 610,615 ****
--- 611,618 ----
  
    /* Symbol resolutions for this file */
    VEC(ld_plugin_symbol_resolution_t,heap) * GTY((skip)) resolutions;
+ 
+   struct gcov_ctr_summary GTY((skip)) profile_info;
  };
  
  typedef struct lto_file_decl_data *lto_file_decl_data_ptr;



More information about the Gcc-patches mailing list