This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Fix IPA WRT profile


Hi,
while investigating recent tramp3d regression, I noticed that there are several
bugs regarding to profile (both feedback and guessed) handling at function
versioning/clonning and inlining.  This result in funny ill effects on inline
decisions.  In particular:

  1) When clonning, entry/exit block profile is not set.  This makes all code in function
     body appear to be in loop iterating very many times.
  2) When clonning, profile summaries are not copied. This makes clone itself to be optimized
     without profile info (even guessed profile) but after further inlining we hide this fact
     and optimize with profile again.
  3) Inliner when using original function to represent the inline clone (i.e. it is last
     use of function) forgets to update frequencies and loop nests.
  4) tree-inline had few confused bits concerning frequency scaling while versining.
     this is wrong. BB frequencies are function local so they don't need to be updated
     when versioning, only when inlining.

I also added sanity checking, but there are still some failures, I am
investigating these and will fix incrementally.  This patch brings tramp3d -O2
score from 0.58 to 0.46, while with leafify we get 0.44.  So there is still
some room for improvement.  We used to have same scores on leafify and without,
but that was more by bug than design since we accounted incorrectly COMDATs.

Bootstrapped&regtested x86_64-linux, will commit it shortly.

Honza

	* cgraphbuild.c (compute_call_stmt_bb_frequency): Use proper ENTRY_BLOCK_PTR.
	* cgraph.c (cgraph_clone_edge): Avoid freq_scale 0 to completely zero out all
	callees.
	* cgraphunit.c (verify_cgraph_node): Verify cgraph nodes for frequency and count match.
	* ipa-inline.c (update_noncloned_frequencies): New function.
	(cgraph_clone_inlined_nodes): Use it.
	* tree-inline.c (copy_bb): Fix frequency scaling; output
	diagnostic on frequency mismatches to dump file.
	(initialize_cfun): Do not scale frequency; fix count scaling;
	initialize entry and exit block frequencies; copy profile
	info.
	(copy_cfg_body): Use frequency_scale as argument;
	fix count scaling.
	(copy_body): Use frequency_scale as argument.
	(expand_call_inline): Compute frequency scale and output diagnostic
	to dump file.
	(delete_unreachable_blocks_update_callgrah): Remove checking that
	has to be done after edge redirection.
	(tree_function_versioning): Update initialize_cfun and copy_body call.
Index: cgraphbuild.c
===================================================================
*** cgraphbuild.c	(revision 154200)
--- cgraphbuild.c	(working copy)
*************** reset_inline_failed (struct cgraph_node 
*** 109,115 ****
  int
  compute_call_stmt_bb_frequency (tree decl, basic_block bb)
  {
!   int entry_freq = ENTRY_BLOCK_PTR->frequency;
    int freq = bb->frequency;
  
    if (profile_status_for_function (DECL_STRUCT_FUNCTION (decl)) == PROFILE_ABSENT)
--- 109,116 ----
  int
  compute_call_stmt_bb_frequency (tree decl, basic_block bb)
  {
!   int entry_freq = ENTRY_BLOCK_PTR_FOR_FUNCTION
!   		     (DECL_STRUCT_FUNCTION (decl))->frequency;
    int freq = bb->frequency;
  
    if (profile_status_for_function (DECL_STRUCT_FUNCTION (decl)) == PROFILE_ABSENT)
Index: cgraph.c
===================================================================
*** cgraph.c	(revision 154200)
--- cgraph.c	(working copy)
*************** cgraph_clone_edge (struct cgraph_edge *e
*** 1641,1648 ****
  {
    struct cgraph_edge *new_edge;
    gcov_type count = e->count * count_scale / REG_BR_PROB_BASE;
!   gcov_type freq = e->frequency * (gcov_type) freq_scale / CGRAPH_FREQ_BASE;
  
    if (freq > CGRAPH_FREQ_MAX)
      freq = CGRAPH_FREQ_MAX;
    new_edge = cgraph_create_edge (n, e->callee, call_stmt, count, freq,
--- 1641,1652 ----
  {
    struct cgraph_edge *new_edge;
    gcov_type count = e->count * count_scale / REG_BR_PROB_BASE;
!   gcov_type freq;
  
+   /* We do not want to ignore loop nest after frequency drops to 0.  */
+   if (!freq_scale)
+     freq_scale = 1;
+   freq = e->frequency * (gcov_type) freq_scale / CGRAPH_FREQ_BASE;
    if (freq > CGRAPH_FREQ_MAX)
      freq = CGRAPH_FREQ_MAX;
    new_edge = cgraph_create_edge (n, e->callee, call_stmt, count, freq,
Index: cgraphunit.c
===================================================================
*** cgraphunit.c	(revision 154200)
--- cgraphunit.c	(working copy)
*************** verify_cgraph_node (struct cgraph_node *
*** 620,625 ****
--- 620,645 ----
  	  error ("caller edge frequency is too large");
  	  error_found = true;
  	}
+       if (gimple_has_body_p (e->caller->decl)
+           && !e->caller->global.inlined_to
+           && (e->frequency
+ 	      != compute_call_stmt_bb_frequency (e->caller->decl,
+ 						 gimple_bb (e->call_stmt))))
+ 	{
+ 	  error ("caller edge frequency %i does not match BB freqency %i",
+ 	  	 e->frequency,
+ 		 compute_call_stmt_bb_frequency (e->caller->decl,
+ 						 gimple_bb (e->call_stmt)));
+ 	  error_found = true;
+ 	}
+       /* When clones are involved, the count is distributed across all of them.
+          */
+       if (!e->caller->clones && !e->caller->clone_of
+           && e->count != gimple_bb (e->call_stmt)->count)
+ 	{
+ 	  error ("caller edge count does not match BB count");
+ 	  error_found = true;
+ 	}
        if (!e->inline_failed)
  	{
  	  if (node->global.inlined_to
Index: ipa-inline.c
===================================================================
*** ipa-inline.c	(revision 154200)
--- ipa-inline.c	(working copy)
*************** cgraph_estimate_size_after_inlining (int
*** 207,212 ****
--- 207,235 ----
    return size;
  }
  
+ /* Scale frequency of NODE edges by FREQ_SCALE and increase loop nest
+    by NEST.  */
+ 
+ static void
+ update_noncloned_frequencies (struct cgraph_node *node,
+ 			      int freq_scale, int nest)
+ {
+   struct cgraph_edge *e;
+ 
+   /* We do not want to ignore high loop nest after freq drops to 0.  */
+   if (!freq_scale)
+     freq_scale = 1;
+   for (e = node->callees; e; e = e->next_callee)
+     {
+       e->loop_nest += nest;
+       e->frequency = e->frequency * (gcov_type) freq_scale / CGRAPH_FREQ_BASE;
+       if (e->frequency > CGRAPH_FREQ_MAX)
+         e->frequency = CGRAPH_FREQ_MAX;
+       if (!e->inline_failed)
+         update_noncloned_frequencies (e->callee, freq_scale, nest);
+     }
+ }
+ 
  /* E is expected to be an edge being inlined.  Clone destination node of
     the edge and redirect it to the new clone.
     DUPLICATE is used for bookkeeping on whether we are actually creating new
*************** cgraph_clone_inlined_nodes (struct cgrap
*** 234,239 ****
--- 257,263 ----
  	    }
  	  duplicate = false;
  	  e->callee->local.externally_visible = false;
+           update_noncloned_frequencies (e->callee, e->frequency, e->loop_nest);
  	}
        else
  	{
Index: tree-inline.c
===================================================================
*** tree-inline.c	(revision 154200)
--- tree-inline.c	(working copy)
*************** copy_bb (copy_body_data *id, basic_block
*** 1472,1477 ****
--- 1472,1478 ----
    gimple_stmt_iterator gsi, copy_gsi, seq_gsi;
    basic_block copy_basic_block;
    tree decl;
+   gcov_type freq;
  
    /* create_basic_block() will append every new block to
       basic_block_info automatically.  */
*************** copy_bb (copy_body_data *id, basic_block
*** 1481,1491 ****
  
    /* We are going to rebuild frequencies from scratch.  These values
       have just small importance to drive canonicalize_loop_headers.  */
!   copy_basic_block->frequency = ((gcov_type)bb->frequency
! 				 * frequency_scale / REG_BR_PROB_BASE);
  
!   if (copy_basic_block->frequency > BB_FREQ_MAX)
!     copy_basic_block->frequency = BB_FREQ_MAX;
  
    copy_gsi = gsi_start_bb (copy_basic_block);
  
--- 1482,1493 ----
  
    /* We are going to rebuild frequencies from scratch.  These values
       have just small importance to drive canonicalize_loop_headers.  */
!   freq = ((gcov_type)bb->frequency * frequency_scale / REG_BR_PROB_BASE);
  
!   /* We recompute frequencies after inlining, so this is quite safe.  */
!   if (freq > BB_FREQ_MAX)
!     freq = BB_FREQ_MAX;
!   copy_basic_block->frequency = freq;
  
    copy_gsi = gsi_start_bb (copy_basic_block);
  
*************** copy_bb (copy_body_data *id, basic_block
*** 1631,1640 ****
  		case CB_CGE_DUPLICATE:
  		  edge = cgraph_edge (id->src_node, orig_stmt);
  		  if (edge)
! 		    edge = cgraph_clone_edge (edge, id->dst_node, stmt,
! 					      gimple_uid (stmt),
! 					      REG_BR_PROB_BASE, 1,
! 					      edge->frequency, true);
  		  break;
  
  		case CB_CGE_MOVE_CLONES:
--- 1633,1666 ----
  		case CB_CGE_DUPLICATE:
  		  edge = cgraph_edge (id->src_node, orig_stmt);
  		  if (edge)
! 		    {
! 		      int edge_freq = edge->frequency;
! 		      edge = cgraph_clone_edge (edge, id->dst_node, stmt,
! 					        gimple_uid (stmt),
! 					        REG_BR_PROB_BASE, CGRAPH_FREQ_BASE,
! 					        edge->frequency, true);
! 		      /* We could also just rescale the frequency, but
! 		         doing so would introduce roundoff errors and make
! 			 verifier unhappy.  */
! 		      edge->frequency 
! 		        = compute_call_stmt_bb_frequency (id->dst_node->decl,
! 							  copy_basic_block);
! 		      if (dump_file
! 		      	  && profile_status_for_function (cfun) != PROFILE_ABSENT
! 			  && (edge_freq > edge->frequency + 10
! 			      || edge_freq < edge->frequency - 10))
! 			{
! 			  fprintf (dump_file, "Edge frequency estimated by "
! 			           "cgraph %i diverge from inliner's estimate %i\n",
! 			  	   edge_freq,
! 				   edge->frequency);
! 			  fprintf (dump_file,
! 			  	   "Orig bb: %i, orig bb freq %i, new bb freq %i\n",
! 				   bb->index,
! 				   bb->frequency,
! 				   copy_basic_block->frequency);
! 			}
! 		    }
  		  break;
  
  		case CB_CGE_MOVE_CLONES:
*************** copy_bb (copy_body_data *id, basic_block
*** 1674,1680 ****
  		  if (id->transform_call_graph_edges == CB_CGE_MOVE_CLONES)
  		    cgraph_create_edge_including_clones
  		      (id->dst_node, dest, stmt, bb->count,
! 		       compute_call_stmt_bb_frequency (id->dst_node->decl, bb),
  		       bb->loop_depth, CIF_ORIGINALLY_INDIRECT_CALL);
  		  else
  		    cgraph_create_edge (id->dst_node, dest, stmt,
--- 1700,1707 ----
  		  if (id->transform_call_graph_edges == CB_CGE_MOVE_CLONES)
  		    cgraph_create_edge_including_clones
  		      (id->dst_node, dest, stmt, bb->count,
! 		       compute_call_stmt_bb_frequency (id->dst_node->decl, 
! 		       				       copy_basic_block),
  		       bb->loop_depth, CIF_ORIGINALLY_INDIRECT_CALL);
  		  else
  		    cgraph_create_edge (id->dst_node, dest, stmt,
*************** remap_decl_1 (tree decl, void *data)
*** 1948,1971 ****
     NEW_FNDECL to be build.  CALLEE_FNDECL is the original */
  
  static void
! initialize_cfun (tree new_fndecl, tree callee_fndecl, gcov_type count,
! 		 int frequency)
  {
    struct function *src_cfun = DECL_STRUCT_FUNCTION (callee_fndecl);
!   gcov_type count_scale, frequency_scale;
  
    if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count)
      count_scale = (REG_BR_PROB_BASE * count
  		   / ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count);
    else
!     count_scale = 1;
! 
!   if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency)
!     frequency_scale = (REG_BR_PROB_BASE * frequency
! 		       /
! 		       ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency);
!   else
!     frequency_scale = count_scale;
  
    /* Register specific tree functions.  */
    gimple_register_cfg_hooks ();
--- 1975,1990 ----
     NEW_FNDECL to be build.  CALLEE_FNDECL is the original */
  
  static void
! initialize_cfun (tree new_fndecl, tree callee_fndecl, gcov_type count)
  {
    struct function *src_cfun = DECL_STRUCT_FUNCTION (callee_fndecl);
!   gcov_type count_scale;
  
    if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count)
      count_scale = (REG_BR_PROB_BASE * count
  		   / ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count);
    else
!     count_scale = REG_BR_PROB_BASE;
  
    /* Register specific tree functions.  */
    gimple_register_cfg_hooks ();
*************** initialize_cfun (tree new_fndecl, tree c
*** 1998,2015 ****
  
    init_empty_tree_cfg ();
  
    ENTRY_BLOCK_PTR->count =
      (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count * count_scale /
       REG_BR_PROB_BASE);
!   ENTRY_BLOCK_PTR->frequency =
!     (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency *
!      frequency_scale / REG_BR_PROB_BASE);
    EXIT_BLOCK_PTR->count =
      (EXIT_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count * count_scale /
       REG_BR_PROB_BASE);
    EXIT_BLOCK_PTR->frequency =
!     (EXIT_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency *
!      frequency_scale / REG_BR_PROB_BASE);
    if (src_cfun->eh)
      init_eh_for_function ();
  
--- 2017,2033 ----
  
    init_empty_tree_cfg ();
  
+   profile_status_for_function (cfun) = profile_status_for_function (src_cfun);
    ENTRY_BLOCK_PTR->count =
      (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count * count_scale /
       REG_BR_PROB_BASE);
!   ENTRY_BLOCK_PTR->frequency
!     = ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency;
    EXIT_BLOCK_PTR->count =
      (EXIT_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count * count_scale /
       REG_BR_PROB_BASE);
    EXIT_BLOCK_PTR->frequency =
!     EXIT_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency;
    if (src_cfun->eh)
      init_eh_for_function ();
  
*************** initialize_cfun (tree new_fndecl, tree c
*** 2026,2032 ****
     another function.  Walks FN via CFG, returns new fndecl.  */
  
  static tree
! copy_cfg_body (copy_body_data * id, gcov_type count, int frequency,
  	       basic_block entry_block_map, basic_block exit_block_map)
  {
    tree callee_fndecl = id->src_fn;
--- 2044,2050 ----
     another function.  Walks FN via CFG, returns new fndecl.  */
  
  static tree
! copy_cfg_body (copy_body_data * id, gcov_type count, int frequency_scale,
  	       basic_block entry_block_map, basic_block exit_block_map)
  {
    tree callee_fndecl = id->src_fn;
*************** copy_cfg_body (copy_body_data * id, gcov
*** 2035,2055 ****
    struct function *cfun_to_copy;
    basic_block bb;
    tree new_fndecl = NULL;
!   gcov_type count_scale, frequency_scale;
    int last;
  
    if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count)
      count_scale = (REG_BR_PROB_BASE * count
  		   / ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count);
    else
!     count_scale = 1;
! 
!   if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency)
!     frequency_scale = (REG_BR_PROB_BASE * frequency
! 		       /
! 		       ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->frequency);
!   else
!     frequency_scale = count_scale;
  
    /* Register specific tree functions.  */
    gimple_register_cfg_hooks ();
--- 2053,2066 ----
    struct function *cfun_to_copy;
    basic_block bb;
    tree new_fndecl = NULL;
!   gcov_type count_scale;
    int last;
  
    if (ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count)
      count_scale = (REG_BR_PROB_BASE * count
  		   / ENTRY_BLOCK_PTR_FOR_FUNCTION (src_cfun)->count);
    else
!     count_scale = REG_BR_PROB_BASE;
  
    /* Register specific tree functions.  */
    gimple_register_cfg_hooks ();
*************** copy_tree_body (copy_body_data *id)
*** 2204,2210 ****
     another function.  */
  
  static tree
! copy_body (copy_body_data *id, gcov_type count, int frequency,
  	   basic_block entry_block_map, basic_block exit_block_map)
  {
    tree fndecl = id->src_fn;
--- 2215,2221 ----
     another function.  */
  
  static tree
! copy_body (copy_body_data *id, gcov_type count, int frequency_scale,
  	   basic_block entry_block_map, basic_block exit_block_map)
  {
    tree fndecl = id->src_fn;
*************** copy_body (copy_body_data *id, gcov_type
*** 2212,2218 ****
  
    /* If this body has a CFG, walk CFG and copy.  */
    gcc_assert (ENTRY_BLOCK_PTR_FOR_FUNCTION (DECL_STRUCT_FUNCTION (fndecl)));
!   body = copy_cfg_body (id, count, frequency, entry_block_map, exit_block_map);
    copy_debug_stmts (id);
  
    return body;
--- 2223,2229 ----
  
    /* If this body has a CFG, walk CFG and copy.  */
    gcc_assert (ENTRY_BLOCK_PTR_FOR_FUNCTION (DECL_STRUCT_FUNCTION (fndecl)));
!   body = copy_cfg_body (id, count, frequency_scale, entry_block_map, exit_block_map);
    copy_debug_stmts (id);
  
    return body;
*************** expand_call_inline (basic_block bb, gimp
*** 3732,3743 ****
  				       cfun->local_decls);
      }
  
    /* This is it.  Duplicate the callee body.  Assume callee is
       pre-gimplified.  Note that we must not alter the caller
       function in any way before this point, as this CALL_EXPR may be
       a self-referential call; if we're calling ourselves, we need to
       duplicate our body before altering anything.  */
!   copy_body (id, bb->count, bb->frequency, bb, return_block);
  
    /* Reset the escaped and callused solutions.  */
    if (cfun->gimple_df)
--- 3743,3765 ----
  				       cfun->local_decls);
      }
  
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     {
+       fprintf (dump_file, "Inlining ");
+       print_generic_expr (dump_file, id->src_fn, 0); 
+       fprintf (dump_file, " to ");
+       print_generic_expr (dump_file, id->dst_fn, 0); 
+       fprintf (dump_file, " with frequency %i\n", cg_edge->frequency);
+     }
+ 
    /* This is it.  Duplicate the callee body.  Assume callee is
       pre-gimplified.  Note that we must not alter the caller
       function in any way before this point, as this CALL_EXPR may be
       a self-referential call; if we're calling ourselves, we need to
       duplicate our body before altering anything.  */
!   copy_body (id, bb->count,
!   	     cg_edge->frequency * REG_BR_PROB_BASE / CGRAPH_FREQ_BASE,
! 	     bb, return_block);
  
    /* Reset the escaped and callused solutions.  */
    if (cfun->gimple_df)
*************** delete_unreachable_blocks_update_callgra
*** 4732,4761 ****
  
    if (changed)
      tidy_fallthru_edges ();
- #ifdef ENABLE_CHECKING0
-   verify_cgraph_node (id->dst_node);
-   if (id->transform_call_graph_edges == CB_CGE_MOVE_CLONES
-       && id->dst_node->clones)
-     {
-       struct cgraph_node *node;
-       for (node = id->dst_node->clones; node != id->dst_node;)
- 	{
- 	  verify_cgraph_node (node);
- 	   
- 	  if (node->clones)
- 	    node = node->clones;
- 	  else if (node->next_sibling_clone)
- 	    node = node->next_sibling_clone;
- 	  else
- 	    {
- 	      while (node != id->dst_node && !node->next_sibling_clone)
- 		node = node->clone_of;
- 	      if (node != id->dst_node)
- 		node = node->next_sibling_clone;
- 	    }
- 	}
-      }
- #endif
    return changed;
  }
  
--- 4754,4759 ----
*************** tree_function_versioning (tree old_decl,
*** 4876,4883 ****
    old_entry_block = ENTRY_BLOCK_PTR_FOR_FUNCTION
      (DECL_STRUCT_FUNCTION (old_decl));
    initialize_cfun (new_decl, old_decl,
! 		   old_entry_block->count,
! 		   old_entry_block->frequency);
    push_cfun (DECL_STRUCT_FUNCTION (new_decl));
    
    /* Copy the function's static chain.  */
--- 4874,4880 ----
    old_entry_block = ENTRY_BLOCK_PTR_FOR_FUNCTION
      (DECL_STRUCT_FUNCTION (old_decl));
    initialize_cfun (new_decl, old_decl,
! 		   old_entry_block->count);
    push_cfun (DECL_STRUCT_FUNCTION (new_decl));
    
    /* Copy the function's static chain.  */
*************** tree_function_versioning (tree old_decl,
*** 4947,4953 ****
        }
    
    /* Copy the Function's body.  */
!   copy_body (&id, old_entry_block->count, old_entry_block->frequency,
  	     ENTRY_BLOCK_PTR, EXIT_BLOCK_PTR);
    
    if (DECL_RESULT (old_decl) != NULL_TREE)
--- 4944,4950 ----
        }
    
    /* Copy the Function's body.  */
!   copy_body (&id, old_entry_block->count, REG_BR_PROB_BASE,
  	     ENTRY_BLOCK_PTR, EXIT_BLOCK_PTR);
    
    if (DECL_RESULT (old_decl) != NULL_TREE)


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]