This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

REVISED^2 PATCH: adjust init_set_costs to account for call_used_regs (PR42505)


Zdenek Dvorak wrote:
Hi,

what about simply increasing the value of target_res_regs, instead?  After all,
target_res_regs is intended to acount for the scratch registers etc.

What about performance results on some mainstream architecture, say x86_64?
I'm out of time for further experimentation and benchmarking; my manager told me 2 weeks ago already that I'd spent enough time on this bug. At this point, I just want to get something checked in that prevents ivopts from thinking it has 9 registers available when it only has 4 because of the call in the loop. If adding the additional register pressure parameter is unacceptable without further experiments, I'll remove it entirely.

while it is somewhat unfortunate, this seems to be the best course of action. Fine-tuning parameters without sufficient testing is kind of pointless.

This needs to be a bit more careful -- you should exclude builtin calls that will
not be expanded to real function calls (e.g., builtin_prefetch),
Is there code elsewhere in the compiler that knows which builtins are guaranteed not to expand into a real function call?

The code in tree-inline.c:estimate_num_insns for the GIMPLE_CALL case could be factored to a new function and used for that,

OK. How about this version? Bootstrapped and regression-tested on x86_64, as before, plus I checked that it didn't introduce any unexpected performance regressions on ARM.


-Sandra

2010-07-10 Sandra Loosemore <sandra@codesourcery.com>

PR middle-end/42505

	gcc/
	* tree-inline.c (estimate_num_insns): Refactor builtin complexity
	lookup code into....
	* builtins.c (is_simple_builtin, is_inexpensive_builtin): ...these
	new functions.
	* tree.h (is_simple_builtin, is_inexpensive_builtin): Declare.
	* cfgloopanal.c (target_clobbered_regs): Define.
	(init_set_costs): Initialize target_clobbered_regs.
	(estimate_reg_pressure_cost): Add call_p argument.  When true,
	adjust the number of available registers to exclude the
	call-clobbered registers.
	* cfgloop.h (target_clobbered_regs): Declare.
	(estimate_reg_pressure_cost): Adjust declaration.
	* tree-ssa-loop-ivopts.c (struct ivopts_data): Add body_includes_call.
	(ivopts_global_cost_for_size): Pass it to estimate_reg_pressure_cost.
	(determine_set_costs): Dump target_clobbered_regs.
	(loop_body_includes_call): New function.
	(tree_ssa_iv_optimize_loop): Use it to initialize new field.
	* loop-invariant.c (gain_for_invariant): Adjust arguments to pass
	call_p flag through.
	(best_gain_for_invariant): Likewise.
	(find_invariants_to_move): Likewise.
	(move_single_loop_invariants): Likewise, using already-computed
	has_call field.

Index: gcc/tree-inline.c
===================================================================
--- gcc/tree-inline.c	(revision 161843)
+++ gcc/tree-inline.c	(working copy)
@@ -3453,105 +3453,13 @@ estimate_num_insns (gimple stmt, eni_wei
 	if (POINTER_TYPE_P (funtype))
 	  funtype = TREE_TYPE (funtype);
 
-	if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_MD)
+	if (is_simple_builtin (decl))
+	  return 0;
+	else if (is_inexpensive_builtin (decl))
 	  cost = weights->target_builtin_call_cost;
 	else
 	  cost = weights->call_cost;
 
-	if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
-	  switch (DECL_FUNCTION_CODE (decl))
-	    {
-	    /* Builtins that expand to constants.  */
-	    case BUILT_IN_CONSTANT_P:
-	    case BUILT_IN_EXPECT:
-	    case BUILT_IN_OBJECT_SIZE:
-	    case BUILT_IN_UNREACHABLE:
-	    /* Simple register moves or loads from stack.  */
-	    case BUILT_IN_RETURN_ADDRESS:
-	    case BUILT_IN_EXTRACT_RETURN_ADDR:
-	    case BUILT_IN_FROB_RETURN_ADDR:
-	    case BUILT_IN_RETURN:
-	    case BUILT_IN_AGGREGATE_INCOMING_ADDRESS:
-	    case BUILT_IN_FRAME_ADDRESS:
-	    case BUILT_IN_VA_END:
-	    case BUILT_IN_STACK_SAVE:
-	    case BUILT_IN_STACK_RESTORE:
-	    /* Exception state returns or moves registers around.  */
-	    case BUILT_IN_EH_FILTER:
-	    case BUILT_IN_EH_POINTER:
-	    case BUILT_IN_EH_COPY_VALUES:
-	      return 0;
-
-	    /* builtins that are not expensive (that is they are most probably
-	       expanded inline into resonably simple code).  */
-	    case BUILT_IN_ABS:
-	    case BUILT_IN_ALLOCA:
-	    case BUILT_IN_BSWAP32:
-	    case BUILT_IN_BSWAP64:
-	    case BUILT_IN_CLZ:
-	    case BUILT_IN_CLZIMAX:
-	    case BUILT_IN_CLZL:
-	    case BUILT_IN_CLZLL:
-	    case BUILT_IN_CTZ:
-	    case BUILT_IN_CTZIMAX:
-	    case BUILT_IN_CTZL:
-	    case BUILT_IN_CTZLL:
-	    case BUILT_IN_FFS:
-	    case BUILT_IN_FFSIMAX:
-	    case BUILT_IN_FFSL:
-	    case BUILT_IN_FFSLL:
-	    case BUILT_IN_IMAXABS:
-	    case BUILT_IN_FINITE:
-	    case BUILT_IN_FINITEF:
-	    case BUILT_IN_FINITEL:
-	    case BUILT_IN_FINITED32:
-	    case BUILT_IN_FINITED64:
-	    case BUILT_IN_FINITED128:
-	    case BUILT_IN_FPCLASSIFY:
-	    case BUILT_IN_ISFINITE:
-	    case BUILT_IN_ISINF_SIGN:
-	    case BUILT_IN_ISINF:
-	    case BUILT_IN_ISINFF:
-	    case BUILT_IN_ISINFL:
-	    case BUILT_IN_ISINFD32:
-	    case BUILT_IN_ISINFD64:
-	    case BUILT_IN_ISINFD128:
-	    case BUILT_IN_ISNAN:
-	    case BUILT_IN_ISNANF:
-	    case BUILT_IN_ISNANL:
-	    case BUILT_IN_ISNAND32:
-	    case BUILT_IN_ISNAND64:
-	    case BUILT_IN_ISNAND128:
-	    case BUILT_IN_ISNORMAL:
-	    case BUILT_IN_ISGREATER:
-	    case BUILT_IN_ISGREATEREQUAL:
-	    case BUILT_IN_ISLESS:
-	    case BUILT_IN_ISLESSEQUAL:
-	    case BUILT_IN_ISLESSGREATER:
-	    case BUILT_IN_ISUNORDERED:
-	    case BUILT_IN_VA_ARG_PACK:
-	    case BUILT_IN_VA_ARG_PACK_LEN:
-	    case BUILT_IN_VA_COPY:
-	    case BUILT_IN_TRAP:
-	    case BUILT_IN_SAVEREGS:
-	    case BUILT_IN_POPCOUNTL:
-	    case BUILT_IN_POPCOUNTLL:
-	    case BUILT_IN_POPCOUNTIMAX:
-	    case BUILT_IN_POPCOUNT:
-	    case BUILT_IN_PARITYL:
-	    case BUILT_IN_PARITYLL:
-	    case BUILT_IN_PARITYIMAX:
-	    case BUILT_IN_PARITY:
-	    case BUILT_IN_LABS:
-	    case BUILT_IN_LLABS:
-	    case BUILT_IN_PREFETCH:
-	      cost = weights->target_builtin_call_cost;
-	      break;
-
-	    default:
-	      break;
-	    }
-
 	if (decl)
 	  funtype = TREE_TYPE (decl);
 
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 161843)
+++ gcc/builtins.c	(working copy)
@@ -13760,3 +13760,123 @@ set_builtin_user_assembler_name (tree de
       break;
     }
 }
+
+/* Return true if DECL is a builtin that expands to a constant or similarly
+   simple code.  */
+bool
+is_simple_builtin (tree decl)
+{
+  if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
+    switch (DECL_FUNCTION_CODE (decl))
+      {
+	/* Builtins that expand to constants.  */
+      case BUILT_IN_CONSTANT_P:
+      case BUILT_IN_EXPECT:
+      case BUILT_IN_OBJECT_SIZE:
+      case BUILT_IN_UNREACHABLE:
+	/* Simple register moves or loads from stack.  */
+      case BUILT_IN_RETURN_ADDRESS:
+      case BUILT_IN_EXTRACT_RETURN_ADDR:
+      case BUILT_IN_FROB_RETURN_ADDR:
+      case BUILT_IN_RETURN:
+      case BUILT_IN_AGGREGATE_INCOMING_ADDRESS:
+      case BUILT_IN_FRAME_ADDRESS:
+      case BUILT_IN_VA_END:
+      case BUILT_IN_STACK_SAVE:
+      case BUILT_IN_STACK_RESTORE:
+	/* Exception state returns or moves registers around.  */
+      case BUILT_IN_EH_FILTER:
+      case BUILT_IN_EH_POINTER:
+      case BUILT_IN_EH_COPY_VALUES:
+	return true;
+
+      default:
+	return false;
+      }
+
+  return false;
+}
+
+/* Return true if DECL is a builtin that is not expensive, i.e., they are
+   most probably expanded inline into reasonably simple code.  This is a
+   superset of is_simple_builtin.  */
+bool
+is_inexpensive_builtin (tree decl)
+{
+  if (!decl)
+    return false;
+  else if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_MD)
+    return true;
+  else if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
+    switch (DECL_FUNCTION_CODE (decl))
+      {
+      case BUILT_IN_ABS:
+      case BUILT_IN_ALLOCA:
+      case BUILT_IN_BSWAP32:
+      case BUILT_IN_BSWAP64:
+      case BUILT_IN_CLZ:
+      case BUILT_IN_CLZIMAX:
+      case BUILT_IN_CLZL:
+      case BUILT_IN_CLZLL:
+      case BUILT_IN_CTZ:
+      case BUILT_IN_CTZIMAX:
+      case BUILT_IN_CTZL:
+      case BUILT_IN_CTZLL:
+      case BUILT_IN_FFS:
+      case BUILT_IN_FFSIMAX:
+      case BUILT_IN_FFSL:
+      case BUILT_IN_FFSLL:
+      case BUILT_IN_IMAXABS:
+      case BUILT_IN_FINITE:
+      case BUILT_IN_FINITEF:
+      case BUILT_IN_FINITEL:
+      case BUILT_IN_FINITED32:
+      case BUILT_IN_FINITED64:
+      case BUILT_IN_FINITED128:
+      case BUILT_IN_FPCLASSIFY:
+      case BUILT_IN_ISFINITE:
+      case BUILT_IN_ISINF_SIGN:
+      case BUILT_IN_ISINF:
+      case BUILT_IN_ISINFF:
+      case BUILT_IN_ISINFL:
+      case BUILT_IN_ISINFD32:
+      case BUILT_IN_ISINFD64:
+      case BUILT_IN_ISINFD128:
+      case BUILT_IN_ISNAN:
+      case BUILT_IN_ISNANF:
+      case BUILT_IN_ISNANL:
+      case BUILT_IN_ISNAND32:
+      case BUILT_IN_ISNAND64:
+      case BUILT_IN_ISNAND128:
+      case BUILT_IN_ISNORMAL:
+      case BUILT_IN_ISGREATER:
+      case BUILT_IN_ISGREATEREQUAL:
+      case BUILT_IN_ISLESS:
+      case BUILT_IN_ISLESSEQUAL:
+      case BUILT_IN_ISLESSGREATER:
+      case BUILT_IN_ISUNORDERED:
+      case BUILT_IN_VA_ARG_PACK:
+      case BUILT_IN_VA_ARG_PACK_LEN:
+      case BUILT_IN_VA_COPY:
+      case BUILT_IN_TRAP:
+      case BUILT_IN_SAVEREGS:
+      case BUILT_IN_POPCOUNTL:
+      case BUILT_IN_POPCOUNTLL:
+      case BUILT_IN_POPCOUNTIMAX:
+      case BUILT_IN_POPCOUNT:
+      case BUILT_IN_PARITYL:
+      case BUILT_IN_PARITYLL:
+      case BUILT_IN_PARITYIMAX:
+      case BUILT_IN_PARITY:
+      case BUILT_IN_LABS:
+      case BUILT_IN_LLABS:
+      case BUILT_IN_PREFETCH:
+	return true;
+
+      default:
+	return is_simple_builtin (decl);
+      }
+
+  return false;
+}
+
Index: gcc/tree.h
===================================================================
--- gcc/tree.h	(revision 161843)
+++ gcc/tree.h	(working copy)
@@ -5041,6 +5041,8 @@ extern tree build_range_check (location_
 extern bool merge_ranges (int *, tree *, tree *, int, tree, tree, int,
 			  tree, tree);
 extern void set_builtin_user_assembler_name (tree decl, const char *asmspec);
+extern bool is_simple_builtin (tree);
+extern bool is_inexpensive_builtin (tree);
 
 /* In convert.c */
 extern tree strip_float_extensions (tree);
Index: gcc/cfgloopanal.c
===================================================================
--- gcc/cfgloopanal.c	(revision 161843)
+++ gcc/cfgloopanal.c	(working copy)
@@ -320,6 +320,8 @@ seq_cost (const_rtx seq, bool speed)
 /* The properties of the target.  */
 
 unsigned target_avail_regs;	/* Number of available registers.  */
+unsigned target_clobbered_regs; /* Number of available registers that are
+				   call-clobbered.  */
 unsigned target_res_regs;	/* Number of registers reserved for temporary
 				   expressions.  */
 unsigned target_reg_cost[2];	/* The cost for register when there still
@@ -342,10 +344,15 @@ init_set_costs (void)
   unsigned i;
 
   target_avail_regs = 0;
+  target_clobbered_regs = 0;
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     if (TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], i)
 	&& !fixed_regs[i])
-      target_avail_regs++;
+      {
+	target_avail_regs++;
+	if (call_used_regs[i])
+	  target_clobbered_regs++;
+      }
 
   target_res_regs = 3;
 
@@ -379,20 +386,29 @@ init_set_costs (void)
 
 /* Estimates cost of increased register pressure caused by making N_NEW new
    registers live around the loop.  N_OLD is the number of registers live
-   around the loop.  */
+   around the loop.  If CALL_P is true, also take into account that
+   call-used registers may be clobbered in the loop body, reducing the
+   number of available registers before we spill.  */
 
 unsigned
-estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed)
+estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed,
+			    bool call_p)
 {
   unsigned cost;
   unsigned regs_needed = n_new + n_old;
+  unsigned available_regs = target_avail_regs;
+
+  /* If there is a call in the loop body, the call-clobbered registers
+     are not available for loop invariants.  */
+  if (call_p)
+    available_regs = available_regs - target_clobbered_regs;
 
   /* If we have enough registers, we should use them and not restrict
      the transformations unnecessarily.  */
-  if (regs_needed + target_res_regs <= target_avail_regs)
+  if (regs_needed + target_res_regs <= available_regs)
     return 0;
 
-  if (regs_needed <= target_avail_regs)
+  if (regs_needed <= available_regs)
     /* If we are close to running out of registers, try to preserve
        them.  */
     cost = target_reg_cost [speed] * n_new;
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c	(revision 161844)
+++ gcc/tree-ssa-loop-ivopts.c	(working copy)
@@ -260,6 +260,9 @@ struct ivopts_data
 
   /* Are we optimizing for speed?  */
   bool speed;
+
+  /* Whether the loop body includes any function calls.  */
+  bool body_includes_call;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -4431,7 +4434,8 @@ ivopts_global_cost_for_size (struct ivop
 {
   /* We add size to the cost, so that we prefer eliminating ivs
      if possible.  */
-  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed);
+  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed,
+					    data->body_includes_call);
 }
 
 /* For each size of the induction variable set determine the penalty.  */
@@ -4450,6 +4454,7 @@ determine_set_costs (struct ivopts_data 
     {
       fprintf (dump_file, "Global costs:\n");
       fprintf (dump_file, "  target_avail_regs %d\n", target_avail_regs);
+      fprintf (dump_file, "  target_clobbered_regs %d\n", target_clobbered_regs);
       fprintf (dump_file, "  target_reg_cost %d\n", target_reg_cost[data->speed]);
       fprintf (dump_file, "  target_spill_cost %d\n", target_spill_cost[data->speed]);
     }
@@ -5859,6 +5864,25 @@ tree_ssa_iv_optimize_finalize (struct iv
   VEC_free (iv_cand_p, heap, data->iv_candidates);
 }
 
+/* Returns true if the loop body BODY includes any function calls.  */
+
+static bool
+loop_body_includes_call (basic_block *body, unsigned num_nodes)
+{
+  gimple_stmt_iterator gsi;
+  unsigned i;
+
+  for (i = 0; i < num_nodes; i++)
+    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple stmt = gsi_stmt (gsi);
+	if (is_gimple_call (stmt)
+	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
+	  return true;
+      }
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -5890,6 +5914,7 @@ tree_ssa_iv_optimize_loop (struct ivopts
     }
 
   body = get_loop_body (loop);
+  data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
   renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
   free (body);
 
Index: gcc/loop-invariant.c
===================================================================
--- gcc/loop-invariant.c	(revision 161843)
+++ gcc/loop-invariant.c	(working copy)
@@ -1173,11 +1173,13 @@ get_inv_cost (struct invariant *inv, int
 /* Calculates gain for eliminating invariant INV.  REGS_USED is the number
    of registers used in the loop, NEW_REGS is the number of new variables
    already added due to the invariant motion.  The number of registers needed
-   for it is stored in *REGS_NEEDED.  */
+   for it is stored in *REGS_NEEDED.  SPEED and CALL_P are flags passed
+   through to estimate_reg_pressure_cost. */
 
 static int
 gain_for_invariant (struct invariant *inv, unsigned *regs_needed,
-		    unsigned *new_regs, unsigned regs_used, bool speed)
+		    unsigned *new_regs, unsigned regs_used,
+		    bool speed, bool call_p)
 {
   int comp_cost, size_cost;
 
@@ -1188,9 +1190,9 @@ gain_for_invariant (struct invariant *in
   if (! flag_ira_loop_pressure)
     {
       size_cost = (estimate_reg_pressure_cost (new_regs[0] + regs_needed[0],
-					       regs_used, speed)
+					       regs_used, speed, call_p)
 		   - estimate_reg_pressure_cost (new_regs[0],
-						 regs_used, speed));
+						 regs_used, speed, call_p));
     }
   else
     {
@@ -1245,7 +1247,8 @@ gain_for_invariant (struct invariant *in
 
 static int
 best_gain_for_invariant (struct invariant **best, unsigned *regs_needed,
-			 unsigned *new_regs, unsigned regs_used, bool speed)
+			 unsigned *new_regs, unsigned regs_used,
+			 bool speed, bool call_p)
 {
   struct invariant *inv;
   int i, gain = 0, again;
@@ -1261,7 +1264,7 @@ best_gain_for_invariant (struct invarian
 	continue;
 
       again = gain_for_invariant (inv, aregs_needed, new_regs, regs_used,
-      				  speed);
+      				  speed, call_p);
       if (again > gain)
 	{
 	  gain = again;
@@ -1314,7 +1317,7 @@ set_move_mark (unsigned invno, int gain)
 /* Determines which invariants to move.  */
 
 static void
-find_invariants_to_move (bool speed)
+find_invariants_to_move (bool speed, bool call_p)
 {
   int gain;
   unsigned i, regs_used, regs_needed[N_REG_CLASSES], new_regs[N_REG_CLASSES];
@@ -1353,7 +1356,8 @@ find_invariants_to_move (bool speed)
 	new_regs[ira_reg_class_cover[i]] = 0;
     }
   while ((gain = best_gain_for_invariant (&inv, regs_needed,
-					  new_regs, regs_used, speed)) > 0)
+					  new_regs, regs_used,
+					  speed, call_p)) > 0)
     {
       set_move_mark (inv->invno, gain);
       if (! flag_ira_loop_pressure)
@@ -1573,7 +1577,8 @@ move_single_loop_invariants (struct loop
   init_inv_motion_data ();
 
   find_invariants (loop);
-  find_invariants_to_move (optimize_loop_for_speed_p (loop));
+  find_invariants_to_move (optimize_loop_for_speed_p (loop),
+			   LOOP_DATA (loop)->has_call);
   move_invariants (loop);
 
   free_inv_motion_data ();
Index: gcc/cfgloop.h
===================================================================
--- gcc/cfgloop.h	(revision 161843)
+++ gcc/cfgloop.h	(working copy)
@@ -627,13 +627,14 @@ fel_init (loop_iterator *li, loop_p *loo
 /* The properties of the target.  */
 
 extern unsigned target_avail_regs;
+extern unsigned target_clobbered_regs;
 extern unsigned target_res_regs;
 extern unsigned target_reg_cost [2];
 extern unsigned target_spill_cost [2];
 
 /* Register pressure estimation for induction variable optimizations & loop
    invariant motion.  */
-extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool);
+extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool, bool);
 extern void init_set_costs (void);
 
 /* Loop optimizer initialization.  */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]