REVISED PATCH: adjust init_set_costs to account for call_used_regs (PR42505)


Zdenek Dvorak wrote:

> As for this change:
>
>     * cfgloopanal.c (init_set_costs): Use call_used_regs rather than
>     fixed_regs to count number of registers available for loop variables.
>
> Should not we make call_used_regs unavailable only if there is a function
> call in the loop?

OK, how about this version instead?


As I noted previously, even when there is no call in the loop, ivopts pretty clearly underestimates register pressure: we need some scratch registers for loading constants and for holding new local variables introduced by subsequent CSE passes, so not all of the non-fixed registers are going to be available for the things ivopts is counting. (You can see an example of an extra CSE-introduced variable in the original test case attached to PR42505, for instance.)
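
To make that more concrete, here is a hypothetical example (not the actual test case from the PR, just an illustration):

/* Hypothetical example: on a RISC target the constant 12345 typically
   gets hoisted out of the loop into a register, and later RTL passes
   may introduce further temporaries, for instance by commoning the
   scaled index used by both memory accesses.  None of those registers
   are visible to ivopts when it counts register pressure.  */
void
f (int *a, int *b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = b[i] + 12345;
}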

So, my patch adds a new constant, target_scratch_regs, representing the number of registers to reserve for this purpose; these may overlap the call-clobbered registers. The value of 2 I chose is as arbitrary as the existing choice of 3 for target_res_regs, but on ARM, 2 is definitely better than 0 in terms of both code size and speed. Code size continued to go down a little as I increased the value to 3, but at 3 I started seeing performance degradation on some of the benchmark configurations I was using. Maybe both of these register-availability parameters need to be made target-specific and/or dependent on speed/size tradeoffs? In any case, excluding the call-clobbered registers is an important correction to the cost model, and the new target_scratch_regs parameter at least gives us another hook for tuning things further.
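
To spell out what that does to the cost model, here is a simplified sketch of the logic the patch adds to estimate_reg_pressure_cost (the helper name is just for illustration; the globals are the ones declared in cfgloop.h):

/* Simplified sketch, not the exact patch text: compute the number of
   registers estimate_reg_pressure_cost treats as available.  */
static unsigned
available_regs_for_pressure (bool call_p)
{
  unsigned available_regs = target_avail_regs;

  if (call_p && target_clobbered_regs >= target_scratch_regs)
    /* A call in the loop clobbers the call-used registers anyway, and
       they can double as the scratch registers, so excluding them
       covers both needs.  */
    available_regs -= target_clobbered_regs;
  else
    /* No call in the loop (or fewer call-clobbered registers than
       scratch registers): just set aside the scratch registers.  */
    available_regs -= target_scratch_regs;

  return available_regs;
}

With made-up numbers (target_avail_regs == 14, target_clobbered_regs == 9, target_scratch_regs == 2, target_res_regs == 3), a loop containing a call starts paying register-pressure costs once more than (14 - 9) - 3 = 2 registers are live around it, versus (14 - 2) - 3 = 9 for a loop without a call.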

For testing, I bootstrapped and regression-tested this patch on x86_64-unknown-linux-gnu and did some performance analysis on ARM, as I did for the rest of the PR42505 changes.

-Sandra

2010-07-08  Sandra Loosemore  <sandra@codesourcery.com>

	PR middle-end/42505

	gcc/
	* cfgloopanal.c (target_clobbered_regs, target_scratch_regs): Define.
	(init_set_costs): Initialize them.
	(estimate_reg_pressure_cost): Add call_p argument.  Adjust the
	number of available registers to exclude target_scratch_regs and/or
	the call-clobbered registers.
	* cfgloop.h (target_clobbered_regs, target_scratch_regs): Declare.
	(estimate_reg_pressure_cost): Adjust declaration.
	* tree-ssa-loop-ivopts.c (struct ivopts_data): Add body_includes_call.
	(ivopts_global_cost_for_size): Pass it to estimate_reg_pressure_cost.
	(determine_set_costs): Dump target_clobbered_regs.
	(loop_body_includes_call): New function.
	(tree_ssa_iv_optimize_loop): Use it to initialize new field.
	* loop-invariant.c (gain_for_invariant): Adjust arguments to pass
	call_p flag through.
	(best_gain_for_invariant): Likewise.
	(find_invariants_to_move): Likewise.
	(move_single_loop_invariants): Likewise, using already-computed
	has_call field.


Index: gcc/cfgloopanal.c
===================================================================
--- gcc/cfgloopanal.c	(revision 161843)
+++ gcc/cfgloopanal.c	(working copy)
@@ -320,8 +320,11 @@ seq_cost (const_rtx seq, bool speed)
 /* The properties of the target.  */
 
 unsigned target_avail_regs;	/* Number of available registers.  */
+unsigned target_clobbered_regs; /* Number of available registers that are
+				   call-clobbered.  */
 unsigned target_res_regs;	/* Number of registers reserved for temporary
 				   expressions.  */
+unsigned target_scratch_regs;   /* Number of scratch registers required.  */
 unsigned target_reg_cost[2];	/* The cost for register when there still
 				   is some reserve, but we are approaching
 				   the number of available registers.  */
@@ -342,12 +345,18 @@ init_set_costs (void)
   unsigned i;
 
   target_avail_regs = 0;
+  target_clobbered_regs = 0;
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     if (TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], i)
 	&& !fixed_regs[i])
-      target_avail_regs++;
+      {
+	target_avail_regs++;
+	if (call_used_regs[i])
+	  target_clobbered_regs++;
+      }
 
   target_res_regs = 3;
+  target_scratch_regs = 2;
 
   for (speed = 0; speed < 2; speed++)
      {
@@ -379,20 +388,34 @@ init_set_costs (void)
 
 /* Estimates cost of increased register pressure caused by making N_NEW new
    registers live around the loop.  N_OLD is the number of registers live
-   around the loop.  */
+   around the loop.  If CALL_P is true, also take into account that
+   call-used registers may be clobbered in the loop body, reducing the
+   number of available registers before we spill.  */
 
 unsigned
-estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed)
+estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed,
+			    bool call_p)
 {
   unsigned cost;
   unsigned regs_needed = n_new + n_old;
+  unsigned available_regs = target_avail_regs;
+
+  /* If there is a call in the loop body, the call-clobbered registers
+     are not available for loop invariants.  We may also need some scratch
+     registers for loading constants, holding local CSE'ed values, and the
+     like.  The call-clobbered registers can be used for this purpose if
+     they are not potentially being used to hold loop variables.  */
+  if (call_p && target_clobbered_regs >= target_scratch_regs)
+    available_regs = available_regs - target_clobbered_regs;
+  else
+    available_regs = available_regs - target_scratch_regs;
 
   /* If we have enough registers, we should use them and not restrict
      the transformations unnecessarily.  */
-  if (regs_needed + target_res_regs <= target_avail_regs)
+  if (regs_needed + target_res_regs <= available_regs)
     return 0;
 
-  if (regs_needed <= target_avail_regs)
+  if (regs_needed <= available_regs)
     /* If we are close to running out of registers, try to preserve
        them.  */
     cost = target_reg_cost [speed] * n_new;
Index: gcc/cfgloop.h
===================================================================
--- gcc/cfgloop.h	(revision 161843)
+++ gcc/cfgloop.h	(working copy)
@@ -627,13 +627,15 @@ fel_init (loop_iterator *li, loop_p *loo
 /* The properties of the target.  */
 
 extern unsigned target_avail_regs;
+extern unsigned target_clobbered_regs;
 extern unsigned target_res_regs;
+extern unsigned target_scratch_regs;
 extern unsigned target_reg_cost [2];
 extern unsigned target_spill_cost [2];
 
 /* Register pressure estimation for induction variable optimizations & loop
    invariant motion.  */
-extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool);
+extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool, bool);
 extern void init_set_costs (void);
 
 /* Loop optimizer initialization.  */
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c	(revision 161844)
+++ gcc/tree-ssa-loop-ivopts.c	(working copy)
@@ -260,6 +260,9 @@ struct ivopts_data
 
   /* Are we optimizing for speed?  */
   bool speed;
+
+  /* Whether the loop body includes any function calls.  */
+  bool body_includes_call;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -4431,7 +4434,8 @@ ivopts_global_cost_for_size (struct ivop
 {
   /* We add size to the cost, so that we prefer eliminating ivs
      if possible.  */
-  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed);
+  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed,
+					    data->body_includes_call);
 }
 
 /* For each size of the induction variable set determine the penalty.  */
@@ -4450,6 +4454,7 @@ determine_set_costs (struct ivopts_data 
     {
       fprintf (dump_file, "Global costs:\n");
       fprintf (dump_file, "  target_avail_regs %d\n", target_avail_regs);
+      fprintf (dump_file, "  target_clobbered_regs %d\n", target_clobbered_regs);
       fprintf (dump_file, "  target_reg_cost %d\n", target_reg_cost[data->speed]);
       fprintf (dump_file, "  target_spill_cost %d\n", target_spill_cost[data->speed]);
     }
@@ -5859,6 +5864,21 @@ tree_ssa_iv_optimize_finalize (struct iv
   VEC_free (iv_cand_p, heap, data->iv_candidates);
 }
 
+/* Returns true if the loop body BODY includes any function calls.  */
+
+static bool
+loop_body_includes_call (basic_block *body, unsigned num_nodes)
+{
+  gimple_stmt_iterator gsi;
+  unsigned i;
+
+  for (i = 0; i < num_nodes; i++)
+    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+      if (is_gimple_call (gsi_stmt (gsi)))
+	return true;
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -5890,6 +5910,7 @@ tree_ssa_iv_optimize_loop (struct ivopts
     }
 
   body = get_loop_body (loop);
+  data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
   renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
   free (body);
 
Index: gcc/loop-invariant.c
===================================================================
--- gcc/loop-invariant.c	(revision 161843)
+++ gcc/loop-invariant.c	(working copy)
@@ -1173,11 +1173,13 @@ get_inv_cost (struct invariant *inv, int
 /* Calculates gain for eliminating invariant INV.  REGS_USED is the number
    of registers used in the loop, NEW_REGS is the number of new variables
    already added due to the invariant motion.  The number of registers needed
-   for it is stored in *REGS_NEEDED.  */
+   for it is stored in *REGS_NEEDED.  SPEED and CALL_P are flags passed
+   through to estimate_reg_pressure_cost. */
 
 static int
 gain_for_invariant (struct invariant *inv, unsigned *regs_needed,
-		    unsigned *new_regs, unsigned regs_used, bool speed)
+		    unsigned *new_regs, unsigned regs_used,
+		    bool speed, bool call_p)
 {
   int comp_cost, size_cost;
 
@@ -1188,9 +1190,9 @@ gain_for_invariant (struct invariant *in
   if (! flag_ira_loop_pressure)
     {
       size_cost = (estimate_reg_pressure_cost (new_regs[0] + regs_needed[0],
-					       regs_used, speed)
+					       regs_used, speed, call_p)
 		   - estimate_reg_pressure_cost (new_regs[0],
-						 regs_used, speed));
+						 regs_used, speed, call_p));
     }
   else
     {
@@ -1245,7 +1247,8 @@ gain_for_invariant (struct invariant *in
 
 static int
 best_gain_for_invariant (struct invariant **best, unsigned *regs_needed,
-			 unsigned *new_regs, unsigned regs_used, bool speed)
+			 unsigned *new_regs, unsigned regs_used,
+			 bool speed, bool call_p)
 {
   struct invariant *inv;
   int i, gain = 0, again;
@@ -1261,7 +1264,7 @@ best_gain_for_invariant (struct invarian
 	continue;
 
       again = gain_for_invariant (inv, aregs_needed, new_regs, regs_used,
-      				  speed);
+      				  speed, call_p);
       if (again > gain)
 	{
 	  gain = again;
@@ -1314,7 +1317,7 @@ set_move_mark (unsigned invno, int gain)
 /* Determines which invariants to move.  */
 
 static void
-find_invariants_to_move (bool speed)
+find_invariants_to_move (bool speed, bool call_p)
 {
   int gain;
   unsigned i, regs_used, regs_needed[N_REG_CLASSES], new_regs[N_REG_CLASSES];
@@ -1353,7 +1356,8 @@ find_invariants_to_move (bool speed)
 	new_regs[ira_reg_class_cover[i]] = 0;
     }
   while ((gain = best_gain_for_invariant (&inv, regs_needed,
-					  new_regs, regs_used, speed)) > 0)
+					  new_regs, regs_used,
+					  speed, call_p)) > 0)
     {
       set_move_mark (inv->invno, gain);
       if (! flag_ira_loop_pressure)
@@ -1573,7 +1577,8 @@ move_single_loop_invariants (struct loop
   init_inv_motion_data ();
 
   find_invariants (loop);
-  find_invariants_to_move (optimize_loop_for_speed_p (loop));
+  find_invariants_to_move (optimize_loop_for_speed_p (loop),
+			   LOOP_DATA (loop)->has_call);
   move_invariants (loop);
 
   free_inv_motion_data ();
