This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
REVISED PATCH: adjust init_set_costs to account for call_used_regs (PR42505)
- From: Sandra Loosemore <sandra at codesourcery dot com>
- To: Zdenek Dvorak <rakdver at kam dot mff dot cuni dot cz>
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Thu, 08 Jul 2010 19:57:01 -0400
- Subject: REVISED PATCH: adjust init_set_costs to account for call_used_regs (PR42505)
- References: <4C2DD6D9.3000407@codesourcery.com> <20100702140706.GA9954@kam.mff.cuni.cz>
Zdenek Dvorak wrote:
As for this change:
* cfgloopanal.c (init_set_costs): Use call_used_regs rather than
fixed_regs to count number of registers available for loop variables.
Should not we make call_used_regs unavailable only if there is a function call in the
loop?
OK, how about this version instead?
As I noted previously, even when there is no call in the loop, ivopts pretty
clearly seems to be underestimating register pressure -- we need some scratch
registers for loading constants and for holding new local variables introduced
by subsequent CSE passes, so not all of the non-fixed registers (those counted
in target_avail_regs) are going to be available for things ivopts is counting.
(You can see an example of an extra CSE-introduced variable in the original
test case attached to PR42505, for instance.)
So, my patch adds a new constant target_scratch_regs to represent the number of
registers to reserve for this purpose; they can overlap the call-clobbered
registers. The value of 2 I chose is as arbitrary as the existing choice of 3
for target_res_regs, but on ARM 2 is definitely better than 0 in terms of both
code size and speed, and while I found that code size continued to go down a
little as I increased it to 3, at 3 I started seeing performance degradation on
some of the benchmark configurations I was using. Maybe both of these register
availability parameters need to be made target-specific and/or dependent on
speed/size tradeoffs? Anyway, excluding the call-clobbered registers is an
important correction to the costs model and the new target_scratch_regs
parameter at least gives us another hook for tuning things further.
For testing, I bootstrapped and regression-tested this patch on
x86_64-unknown-linux-gnu and did some performance analysis on ARM, as I did for
the rest of the PR42505 changes.
-Sandra
2010-07-08 Sandra Loosemore <sandra@codesourcery.com>
PR middle-end/42505
gcc/
* cfgloopanal.c (target_clobbered_regs, target_scratch_regs): Define.
(init_set_costs): Initialize them.
(estimate_reg_pressure_cost): Add call_p argument. Adjust the
number of available registers to exclude target_scratch_regs and/or
the call-clobbered registers.
* cfgloop.h (target_clobbered_regs, target_scratch_regs): Declare.
(estimate_reg_pressure_cost): Adjust declaration.
* tree-ssa-loop-ivopts.c (struct ivopts_data): Add body_includes_call.
(ivopts_global_cost_for_size): Pass it to estimate_reg_pressure_cost.
(determine_set_costs): Dump target_clobbered_regs.
(loop_body_includes_call): New function.
(tree_ssa_iv_optimize_loop): Use it to initialize new field.
* loop-invariant.c (gain_for_invariant): Adjust arguments to pass
call_p flag through.
(best_gain_for_invariant): Likewise.
(find_invariants_to_move): Likewise.
(move_single_loop_invariants): Likewise, using already-computed
has_call field.
Index: gcc/cfgloopanal.c
===================================================================
--- gcc/cfgloopanal.c (revision 161843)
+++ gcc/cfgloopanal.c (working copy)
@@ -320,8 +320,11 @@ seq_cost (const_rtx seq, bool speed)
/* The properties of the target. */
unsigned target_avail_regs; /* Number of available registers. */
+unsigned target_clobbered_regs; /* Number of available registers that are
+ call-clobbered. */
unsigned target_res_regs; /* Number of registers reserved for temporary
expressions. */
+unsigned target_scratch_regs; /* Number of scratch registers required. */
unsigned target_reg_cost[2]; /* The cost for register when there still
is some reserve, but we are approaching
the number of available registers. */
@@ -342,12 +345,18 @@ init_set_costs (void)
unsigned i;
target_avail_regs = 0;
+ target_clobbered_regs = 0;
for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
if (TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], i)
&& !fixed_regs[i])
- target_avail_regs++;
+ {
+ target_avail_regs++;
+ if (call_used_regs[i])
+ target_clobbered_regs++;
+ }
target_res_regs = 3;
+ target_scratch_regs = 2;
for (speed = 0; speed < 2; speed++)
{
@@ -379,20 +388,34 @@ init_set_costs (void)
/* Estimates cost of increased register pressure caused by making N_NEW new
registers live around the loop. N_OLD is the number of registers live
- around the loop. */
+ around the loop. If CALL_P is true, also take into account that
+ call-used registers may be clobbered in the loop body, reducing the
+ number of available registers before we spill. */
unsigned
-estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed)
+estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed,
+ bool call_p)
{
unsigned cost;
unsigned regs_needed = n_new + n_old;
+ unsigned available_regs = target_avail_regs;
+
+ /* If there is a call in the loop body, the call-clobbered registers
+ are not available for loop invariants. We may also need some scratch
+ registers for loading constants, holding local CSE'ed values, and the
+ like. The call-clobbered registers can be used for this purpose if
+ they are not potentially being used to hold loop variables. */
+ if (call_p && target_clobbered_regs >= target_scratch_regs)
+ available_regs = available_regs - target_clobbered_regs;
+ else
+ available_regs = available_regs - target_scratch_regs;
/* If we have enough registers, we should use them and not restrict
the transformations unnecessarily. */
- if (regs_needed + target_res_regs <= target_avail_regs)
+ if (regs_needed + target_res_regs <= available_regs)
return 0;
- if (regs_needed <= target_avail_regs)
+ if (regs_needed <= available_regs)
/* If we are close to running out of registers, try to preserve
them. */
cost = target_reg_cost [speed] * n_new;
Index: gcc/cfgloop.h
===================================================================
--- gcc/cfgloop.h (revision 161843)
+++ gcc/cfgloop.h (working copy)
@@ -627,13 +627,15 @@ fel_init (loop_iterator *li, loop_p *loo
/* The properties of the target. */
extern unsigned target_avail_regs;
+extern unsigned target_clobbered_regs;
extern unsigned target_res_regs;
+extern unsigned target_scratch_regs;
extern unsigned target_reg_cost [2];
extern unsigned target_spill_cost [2];
/* Register pressure estimation for induction variable optimizations & loop
invariant motion. */
-extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool);
+extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool, bool);
extern void init_set_costs (void);
/* Loop optimizer initialization. */
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c (revision 161844)
+++ gcc/tree-ssa-loop-ivopts.c (working copy)
@@ -260,6 +260,9 @@ struct ivopts_data
/* Are we optimizing for speed? */
bool speed;
+
+ /* Whether the loop body includes any function calls. */
+ bool body_includes_call;
};
/* An assignment of iv candidates to uses. */
@@ -4431,7 +4434,8 @@ ivopts_global_cost_for_size (struct ivop
{
/* We add size to the cost, so that we prefer eliminating ivs
if possible. */
- return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed);
+ return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed,
+ data->body_includes_call);
}
/* For each size of the induction variable set determine the penalty. */
@@ -4450,6 +4454,7 @@ determine_set_costs (struct ivopts_data
{
fprintf (dump_file, "Global costs:\n");
fprintf (dump_file, " target_avail_regs %d\n", target_avail_regs);
+ fprintf (dump_file, " target_clobbered_regs %d\n", target_clobbered_regs);
fprintf (dump_file, " target_reg_cost %d\n", target_reg_cost[data->speed]);
fprintf (dump_file, " target_spill_cost %d\n", target_spill_cost[data->speed]);
}
@@ -5859,6 +5864,21 @@ tree_ssa_iv_optimize_finalize (struct iv
VEC_free (iv_cand_p, heap, data->iv_candidates);
}
+/* Returns true if the loop body BODY includes any function calls. */
+
+static bool
+loop_body_includes_call (basic_block *body, unsigned num_nodes)
+{
+ gimple_stmt_iterator gsi;
+ unsigned i;
+
+ for (i = 0; i < num_nodes; i++)
+ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+ if (is_gimple_call (gsi_stmt (gsi)))
+ return true;
+ return false;
+}
+
/* Optimizes the LOOP. Returns true if anything changed. */
static bool
@@ -5890,6 +5910,7 @@ tree_ssa_iv_optimize_loop (struct ivopts
}
body = get_loop_body (loop);
+ data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
free (body);
Index: gcc/loop-invariant.c
===================================================================
--- gcc/loop-invariant.c (revision 161843)
+++ gcc/loop-invariant.c (working copy)
@@ -1173,11 +1173,13 @@ get_inv_cost (struct invariant *inv, int
/* Calculates gain for eliminating invariant INV. REGS_USED is the number
of registers used in the loop, NEW_REGS is the number of new variables
already added due to the invariant motion. The number of registers needed
- for it is stored in *REGS_NEEDED. */
+ for it is stored in *REGS_NEEDED. SPEED and CALL_P are flags passed
+ through to estimate_reg_pressure_cost. */
static int
gain_for_invariant (struct invariant *inv, unsigned *regs_needed,
- unsigned *new_regs, unsigned regs_used, bool speed)
+ unsigned *new_regs, unsigned regs_used,
+ bool speed, bool call_p)
{
int comp_cost, size_cost;
@@ -1188,9 +1190,9 @@ gain_for_invariant (struct invariant *in
if (! flag_ira_loop_pressure)
{
size_cost = (estimate_reg_pressure_cost (new_regs[0] + regs_needed[0],
- regs_used, speed)
+ regs_used, speed, call_p)
- estimate_reg_pressure_cost (new_regs[0],
- regs_used, speed));
+ regs_used, speed, call_p));
}
else
{
@@ -1245,7 +1247,8 @@ gain_for_invariant (struct invariant *in
static int
best_gain_for_invariant (struct invariant **best, unsigned *regs_needed,
- unsigned *new_regs, unsigned regs_used, bool speed)
+ unsigned *new_regs, unsigned regs_used,
+ bool speed, bool call_p)
{
struct invariant *inv;
int i, gain = 0, again;
@@ -1261,7 +1264,7 @@ best_gain_for_invariant (struct invarian
continue;
again = gain_for_invariant (inv, aregs_needed, new_regs, regs_used,
- speed);
+ speed, call_p);
if (again > gain)
{
gain = again;
@@ -1314,7 +1317,7 @@ set_move_mark (unsigned invno, int gain)
/* Determines which invariants to move. */
static void
-find_invariants_to_move (bool speed)
+find_invariants_to_move (bool speed, bool call_p)
{
int gain;
unsigned i, regs_used, regs_needed[N_REG_CLASSES], new_regs[N_REG_CLASSES];
@@ -1353,7 +1356,8 @@ find_invariants_to_move (bool speed)
new_regs[ira_reg_class_cover[i]] = 0;
}
while ((gain = best_gain_for_invariant (&inv, regs_needed,
- new_regs, regs_used, speed)) > 0)
+ new_regs, regs_used,
+ speed, call_p)) > 0)
{
set_move_mark (inv->invno, gain);
if (! flag_ira_loop_pressure)
@@ -1573,7 +1577,8 @@ move_single_loop_invariants (struct loop
init_inv_motion_data ();
find_invariants (loop);
- find_invariants_to_move (optimize_loop_for_speed_p (loop));
+ find_invariants_to_move (optimize_loop_for_speed_p (loop),
+ LOOP_DATA (loop)->has_call);
move_invariants (loop);
free_inv_motion_data ();