This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
More sensible unrolling for low iteration counts (work in progress)
- From: Joern Rennecke <joern dot rennecke at superh dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 23 Sep 2004 13:29:46 +0100 (BST)
- Subject: More sensible unrolling for low iteration counts (work in progress)
--- cfgloop.h@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0 Mon Mar 22 12:42:52 2004
+++ cfgloop.h Thu Mar 25 20:31:01 2004
@@ -27,6 +27,7 @@ enum lpt_dec
LPT_PEEL_SIMPLE,
LPT_UNROLL_CONSTANT,
LPT_UNROLL_RUNTIME,
+ LPT_UNROLL_MOP_UP,
LPT_UNROLL_STUPID
};
@@ -307,6 +308,7 @@ extern bool simple_loop_p (struct loop *
extern rtx count_loop_iterations (struct loop_desc *, rtx, rtx);
extern bool just_once_each_iteration_p (struct loop *, basic_block);
extern unsigned expected_loop_iterations (const struct loop *);
+extern unsigned expected_loop_iterations_1 (const struct loop *, int);
/* Loop manipulation. */
extern bool can_duplicate_loop_p (struct loop *loop);
@@ -318,6 +320,7 @@ extern int duplicate_loop_to_header_edge
unsigned, sbitmap, edge, edge *,
unsigned *, int);
extern struct loop *loopify (struct loops *, edge, edge, basic_block);
+extern struct loop *do_loopify (struct loops *, edge);
extern void unloop (struct loops *, struct loop *);
extern bool remove_path (struct loops *, edge);
extern edge split_loop_bb (basic_block, rtx);
--- cfgloopanal.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0 Mon Mar 22 12:42:55 2004
+++ cfgloopanal.c Mon Mar 22 12:42:55 2004
@@ -1381,6 +1381,13 @@ average_num_loop_insns (struct loop *loo
unsigned
expected_loop_iterations (const struct loop *loop)
{
+ return expected_loop_iterations_1 (loop, 1);
+}
+
+/* Likewise, but return result multiplied by SCALE. */
+unsigned
+expected_loop_iterations_1 (const struct loop *loop, int scale)
+{
edge e;
if (loop->header->count)
@@ -1399,14 +1406,16 @@ expected_loop_iterations (const struct l
if (count_in == 0)
return 0;
- expected = (count_latch + count_in - 1) / count_in;
+ expected = (count_latch + count_in) / count_in;
+ expected = (((count_latch + count_in - expected * count_in)
+ * scale / count_in) + expected * scale);
/* Avoid overflows. */
return (expected > REG_BR_PROB_BASE ? REG_BR_PROB_BASE : expected);
}
else
{
- int freq_in, freq_latch;
+ int freq_in, freq_latch, expected;
freq_in = 0;
freq_latch = 0;
@@ -1420,7 +1429,10 @@ expected_loop_iterations (const struct l
if (freq_in == 0)
return 0;
- return (freq_latch + freq_in - 1) / freq_in;
+ expected = (freq_latch + freq_in) / freq_in;
+ expected = ((freq_latch + freq_in - expected) * scale / freq_in
+ + expected * scale);
+ return expected;
}
}
--- cfgloopmanip.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0 Wed Mar 24 17:24:08 2004
+++ cfgloopmanip.c Fri Mar 26 15:38:37 2004
@@ -40,7 +40,7 @@ static void remove_bbs (basic_block *, i
static bool rpe_enum_p (basic_block, void *);
static int find_path (edge, basic_block **);
static bool alp_enum_p (basic_block, void *);
-static void add_loop (struct loops *, struct loop *);
+static void add_loop (struct loops *, struct loop *, int);
static void fix_loop_placements (struct loop *);
static bool fix_bb_placement (struct loops *, basic_block);
static void fix_bb_placements (struct loops *, basic_block);
@@ -432,7 +432,7 @@ alp_enum_p (basic_block bb, void *alp_he
/* Given LOOP structure with filled header and latch, find the body of the
corresponding loop and add it to LOOPS tree. */
static void
-add_loop (struct loops *loops, struct loop *loop)
+add_loop (struct loops *loops, struct loop *loop, int update_outer_loops_p)
{
basic_block *bbs;
int i, n;
@@ -445,10 +445,18 @@ add_loop (struct loops *loops, struct lo
bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
n = dfs_enumerate_from (loop->latch, 1, alp_enum_p,
bbs, n_basic_blocks, loop->header);
+ bbs[n++] = loop->header;
- for (i = 0; i < n; i++)
- add_bb_to_loop (bbs[i], loop);
- add_bb_to_loop (loop->header, loop);
+ if (update_outer_loops_p)
+ for (i = 0; i < n; i++)
+ add_bb_to_loop (bbs[i], loop);
+ else
+ for (i = 0; i < n; i++)
+ {
+ bbs[i]->loop_father = loop;
+ bbs[i]->loop_depth = loop->depth;
+ loop->num_nodes++;
+ }
free (bbs);
}
@@ -525,7 +533,7 @@ loopify (struct loops *loops, edge latch
set_immediate_dominator (CDI_DOMINATORS, succ_bb, switch_bb);
/* Compute new loop. */
- add_loop (loops, loop);
+ add_loop (loops, loop, 1);
flow_loop_tree_node_add (outer, loop);
/* Add switch_bb to appropriate loop. */
@@ -571,6 +579,24 @@ loopify (struct loops *loops, edge latch
return loop;
}
+
+struct loop *
+do_loopify (struct loops *loops, edge latch_edge)
+{
+ basic_block header = latch_edge->dest;
+ struct loop *loop = xcalloc (1, sizeof (struct loop));
+ struct loop *outer = header->loop_father;
+
+ loop->header = header;
+
+ loop->latch = loop_split_edge_with (latch_edge, NULL_RTX);
+
+ /* Compute new loop. */
+ flow_loop_tree_node_add (outer, loop);
+ add_loop (loops, loop, 0);
+
+ return loop;
+}
/* Remove the latch edge of a LOOP and update LOOPS tree to indicate that
the LOOP was removed. After this function, original loop latch will
--- loop-unroll.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0 Mon Mar 22 12:43:00 2004
+++ loop-unroll.c Fri Mar 26 16:52:05 2004
@@ -81,6 +81,7 @@ static void peel_loop_completely (struct
static void unroll_loop_stupid (struct loops *, struct loop *);
static void unroll_loop_constant_iterations (struct loops *, struct loop *);
static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
+static void unroll_loop_runtime_iterations_mop_up (struct loops *, struct loop *);
static void expand_bct (edge, int);
static bool discard_increment (struct loop *);
@@ -130,6 +131,9 @@ unroll_and_peel_loops (struct loops *loo
case LPT_UNROLL_RUNTIME:
unroll_loop_runtime_iterations (loops, loop);
break;
+ case LPT_UNROLL_MOP_UP:
+ unroll_loop_runtime_iterations_mop_up (loops, loop);
+ break;
case LPT_UNROLL_STUPID:
unroll_loop_stupid (loops, loop);
break;
@@ -696,7 +700,7 @@ unroll_loop_constant_iterations (struct
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
- unsigned nunroll, nunroll_by_av, i;
+ unsigned nunroll, nunroll_by_av, max_nunroll, i;
if (!(flags & UAP_UNROLL))
{
@@ -713,6 +717,7 @@ decide_unroll_runtime_iterations (struct
nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
if (nunroll > nunroll_by_av)
nunroll = nunroll_by_av;
+ max_nunroll = nunroll;
if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
@@ -749,8 +754,36 @@ decide_unroll_runtime_iterations (struct
/* If we have profile feedback, check whether the loop rolls. */
if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
{
- if (rtl_dump_file)
- fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
+ int niter_scaled = (expected_loop_iterations_1 (loop, 16));
+ int niter = (niter_scaled) + 8 / 16;
+ HOST_WIDE_INT max_increment_factor;
+ if (niter < 3 || GET_CODE (loop->desc.stride) != CONST_INT)
+ {
+ if (rtl_dump_file)
+ fprintf (rtl_dump_file,
+ ";; Not unrolling loop, doesn't roll enough\n");
+ return;
+ }
+ max_increment_factor = INTVAL (loop->desc.stride);
+ max_increment_factor =
+ ((HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)~0 >> 1)
+ / (max_increment_factor > 0
+ ? max_increment_factor : -max_increment_factor));
+ /* We are considering unrolling so that the loop rolls I times, using
+ NUNROLL loop copies, followed a switch with NUNROLL-1 cases, each
+ taking 1 .. NUNROLL-1 loop copies (all separate to allow maximum
+ scheduling). Thus, the switch needs NUNROLL * (NUNROLL - 1) / 2
+ loop copies, and we have a total of NUNROLL * (NUNROLL) + 1 / 2
+ loop copies. */
+ for (i = 1;
+ nunroll = niter / i,
+ (nunroll * (nunroll + 1) > max_nunroll
+ || nunroll - 1 > (unsigned) max_increment_factor);
+ i++)
+ if (nunroll <= 1)
+ return;
+ loop->lpt_decision.decision = LPT_UNROLL_MOP_UP;
+ loop->lpt_decision.times = nunroll;
return;
}
@@ -1030,6 +1063,228 @@ unroll_loop_runtime_iterations (struct l
/* Remove the edges. */
for (i = 0; i < n_remove_edges; i++)
+ remove_path (loops, remove_edges[i]);
+ free (remove_edges);
+
+ if (rtl_dump_file)
+ fprintf (rtl_dump_file,
+ ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
+ max_unroll, num_loop_insns (loop));
+}
+
+/* Unroll LOOP for that we are able to count number of iterations in runtime
+ LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
+ extra care for case (n - 3) > n):
+
+ for (i = 0; i < n; i++)
+ body;
+
+ ==>
+
+ i = 0;
+
+ while (i < n - 3)
+ {
+ body; i++;
+ body; i++;
+ body; i++;
+ body; i++;
+ }
+ switch (n - i)
+ {
+ case 3:
+ body; i++; body; i++; body; i++; break;
+ case 2:
+ body; i++; body; i++; break;
+ case 1:
+ body; i++;
+ case 0: ;
+ }
+
+ */
+static void
+unroll_loop_runtime_iterations_mop_up (struct loops *loops, struct loop *loop)
+{
+ rtx init_code, label;
+ unsigned i, j;
+ basic_block preheader, *body, *dom_bbs, new_header, switch_header;
+ basic_block simple_preheader, bound_check_bb = NULL;
+ basic_block unrolled_end_bb, new_unrolled_end_bb, after_loop;
+ rtx unrolled_end, new_unrolled_end, vtop_code;
+ unsigned n_dom_bbs;
+ sbitmap wont_exit;
+ int may_exit_copy;
+ unsigned n_peel, n_remove_edges;
+ edge *remove_edges, unrolled_exit_edge, new_back_edge, e, unrolled_end_edge;
+ bool extra_zero_check, last_may_exit;
+ unsigned max_unroll = loop->lpt_decision.times;
+ struct loop_desc *desc = &loop->desc;
+ HOST_WIDE_INT adjust;
+ rtx new_lim, skip_unrolled;
+ int mop_up_unrolled = 0;
+
+ /* Remember blocks whose dominators will have to be updated. */
+ dom_bbs = xcalloc (n_basic_blocks + 3, sizeof (basic_block));
+ n_dom_bbs = 0;
+
+ body = get_loop_body (loop);
+ for (i = 0; i < loop->num_nodes; i++)
+ {
+ unsigned nldom;
+ basic_block *ldom;
+
+ nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
+ for (j = 0; j < nldom; j++)
+ if (!flow_bb_inside_loop_p (loop, ldom[j]))
+ dom_bbs[n_dom_bbs++] = ldom[j];
+
+ free (ldom);
+ }
+ free (body);
+
+ if (desc->postincr)
+ {
+ /* Leave exit in first copy (for explanation why see comment in
+ unroll_loop_constant_iterations). */
+ may_exit_copy = 0;
+ n_peel = max_unroll - 1;
+ extra_zero_check = true;
+ last_may_exit = false;
+ }
+ else
+ {
+ /* Leave exit in last copy (for explanation why see comment in
+ unroll_loop_constant_iterations). */
+ may_exit_copy = max_unroll;
+ n_peel = max_unroll;
+ extra_zero_check = false;
+ last_may_exit = true;
+ }
+
+/* FIXME */
+ /* Compute adjusted loop bound. */
+ adjust = (max_unroll - 1) * INTVAL (desc->stride);
+ skip_unrolled = gen_label_rtx ();
+ if (GET_CODE (desc->lim) == CONST_INT)
+ {
+ /* This can't overflow, because then the unrolled loop would never
+ be executed, but we have selected the unroll factor so that
+ it will be at least sometimes executed. */
+ new_lim = GEN_INT (INTVAL (desc->lim) - adjust);
+ }
+ else
+ {
+ new_lim = expand_simple_binop (GET_MODE (desc->var), MINUS,
+ desc->lim, GEN_INT (adjust),
+ NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ /* Skip unrolled loop if loop bound adjustment overflows. */
+ start_sequence ();
+ emit_cmp_and_jump_insns (new_lim, desc->lim,
+ INTVAL (desc->stride) > 0 ? GT : LT,
+ NULL_RTX, GET_MODE (desc->var),
+ desc->cond != signed_condition (desc->cond),
+ skip_unrolled);
+ init_code = get_insns ();
+ end_sequence ();
+ dom_bbs[n_dom_bbs++] = bound_check_bb =
+ loop_split_edge_with (loop_preheader_edge (loop), init_code);
+ }
+ start_sequence ();
+ /* Skip unrolled loop if we don't have at least one unrolled iteration.
+ ??? We should find and delete any previously copied loop exit test. */
+ emit_cmp_and_jump_insns (desc->var, new_lim,
+ (desc->neg
+ ? reverse_condition (desc->cond) : desc->cond),
+ NULL_RTX, GET_MODE (desc->var),
+ desc->cond != signed_condition (desc->cond),
+ skip_unrolled);
+ init_code = get_insns ();
+ end_sequence ();
+ preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
+ /* The following call needs to be done before the jump is inserted into
+ PREHEADER, to avoid deleting the jump. */
+ simple_preheader = loop_split_edge_with (loop_preheader_edge (loop),
+ NULL_RTX);
+ emit_insn_after (init_code, BB_HEAD (preheader));
+ remove_edges = xcalloc (max_unroll, sizeof (edge));
+ n_remove_edges = 0;
+ wont_exit = sbitmap_alloc (max_unroll + 1);
+ sbitmap_ones (wont_exit);
+ if (! mop_up_unrolled)
+ RESET_BIT (wont_exit, 0);
+ if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
+ loops, max_unroll,
+ wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
+ 0))
+ abort ();
+ new_header = simple_preheader->succ->dest;
+ unrolled_exit_edge = remove_edges[--n_remove_edges];
+ unrolled_end_bb = unrolled_exit_edge->src;
+ set_immediate_dominator (CDI_DOMINATORS, new_header, simple_preheader);
+ after_loop = unrolled_exit_edge->dest;
+ unrolled_end = BB_END (unrolled_end_bb);
+ if (!onlyjump_p (unrolled_end))
+ abort ();
+ start_sequence ();
+ label = block_label (new_header);
+ emit_cmp_and_jump_insns (desc->var, new_lim,
+ (desc->neg
+ ? reverse_condition (desc->cond) : desc->cond),
+ NULL_RTX, GET_MODE (desc->var),
+ desc->cond != signed_condition (desc->cond),
+ label);
+ JUMP_LABEL (get_last_insn ()) = label;
+ LABEL_NUSES (label)++;
+ new_unrolled_end = get_last_insn ();
+ emit_label (skip_unrolled);
+ label = block_label (after_loop);
+ emit_cmp_and_jump_insns (desc->var, desc->lim,
+ (desc->neg
+ ? reverse_condition (desc->cond) : desc->cond),
+ NULL_RTX, GET_MODE (desc->var),
+ desc->cond != signed_condition (desc->cond),
+ label);
+ JUMP_LABEL (get_last_insn ()) = label;
+ LABEL_NUSES (label)++;
+ vtop_code = get_insns ();
+ end_sequence ();
+ emit_insn_after (vtop_code, unrolled_end);
+ delete_insn (unrolled_end);
+ unrolled_end_edge = split_block (unrolled_end_bb, new_unrolled_end);
+ new_unrolled_end_bb = unrolled_end_edge->dest;
+ add_bb_to_loop (new_unrolled_end_bb, unrolled_end_bb->loop_father);
+ add_to_dominance_info (CDI_DOMINATORS, new_unrolled_end_bb);
+ set_immediate_dominator (CDI_DOMINATORS, new_unrolled_end_bb,
+ unrolled_end_bb);
+ dom_bbs[n_dom_bbs++] = new_unrolled_end_bb->succ->dest;
+ dom_bbs[n_dom_bbs++] = new_unrolled_end_bb->succ->succ_next->dest;
+ if (new_unrolled_end_bb->succ->succ_next->succ_next)
+ abort ();
+ new_back_edge
+ = unchecked_make_edge (unrolled_end_bb, new_header,
+ unrolled_end_edge->flags & EDGE_IRREDUCIBLE_LOOP);
+ switch_header = new_back_edge->dest;
+ e = make_edge (preheader, switch_header,
+ switch_header->succ->flags & EDGE_IRREDUCIBLE_LOOP);
+ set_immediate_dominator (CDI_DOMINATORS, switch_header, unrolled_end_bb);
+ if (bound_check_bb)
+ {
+ e = make_edge (bound_check_bb, switch_header,
+ switch_header->succ->flags & EDGE_IRREDUCIBLE_LOOP);
+ set_immediate_dominator (CDI_DOMINATORS, switch_header, bound_check_bb);
+ }
+ /* FIXME: update probabilities. */
+ /* FIXME: update frequencies. */
+
+ free (wont_exit);
+
+ do_loopify (loops, new_back_edge);
+
+ /* Recompute dominators for outer blocks. */
+ iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);
+
+ /* Remove the edges. */
+ for (i = 0; i < n_remove_edges; i++)
remove_path (loops, remove_edges[i]);
free (remove_edges);