This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: New loop unroller broken?
- From: Mircea Namolaru <NAMOLARU at il dot ibm dot com>
- To: Zdenek Dvorak <rakdver at atrey dot karlin dot mff dot cuni dot cz>
- Cc: Dale Johannesen <dalej at apple dot com>, David Edelsohn <dje at watson dot ibm dot com>, gcc at gcc dot gnu dot org, Andrew Pinski <pinskia at physics dot uc dot edu>, Ulrich Weigand <weigand at i1 dot informatik dot uni-erlangen dot de>
- Date: Thu, 29 Jan 2004 13:53:48 +0200
- Subject: Re: New loop unroller broken?
We modified our patch following Zdenek's comments (thanks!).
This patch gains more than 4% improvement overall on CFSPEC2000 on Power4,
with 3 benchmarks showing around 10% improvement (wupwise, swim and art).
OK to submit for GCC 3.4, after bootstrapping and regtesting etc.?
gdiff -c3t loop-unroll.c.20031210 loop-unroll.c
*** loop-unroll.c.20031210 Fri Jan 23 14:30:56 2004
--- loop-unroll.c Thu Jan 29 12:21:51 2004
***************
*** 31,36 ****
--- 31,39 ----
#include "output.h"
#include "expr.h"
+ /* We need to use the macro exact_log2. */
+ #include "toplev.h"
+
/* This pass performs loop unrolling and peeling. We only perform these
optimizations on innermost loops (with single exception) because
the impact on performance is greatest here, and we want to avoid
***************
*** 82,87 ****
--- 85,96 ----
static void unroll_loop_constant_iterations (struct loops *, struct loop
*);
static void unroll_loop_runtime_iterations (struct loops *, struct loop
*);
+ bool is_bct_cond (rtx);
+ rtx get_var_set_from_bct (rtx);
+ void expand_bct (edge, int);
+ bool single_reg_use (struct loop *);
+
+
/* Unroll and/or peel (depending on FLAGS) LOOPS. */
void
unroll_and_peel_loops (struct loops *loops, int flags)
***************
*** 403,409 ****
unsigned n_remove_edges, i;
edge *remove_edges;
struct loop_desc *desc = &loop->desc;
!
npeel = desc->niter;
if (npeel)
--- 412,418 ----
unsigned n_remove_edges, i;
edge *remove_edges;
struct loop_desc *desc = &loop->desc;
! bool discard_inc = single_reg_use (loop);
npeel = desc->niter;
if (npeel)
***************
*** 425,436 ****
--- 434,454 ----
free (wont_exit);
+ /* Expand the branch and count. */
+ if (is_bct_cond(loop->desc.out_edge->src->end))
+ for (i = 0; i < n_remove_edges; i++)
+ expand_bct (remove_edges[i], discard_inc);
+
/* Remove the exit edges. */
for (i = 0; i < n_remove_edges; i++)
remove_path (loops, remove_edges[i]);
free (remove_edges);
}
+ /* Expand the branch and count. */
+ if (is_bct_cond(loop->desc.out_edge->src->end))
+ expand_bct (desc->in_edge, discard_inc);
+
/* Now remove the unreachable part of the last iteration and cancel
the loop. */
remove_path (loops, desc->in_edge);
***************
*** 560,565 ****
--- 578,584 ----
edge *remove_edges;
unsigned max_unroll = loop->lpt_decision.times;
struct loop_desc *desc = &loop->desc;
+ bool discard_inc = single_reg_use (loop);
niter = desc->niter;
***************
*** 574,579 ****
--- 593,623 ----
remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
n_remove_edges = 0;
+ /* For a loop ending with a branch and count for which the increment
+ of the count register will be discarded, adjust the initialization
of
+ the count register. */
+ if (is_bct_cond(desc->out_edge->src->end)
+ && discard_inc)
+ {
+ rtx ini_var;
+
+ rtx init_code;
+ int n_peel, new_bct_value;
+
+ /* Get expression for number of iterations. */
+ start_sequence ();
+
+ n_peel = (niter+1) % (max_unroll+1);
+ new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ;
+ ini_var = GEN_INT (new_bct_value);
+
+ emit_move_insn (desc->var, ini_var);
+ init_code = get_insns ();
+ end_sequence ();
+
+ loop_split_edge_with (loop_preheader_edge (loop), init_code,
loops);
+ }
+
if (desc->postincr)
{
/* Counter is incremented after the exit test; leave exit test
***************
*** 637,642 ****
--- 681,691 ----
free (wont_exit);
+ /* Expand the branch and count. */
+ if (is_bct_cond(loop->desc.out_edge->src->end))
+ for (i = 0; i < n_remove_edges; i++)
+ expand_bct (remove_edges[i], discard_inc);
+
/* Remove the edges. */
for (i = 0; i < n_remove_edges; i++)
remove_path (loops, remove_edges[i]);
***************
*** 710,715 ****
--- 759,765 ----
return;
}
+
/* Success; now force nunroll to be power of 2, as we are unable to
cope with overflows in computation of number of iterations. */
for (i = 1; 2 * i <= nunroll; i *= 2);
***************
*** 763,768 ****
--- 813,819 ----
bool extra_zero_check, last_may_exit;
unsigned max_unroll = loop->lpt_decision.times;
struct loop_desc *desc = &loop->desc;
+ bool discard_inc = single_reg_use (loop);
/* Remember blocks whose dominators will have to be updated. */
dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
***************
*** 817,822 ****
--- 868,902 ----
GEN_INT (max_unroll),
NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ /* For a loop ending with a branch and count for which the increment
+ of the count register will be discarded, adjust the initialization
of
+ the count register. */
+ if (is_bct_cond(desc->out_edge->src->end)
+ && discard_inc)
+ {
+ rtx count, count2,count_unroll_mod;
+ int count_unroll;
+
+ /* start_sequence (); */
+
+ count = count_loop_iterations (desc, NULL, NULL);
+
+ count_unroll = loop->lpt_decision.times+1;
+ /* count = expand_simple_binop (GET_MODE (desc->var), DIV,
+ count, count_unroll,
+ 0, 0, OPTAB_LIB_WIDEN); */
+ count_unroll_mod = GEN_INT(exact_log2(count_unroll));
+ count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT,
+ count, count_unroll_mod,
+ 0, 0, OPTAB_LIB_WIDEN);
+
+ count2 = expand_simple_binop (GET_MODE (desc->var), PLUS,
+ count, GEN_INT (2),
+ 0, 0, OPTAB_LIB_WIDEN);
+
+ emit_move_insn (desc->var, count2);
+ }
+
init_code = get_insns ();
end_sequence ();
***************
*** 862,869 ****
j = n_peel - i - (extra_zero_check ? 0 : 1);
p = REG_BR_PROB_BASE / (i + 2);
! preheader = loop_split_edge_with (loop_preheader_edge (loop),
! NULL_RTX, loops);
label = block_label (preheader);
start_sequence ();
do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
--- 942,965 ----
j = n_peel - i - (extra_zero_check ? 0 : 1);
p = REG_BR_PROB_BASE / (i + 2);
! /* If modulo is zero do not jump to the header of the unrolled
loops.
! Jump instead to the last branch and count that precedes it. */
! if (is_bct_cond(desc->out_edge->src->end) && (j == 0))
! {
! basic_block lastbb = loop_preheader_edge(loop)->src;
! rtx split_after;
!
! /* Skip dummy basic blocks generated during the unrolling. */
! while (!is_bct_cond(lastbb->end))
! lastbb = lastbb->pred->src;
!
! split_after = PREV_INSN (lastbb->end);
!
! preheader = split_loop_bb (loops, lastbb , split_after)->dest;
! }
! else
! preheader = loop_split_edge_with (loop_preheader_edge (loop),
! NULL_RTX, loops);
label = block_label (preheader);
start_sequence ();
do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
***************
*** 933,938 ****
--- 1029,1039 ----
free (wont_exit);
+ /* Expand the branch and count. */
+ if (is_bct_cond(loop->desc.out_edge->src->end))
+ for (i = 0; i < n_remove_edges; i++)
+ expand_bct (remove_edges[i], discard_inc);
+
/* Remove the edges. */
for (i = 0; i < n_remove_edges; i++)
remove_path (loops, remove_edges[i]);
***************
*** 1170,1172 ****
--- 1271,1350 ----
fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
nunroll, num_loop_insns (loop));
}
+
+ /* Expand a bct instruction in a branch and an increment. */
+ void expand_bct(edge e, int flag_inc)
+ {
+ rtx bct_insn = e->src->end;
+ rtx cmp;
+ rtx inc;
+ rtx seq;
+ rtx tgt;
+ rtx condition;
+ rtx label;
+ rtx reg;
+ rtx jump;
+ rtx pattern = PATTERN(bct_insn);
+
+ if (!(is_bct_cond(bct_insn)))
+ return;
+
+ inc = get_var_set_from_bct (bct_insn);
+ cmp = XVECEXP (pattern, 0, 0);
+ reg = SET_DEST (inc);
+
+ start_sequence ();
+ if (!flag_inc)
+ {
+ tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0));
+ if (tgt != XEXP (inc, 0))
+ emit_move_insn (XEXP (inc, 0), tgt);
+ }
+
+ condition = XEXP (SET_SRC (cmp), 0);
+ label = XEXP (SET_SRC (cmp), 1);
+
+ do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1),
+ GET_CODE (condition), 0,
+ GET_MODE (reg), NULL_RTX, NULL_RTX,
+ label);
+ jump = get_last_insn ();
+ JUMP_LABEL (jump) = label;
+ seq = get_insns ();
+ end_sequence ();
+ emit_insn_after (seq, bct_insn);
+
+ delete_insn (bct_insn);
+
+ return;
+ }
+
+ /* Check that the count register has no other uses in the loop. */
+ bool
+ single_reg_use (struct loop *loop)
+ {
+ struct loop_desc *desc = &loop->desc;
+ basic_block *body;
+ int nbbs;
+ int i;
+ basic_block exit_bb;
+
+ body = get_loop_body (loop);
+ nbbs = loop->num_nodes;
+ exit_bb = desc->out_edge->src;
+
+ for(i=0;i<nbbs;i++)
+ {
+ if (reg_mentioned_p(desc->var, body[i]->head))
+ return false;
+
+ if (body[i] != exit_bb)
+ if (reg_mentioned_p(desc->var, body[i]->end))
+ return false;
+
+ if (reg_used_between_p (desc->var, body[i]->head, body[i]->end))
+ return false;
+ }
+ return true;
+ }
+
gdiff -c3t cfgloopanal.c.20031210 cfgloopanal.c
*** cfgloopanal.c.20031210 Fri Jan 23 14:30:29 2004
--- cfgloopanal.c Thu Jan 29 12:09:10 2004
***************
*** 45,50 ****
--- 45,51 ----
static rtx variable_initial_values (edge, rtx, enum machine_mode);
static bool simple_condition_p (struct loop *, rtx, regset,
struct loop_desc *);
+
static basic_block simple_increment (struct loops *, struct loop *, rtx
*,
struct loop_desc *);
static rtx count_strange_loop_iterations (rtx, rtx, enum rtx_code,
***************
*** 53,58 ****
--- 54,63 ----
static unsigned HOST_WIDEST_INT inverse (unsigned HOST_WIDEST_INT, int);
static bool fits_in_mode_p (enum machine_mode mode, rtx expr);
+ bool is_bct_cond (rtx);
+ rtx get_var_set_from_bct (rtx);
+ extern rtx doloop_condition_get (rtx);
+
/* Computes inverse to X modulo (1 << MOD). */
static unsigned HOST_WIDEST_INT
inverse (unsigned HOST_WIDEST_INT x, int mod)
***************
*** 162,167 ****
--- 167,176 ----
insn = NEXT_INSN (insn))
{
rtx set = single_set (insn);
+
+ if (!set && is_bct_cond (insn))
+ set = get_var_set_from_bct(insn);
+
if (!set)
continue;
if (!REG_P (SET_DEST (set)))
***************
*** 313,318 ****
--- 322,331 ----
/* mod_insn must be a simple increment/decrement. */
set = single_set (mod_insn);
+
+ if (!set && is_bct_cond (mod_insn))
+ set = get_var_set_from_bct(mod_insn);
+
if (!set)
abort ();
if (!rtx_equal_p (SET_DEST (set), desc->var))
***************
*** 835,843 ****
return NULL_RTX;
/* Normalize difference so the value is always first examined
! and later incremented. */
! if (!desc->postincr)
! exp = simplify_gen_binary (MINUS, mode, exp, stride);
/* Determine delta caused by exit condition. */
switch (cond)
--- 848,857 ----
return NULL_RTX;
/* Normalize difference so the value is always first examined
! and later incremented. Do not do this for a loop ending with a
branch
! and count register. */
! if ((!is_bct_cond(desc->out_edge->src->end)) && (!desc->postincr))
! exp = simplify_gen_binary (MINUS, mode, exp, stride);
/* Determine delta caused by exit condition. */
switch (cond)
***************
*** 1413,1415 ****
--- 1427,1485 ----
return (freq_latch + freq_in - 1) / freq_in;
}
}
+
+ /* This function checks if an instruction is a branch and count
instruction
+ no matter if the flag HAVE_doloop_end is enabled or not. An
alternative
+ would be the modification of doloop_condition_get function itself. */
+ bool
+ is_bct_cond (rtx insn)
+ {
+ if (GET_CODE (insn) != JUMP_INSN)
+ return false;
+
+ #ifdef HAVE_doloop_end
+ if (!doloop_condition_get (PATTERN(insn)))
+ return false;
+ #else
+ return false;
+ #endif
+
+ return true;
+ }
+
+ /* Extract the increment of the count register from the branch and count
+ instruction. */
+ rtx
+ get_var_set_from_bct (rtx insn)
+ {
+ rtx rhs, lhs, cond;
+ rtx pattern;
+ rtx set;
+ pattern = PATTERN (insn);
+
+ if (!is_bct_cond (insn))
+ abort ();
+
+ set = XVECEXP (pattern, 0, 1);
+
+ /* IA64 has the decrement conditional, i.e. done only when the loop
does not end. Separate the set. We match
+
+ (set (x (if_then_else (ne x 0) (plus x -1) x))) here. */
+
+ lhs = XEXP (set, 0);
+ rhs = XEXP (set, 1);
+ if (GET_CODE (set) != IF_THEN_ELSE)
+ return set;
+
+ cond = XEXP (rhs, 0);
+ if (GET_CODE (cond) != NE
+ || !rtx_equal_p (XEXP (cond, 0), lhs)
+ || !rtx_equal_p (XEXP (cond, 1), const0_rtx))
+ return set;
+
+ rhs = XEXP (rhs, 1);
+
+ return gen_rtx_SET (GET_MODE (lhs), lhs, rhs);
+ }
+
+
gdiff -c3t loop-init.c.20031210 loop-init.c
*** loop-init.c.20031210 Thu Jan 29 12:45:04 2004
--- loop-init.c Thu Jan 29 11:43:10 2004
***************
*** 28,33 ****
--- 28,36 ----
#include "cfgloop.h"
#include "cfglayout.h"
+ extern bool is_bct_cond (rtx);
+ static void fixup_loop_exit_succesor (basic_block, basic_block);
+
/* Initialize loop optimizer. */
struct loops *
***************
*** 88,98 ****
--- 91,142 ----
return loops;
}
+ /* The first basic block is moved after the second in the reorder chain.
*/
+ static void
+ fixup_loop_exit_succesor (basic_block exit_succ, basic_block latch)
+ {
+ basic_block bb = exit_succ;
+ basic_block bb1 = latch;
+
+ if (!(bb && bb->rbi->next))
+ return;
+
+ if (!bb1)
+ return;
+
+
+ if (bb && bb->rbi->next)
+ {
+ basic_block c = ENTRY_BLOCK_PTR->next_bb;
+
+ while (c->rbi->next != bb)
+ c = c->rbi->next;
+
+ c->rbi->next = bb->rbi->next;
+ }
+
+ if(bb1->rbi->next == NULL)
+ {
+ bb1->rbi->next=bb;
+ bb->rbi->next=NULL;
+ }
+ else
+
+ {
+ basic_block tmp;
+
+ tmp = bb1->rbi->next;
+ bb1->rbi->next = bb;
+ bb->rbi->next = tmp;
+ }
+ }
+
/* Finalize loop optimizer. */
void
loop_optimizer_finalize (struct loops *loops, FILE *dumpfile)
{
basic_block bb;
+ int i;
/* Finalize layout changes. */
/* Make chain. */
***************
*** 102,107 ****
--- 146,208 ----
/* Another dump. */
flow_loops_dump (loops, dumpfile, NULL, 1);
+
+ /* For loops ending with a branch and count instruction, move the basic
+ block found before the unrolling on the fallthru path of this
branch,
+ after the unrolled code. This will allow branch simplification. */
+ for (i = 1; i < loops->num; i++)
+ {
+ struct loop *loop = loops->parray[i];
+ struct loop_desc *desc;
+ basic_block loop_real_latch, bb, bb_exit;
+ edge e;
+
+ if (loop == NULL)
+ continue;
+ if (!loop->simple)
+ continue;
+ if (!loop->has_desc)
+ continue;
+
+ if (loop->lpt_decision.decision != LPT_UNROLL_RUNTIME)
+ continue;
+
+ desc = &loop->desc;
+ if (desc == NULL)
+ continue;
+ if (loop->latch->pred == NULL)
+ continue;
+
+ loop_real_latch = loop->latch->pred->src;
+
+
+ if (desc->postincr)
+ continue;
+ if (!is_bct_cond(loop_real_latch->end))
+ continue;
+
+ for (e = loop_real_latch->succ; e ; e = e->succ_next)
+ if (e->flags & EDGE_FALLTHRU)
+ break;
+
+ if (e == NULL)
+ continue;
+
+ bb_exit = e->dest;
+
+ bb = NULL;
+
+ /* Leave the case of the bb_exit falling through to exit to
+ fixed_fallthru_exit_predecessor */
+ for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next)
+ if (e->flags & EDGE_FALLTHRU)
+ bb = e->src;
+
+ if (bb_exit == bb)
+ continue;
+
+ fixup_loop_exit_succesor (bb_exit, loop->latch);
+ }
/* Clean up. */
flow_loops_free (loops);
gdiff -c3t doloop.c.20031210 doloop.c
*** doloop.c.20031210 Wed Jan 28 18:59:34 2004
--- doloop.c Wed Jan 28 19:01:27 2004
***************
*** 59,66 ****
#ifdef HAVE_doloop_end
-
- static rtx doloop_condition_get (rtx);
static unsigned HOST_WIDE_INT doloop_iterations_max (const struct
loop_info *,
enum machine_mode,
int);
static int doloop_valid_p (const struct loop *, rtx);
--- 59,64 ----
***************
*** 71,77 ****
/* Return the loop termination condition for PATTERN or zero
if it is not a decrement and branch jump insn. */
! static rtx
doloop_condition_get (rtx pattern)
{
rtx cmp;
--- 69,75 ----
/* Return the loop termination condition for PATTERN or zero
if it is not a decrement and branch jump insn. */
! rtx
doloop_condition_get (rtx pattern)
{
rtx cmp;
gdiff -c3t loop.h.20031210 loop.h
*** loop.h.20031210 Wed Jan 28 18:57:48 2004
--- loop.h Wed Jan 28 19:01:26 2004
***************
*** 428,431 ****
--- 428,432 ----
extern rtx loop_insn_hoist (const struct loop *, rtx);
/* Forward declarations for non-static functions declared in doloop.c.
*/
+ extern rtx doloop_condition_get (rtx);
extern int doloop_optimize (const struct loop *);