This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: New loop unroller broken?





We modified our patch following Zdenek's comments (thanks!).
This patch gains more than 4% improvement overall CFSPEC2000 on Power4,
with 3 benchmarks showing around 10% improvement (wupwise, swim and art).

OK to submit for GCC 3.4, after bootstraping and regtesting etc.?


gdiff -c3t loop-unroll.c.20031210 loop-unroll.c
*** loop-unroll.c.20031210    Fri Jan 23 14:30:56 2004
--- loop-unroll.c Thu Jan 29 12:21:51 2004
***************
*** 31,36 ****
--- 31,39 ----
  #include "output.h"
  #include "expr.h"

+ /* We need to use the macro exact_log2. */
+ #include "toplev.h"
+
  /* This pass performs loop unrolling and peeling.  We only perform these
     optimizations on innermost loops (with single exception) because
     the impact on performance is greatest here, and we want to avoid
***************
*** 82,87 ****
--- 85,96 ----
  static void unroll_loop_constant_iterations (struct loops *, struct loop
*);
  static void unroll_loop_runtime_iterations (struct loops *, struct loop
*);

+ bool is_bct_cond (rtx);
+ rtx get_var_set_from_bct (rtx);
+ void expand_bct (edge, int);
+ bool single_reg_use (struct loop *);
+
+
  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
  void
  unroll_and_peel_loops (struct loops *loops, int flags)
***************
*** 403,409 ****
    unsigned n_remove_edges, i;
    edge *remove_edges;
    struct loop_desc *desc = &loop->desc;
!
    npeel = desc->niter;

    if (npeel)
--- 412,418 ----
    unsigned n_remove_edges, i;
    edge *remove_edges;
    struct loop_desc *desc = &loop->desc;
!   bool discard_inc = single_reg_use (loop);
    npeel = desc->niter;

    if (npeel)
***************
*** 425,436 ****
--- 434,454 ----

        free (wont_exit);

+       /* Expand the branch and count.  */
+       if (is_bct_cond(loop->desc.out_edge->src->end))
+         for (i = 0; i < n_remove_edges; i++)
+           expand_bct (remove_edges[i], discard_inc);
+
        /* Remove the exit edges.  */
        for (i = 0; i < n_remove_edges; i++)
          remove_path (loops, remove_edges[i]);
        free (remove_edges);
      }

+   /* Expand the branch and count.  */
+   if (is_bct_cond(loop->desc.out_edge->src->end))
+     expand_bct (desc->in_edge, discard_inc);
+
    /* Now remove the unreachable part of the last iteration and cancel
       the loop.  */
    remove_path (loops, desc->in_edge);
***************
*** 560,565 ****
--- 578,584 ----
    edge *remove_edges;
    unsigned max_unroll = loop->lpt_decision.times;
    struct loop_desc *desc = &loop->desc;
+   bool discard_inc = single_reg_use (loop);

    niter = desc->niter;

***************
*** 574,579 ****
--- 593,623 ----
    remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
    n_remove_edges = 0;

+   /* For a loop ending with a branch and count for which the increment
+      of the count register will be discarded, adjust the initialization
of
+      the count register.  */
+   if (is_bct_cond(desc->out_edge->src->end)
+       && discard_inc)
+     {
+       rtx ini_var;
+
+       rtx init_code;
+       int n_peel, new_bct_value;
+
+       /* Get expression for number of iterations.  */
+       start_sequence ();
+
+       n_peel = (niter+1) % (max_unroll+1);
+       new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ;
+       ini_var = GEN_INT (new_bct_value);
+
+       emit_move_insn (desc->var, ini_var);
+       init_code = get_insns ();
+       end_sequence ();
+
+       loop_split_edge_with (loop_preheader_edge (loop), init_code,
loops);
+     }
+
    if (desc->postincr)
      {
        /* Counter is incremented after the exit test; leave exit test
***************
*** 637,642 ****
--- 681,691 ----

    free (wont_exit);

+   /* Expand the branch and count.  */
+   if (is_bct_cond(loop->desc.out_edge->src->end))
+     for (i = 0; i < n_remove_edges; i++)
+       expand_bct (remove_edges[i], discard_inc);
+
    /* Remove the edges.  */
    for (i = 0; i < n_remove_edges; i++)
      remove_path (loops, remove_edges[i]);
***************
*** 710,715 ****
--- 759,765 ----
        return;
      }

+
    /* Success; now force nunroll to be power of 2, as we are unable to
       cope with overflows in computation of number of iterations.  */
    for (i = 1; 2 * i <= nunroll; i *= 2);
***************
*** 763,768 ****
--- 813,819 ----
    bool extra_zero_check, last_may_exit;
    unsigned max_unroll = loop->lpt_decision.times;
    struct loop_desc *desc = &loop->desc;
+   bool discard_inc = single_reg_use (loop);

    /* Remember blocks whose dominators will have to be updated.  */
    dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
***************
*** 817,822 ****
--- 868,902 ----
                                 GEN_INT (max_unroll),
                                 NULL_RTX, 0, OPTAB_LIB_WIDEN);

+   /* For a loop ending with a branch and count for which the increment
+      of the count register will be discarded, adjust the initialization
of
+      the count register.  */
+   if (is_bct_cond(desc->out_edge->src->end)
+       && discard_inc)
+     {
+       rtx count, count2,count_unroll_mod;
+       int count_unroll;
+
+       /* start_sequence (); */
+
+       count = count_loop_iterations (desc, NULL, NULL);
+
+       count_unroll = loop->lpt_decision.times+1;
+       /*  count = expand_simple_binop (GET_MODE (desc->var), DIV,
+                                      count, count_unroll,
+                                      0, 0, OPTAB_LIB_WIDEN); */
+       count_unroll_mod =  GEN_INT(exact_log2(count_unroll));
+       count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT,
+                                   count, count_unroll_mod,
+                                   0, 0, OPTAB_LIB_WIDEN);
+
+       count2 = expand_simple_binop (GET_MODE (desc->var), PLUS,
+                                      count, GEN_INT (2),
+                                      0, 0, OPTAB_LIB_WIDEN);
+
+       emit_move_insn (desc->var, count2);
+     }
+
    init_code = get_insns ();
    end_sequence ();

***************
*** 862,869 ****
        j = n_peel - i - (extra_zero_check ? 0 : 1);
        p = REG_BR_PROB_BASE / (i + 2);

!       preheader = loop_split_edge_with (loop_preheader_edge (loop),
!                                         NULL_RTX, loops);
        label = block_label (preheader);
        start_sequence ();
        do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
--- 942,965 ----
        j = n_peel - i - (extra_zero_check ? 0 : 1);
        p = REG_BR_PROB_BASE / (i + 2);

!       /* If modulo is zero do not jump to the header of the unrolled
loops.
!          Jump instead to the last branch and count that precedes it.  */
!       if (is_bct_cond(desc->out_edge->src->end) && (j == 0))
!         {
!           basic_block lastbb = loop_preheader_edge(loop)->src;
!           rtx split_after;
!
!           /* Skip dummy basic blocks generated during the unrolling.  */
!           while (!is_bct_cond(lastbb->end))
!             lastbb = lastbb->pred->src;
!
!           split_after = PREV_INSN (lastbb->end);
!
!           preheader = split_loop_bb (loops, lastbb , split_after)->dest;
!         }
!       else
!         preheader = loop_split_edge_with (loop_preheader_edge (loop),
!                                           NULL_RTX, loops);
        label = block_label (preheader);
        start_sequence ();
        do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
***************
*** 933,938 ****
--- 1029,1039 ----

    free (wont_exit);

+   /* Expand the branch and count.  */
+   if (is_bct_cond(loop->desc.out_edge->src->end))
+     for (i = 0; i < n_remove_edges; i++)
+       expand_bct (remove_edges[i], discard_inc);
+
    /* Remove the edges.  */
    for (i = 0; i < n_remove_edges; i++)
      remove_path (loops, remove_edges[i]);
***************
*** 1170,1172 ****
--- 1271,1350 ----
      fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
               nunroll, num_loop_insns (loop));
  }
+
+ /* Expand a bct instruction in a branch and an increment. */
+ void expand_bct(edge e, int flag_inc)
+ {
+   rtx bct_insn = e->src->end;
+   rtx cmp;
+   rtx inc;
+   rtx seq;
+   rtx tgt;
+   rtx condition;
+   rtx label;
+   rtx reg;
+   rtx jump;
+   rtx pattern = PATTERN(bct_insn);
+
+   if (!(is_bct_cond(bct_insn)))
+     return;
+
+   inc = get_var_set_from_bct (bct_insn);
+   cmp = XVECEXP (pattern, 0, 0);
+   reg = SET_DEST (inc);
+
+   start_sequence ();
+   if (!flag_inc)
+     {
+       tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0));
+       if (tgt != XEXP (inc, 0))
+         emit_move_insn (XEXP (inc, 0), tgt);
+     }
+
+   condition = XEXP (SET_SRC (cmp), 0);
+   label = XEXP (SET_SRC (cmp), 1);
+
+   do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1),
+                            GET_CODE (condition), 0,
+                            GET_MODE (reg), NULL_RTX, NULL_RTX,
+                            label);
+   jump = get_last_insn ();
+   JUMP_LABEL (jump) = label;
+   seq = get_insns ();
+   end_sequence ();
+   emit_insn_after (seq, bct_insn);
+
+   delete_insn (bct_insn);
+
+   return;
+ }
+
+ /* Check that the count register has no other uses in the loop.  */
+ bool
+ single_reg_use (struct loop *loop)
+ {
+   struct loop_desc *desc = &loop->desc;
+   basic_block *body;
+   int nbbs;
+   int i;
+   basic_block exit_bb;
+
+   body = get_loop_body (loop);
+   nbbs = loop->num_nodes;
+   exit_bb = desc->out_edge->src;
+
+   for(i=0;i<nbbs;i++)
+   {
+     if (reg_mentioned_p(desc->var, body[i]->head))
+       return false;
+
+     if (body[i] != exit_bb)
+       if (reg_mentioned_p(desc->var, body[i]->end))
+         return false;
+
+     if (reg_used_between_p (desc->var, body[i]->head, body[i]->end))
+       return false;
+   }
+   return true;
+ }
+

 gdiff -c3t cfgloopanal.c.20031210 cfgloopanal.c
*** cfgloopanal.c.20031210    Fri Jan 23 14:30:29 2004
--- cfgloopanal.c Thu Jan 29 12:09:10 2004
***************
*** 45,50 ****
--- 45,51 ----
  static rtx variable_initial_values (edge, rtx, enum machine_mode);
  static bool simple_condition_p (struct loop *, rtx, regset,
                                  struct loop_desc *);
+
  static basic_block simple_increment (struct loops *, struct loop *, rtx
*,
                                       struct loop_desc *);
  static rtx count_strange_loop_iterations (rtx, rtx, enum rtx_code,
***************
*** 53,58 ****
--- 54,63 ----
  static unsigned HOST_WIDEST_INT inverse (unsigned HOST_WIDEST_INT, int);
  static bool fits_in_mode_p (enum machine_mode mode, rtx expr);

+ bool is_bct_cond (rtx);
+ rtx get_var_set_from_bct (rtx);
+ extern rtx doloop_condition_get (rtx);
+
  /* Computes inverse to X modulo (1 << MOD).  */
  static unsigned HOST_WIDEST_INT
  inverse (unsigned HOST_WIDEST_INT x, int mod)
***************
*** 162,167 ****
--- 167,176 ----
           insn = NEXT_INSN (insn))
        {
          rtx set = single_set (insn);
+
+         if (!set && is_bct_cond (insn))
+           set = get_var_set_from_bct(insn);
+
          if (!set)
            continue;
          if (!REG_P (SET_DEST (set)))
***************
*** 313,318 ****
--- 322,331 ----

    /* mod_insn must be a simple increment/decrement.  */
    set = single_set (mod_insn);
+
+   if (!set && is_bct_cond (mod_insn))
+     set = get_var_set_from_bct(mod_insn);
+
    if (!set)
      abort ();
    if (!rtx_equal_p (SET_DEST (set), desc->var))
***************
*** 835,843 ****
      return NULL_RTX;

    /* Normalize difference so the value is always first examined
!      and later incremented.  */
!   if (!desc->postincr)
!     exp = simplify_gen_binary (MINUS, mode, exp, stride);

    /* Determine delta caused by exit condition.  */
    switch (cond)
--- 848,857 ----
      return NULL_RTX;

    /* Normalize difference so the value is always first examined
!      and later incremented.  Do not do this for a loop ending with a
branch
!      and count register.  */
!    if ((!is_bct_cond(desc->out_edge->src->end)) && (!desc->postincr))
!      exp = simplify_gen_binary (MINUS, mode, exp, stride);

    /* Determine delta caused by exit condition.  */
    switch (cond)
***************
*** 1413,1415 ****
--- 1427,1485 ----
        return (freq_latch + freq_in - 1) / freq_in;
      }
  }
+
+ /* This function checks if an instruction is a branch and count
instruction
+    no matter if the flag HAVE_doloop_end is enabled or not. An
alternative
+    would be the modification of doloop_condition_get function itself.  */
+ bool
+ is_bct_cond (rtx insn)
+ {
+   if (GET_CODE (insn) != JUMP_INSN)
+     return false;
+
+ #ifdef HAVE_doloop_end
+   if (!doloop_condition_get (PATTERN(insn)))
+     return false;
+ #else
+   return false;
+ #endif
+
+   return true;
+ }
+
+ /* Extract the increment of the count register from the branch and count
+    instruction.  */
+ rtx
+ get_var_set_from_bct (rtx insn)
+ {
+   rtx rhs, lhs, cond;
+   rtx pattern;
+   rtx set;
+   pattern = PATTERN (insn);
+
+   if (!is_bct_cond (insn))
+     abort ();
+
+   set = XVECEXP (pattern, 0, 1);
+
+   /* IA64 has the decrement conditional, i.e. done only when the loop
does not      end.  Separate the set.  We match
+
+      (set (x (if_then_else (ne x 0) (plus x -1) x))) here.  */
+
+   lhs = XEXP (set, 0);
+   rhs = XEXP (set, 1);
+   if (GET_CODE (set) != IF_THEN_ELSE)
+     return set;
+
+   cond = XEXP (rhs, 0);
+   if (GET_CODE (cond) != NE
+       || !rtx_equal_p (XEXP (cond, 0), lhs)
+       || !rtx_equal_p (XEXP (cond, 1), const0_rtx))
+          return set;
+
+   rhs = XEXP (rhs, 1);
+
+   return gen_rtx_SET (GET_MODE (lhs), lhs, rhs);
+ }
+
+

gdiff -c3t loop-init.c.2031210 loop-init.c
*** loop-init.c.20031210      Thu Jan 29 12:45:04 2004
--- loop-init.c   Thu Jan 29 11:43:10 2004
***************
*** 28,33 ****
--- 28,36 ----
  #include "cfgloop.h"
  #include "cfglayout.h"

+ extern bool is_bct_cond (rtx);
+ static void fixup_loop_exit_succesor (basic_block, basic_block);
+
  /* Initialize loop optimizer.  */

  struct loops *
***************
*** 88,98 ****
--- 91,142 ----
    return loops;
  }

+ /* The first basic block is moved after the second in the reorder chain.
*/
+ static void
+ fixup_loop_exit_succesor (basic_block exit_succ, basic_block latch)
+ {
+   basic_block bb = exit_succ;
+   basic_block bb1 = latch;
+
+   if (!(bb && bb->rbi->next))
+     return;
+
+   if (!bb1)
+     return;
+
+
+   if (bb && bb->rbi->next)
+     {
+       basic_block c = ENTRY_BLOCK_PTR->next_bb;
+
+       while (c->rbi->next != bb)
+         c = c->rbi->next;
+
+       c->rbi->next = bb->rbi->next;
+     }
+
+   if(bb1->rbi->next == NULL)
+     {
+       bb1->rbi->next=bb;
+       bb->rbi->next=NULL;
+     }
+   else
+
+     {
+       basic_block tmp;
+
+       tmp = bb1->rbi->next;
+       bb1->rbi->next = bb;
+       bb->rbi->next = tmp;
+     }
+ }
+
  /* Finalize loop optimizer.  */
  void
  loop_optimizer_finalize (struct loops *loops, FILE *dumpfile)
  {
    basic_block bb;
+   int i;

    /* Finalize layout changes.  */
    /* Make chain.  */
***************
*** 102,107 ****
--- 146,208 ----

    /* Another dump.  */
    flow_loops_dump (loops, dumpfile, NULL, 1);
+
+   /* For loops ending with a branch and count instruction, move the basic
+      block found before the unrolling on the fallthru path of this
branch,
+      after the unrolled code.  This will allow branch simplification.  */
+   for (i = 1; i < loops->num; i++)
+     {
+       struct loop *loop = loops->parray[i];
+       struct loop_desc *desc;
+       basic_block loop_real_latch, bb, bb_exit;
+       edge e;
+
+       if (loop == NULL)
+         continue;
+       if (!loop->simple)
+         continue;
+       if (!loop->has_desc)
+         continue;
+
+       if (loop->lpt_decision.decision != LPT_UNROLL_RUNTIME)
+         continue;
+
+       desc = &loop->desc;
+       if (desc == NULL)
+         continue;
+       if (loop->latch->pred == NULL)
+         continue;
+
+       loop_real_latch = loop->latch->pred->src;
+
+
+       if (desc->postincr)
+         continue;
+       if (!is_bct_cond(loop_real_latch->end))
+          continue;
+
+       for (e = loop_real_latch->succ; e ; e = e->succ_next)
+         if (e->flags & EDGE_FALLTHRU)
+           break;
+
+       if (e == NULL)
+         continue;
+
+       bb_exit = e->dest;
+
+       bb = NULL;
+
+       /* Leave the case of the bb_exit falling through to exit to
+          fixed_fallthru_exit_predecessor */
+       for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next)
+          if (e->flags & EDGE_FALLTHRU)
+         bb = e->src;
+
+       if (bb_exit == bb)
+         continue;
+
+       fixup_loop_exit_succesor (bb_exit, loop->latch);
+     }

    /* Clean up.  */
    flow_loops_free (loops);

gdiff -c3t doloop.c.20031210 doloop.c
*** doloop.c.20031210   Wed Jan 28 18:59:34 2004
--- doloop.c      Wed Jan 28 19:01:27 2004
***************
*** 59,66 ****


  #ifdef HAVE_doloop_end
-
- static rtx doloop_condition_get (rtx);
  static unsigned HOST_WIDE_INT doloop_iterations_max (const struct
loop_info *,
                                                       enum machine_mode,
int);
  static int doloop_valid_p (const struct loop *, rtx);
--- 59,64 ----
***************
*** 71,77 ****

  /* Return the loop termination condition for PATTERN or zero
     if it is not a decrement and branch jump insn.  */
! static rtx
  doloop_condition_get (rtx pattern)
  {
    rtx cmp;
--- 69,75 ----

  /* Return the loop termination condition for PATTERN or zero
     if it is not a decrement and branch jump insn.  */
! rtx
  doloop_condition_get (rtx pattern)
  {
    rtx cmp;

gdiff -c3t loop.h.20031210 loop.h
*** loop.h.20031210     Wed Jan 28 18:57:48 2004
--- loop.h  Wed Jan 28 19:01:26 2004
***************
*** 428,431 ****
--- 428,432 ----
  extern rtx loop_insn_hoist (const struct loop *, rtx);

  /* Forward declarations for non-static functions declared in doloop.c.
*/
+ extern rtx doloop_condition_get (rtx);
  extern int doloop_optimize (const struct loop *);


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]