This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

More sensible unrolling for low iteration counts (work in progress)


--- cfgloop.h@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0	Mon Mar 22 12:42:52 2004
+++ cfgloop.h	Thu Mar 25 20:31:01 2004
@@ -27,6 +27,7 @@ enum lpt_dec
   LPT_PEEL_SIMPLE,
   LPT_UNROLL_CONSTANT,
   LPT_UNROLL_RUNTIME,
+  LPT_UNROLL_MOP_UP,
   LPT_UNROLL_STUPID
 };
 
@@ -307,6 +308,7 @@ extern bool simple_loop_p (struct loop *
 extern rtx count_loop_iterations (struct loop_desc *, rtx, rtx);
 extern bool just_once_each_iteration_p (struct loop *, basic_block);
 extern unsigned expected_loop_iterations (const struct loop *);
+extern unsigned expected_loop_iterations_1 (const struct loop *, int);
 
 /* Loop manipulation.  */
 extern bool can_duplicate_loop_p (struct loop *loop);
@@ -318,6 +320,7 @@ extern int duplicate_loop_to_header_edge
 					  unsigned, sbitmap, edge, edge *,
 					  unsigned *, int);
 extern struct loop *loopify (struct loops *, edge, edge, basic_block);
+extern struct loop *do_loopify (struct loops *, edge);
 extern void unloop (struct loops *, struct loop *);
 extern bool remove_path (struct loops *, edge);
 extern edge split_loop_bb (basic_block, rtx);
--- cfgloopanal.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0	Mon Mar 22 12:42:55 2004
+++ cfgloopanal.c	Mon Mar 22 12:42:55 2004
@@ -1381,6 +1381,13 @@ average_num_loop_insns (struct loop *loo
 unsigned
 expected_loop_iterations (const struct loop *loop)
 {
+  return expected_loop_iterations_1 (loop, 1);
+}
+
+/* Likewise, but return result multiplied by SCALE.  */
+unsigned
+expected_loop_iterations_1 (const struct loop *loop, int scale)
+{
   edge e;
 
   if (loop->header->count)
@@ -1399,14 +1406,16 @@ expected_loop_iterations (const struct l
       if (count_in == 0)
 	return 0;
 
-      expected = (count_latch + count_in - 1) / count_in;
+      expected = (count_latch + count_in) / count_in;
+      expected = (((count_latch + count_in - expected * count_in)
+		   * scale / count_in) + expected * scale);
 
       /* Avoid overflows.  */
       return (expected > REG_BR_PROB_BASE ? REG_BR_PROB_BASE : expected);
     }
   else
     {
-      int freq_in, freq_latch;
+      int freq_in, freq_latch, expected;
 
       freq_in = 0;
       freq_latch = 0;
@@ -1420,7 +1429,10 @@ expected_loop_iterations (const struct l
       if (freq_in == 0)
 	return 0;
 
-      return (freq_latch + freq_in - 1) / freq_in;
+      expected = (freq_latch + freq_in) / freq_in;
+      expected = ((freq_latch + freq_in - expected) * scale / freq_in
+		  + expected * scale);
+      return expected;
     }
 }
 
--- cfgloopmanip.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0	Wed Mar 24 17:24:08 2004
+++ cfgloopmanip.c	Fri Mar 26 15:38:37 2004
@@ -40,7 +40,7 @@ static void remove_bbs (basic_block *, i
 static bool rpe_enum_p (basic_block, void *);
 static int find_path (edge, basic_block **);
 static bool alp_enum_p (basic_block, void *);
-static void add_loop (struct loops *, struct loop *);
+static void add_loop (struct loops *, struct loop *, int);
 static void fix_loop_placements (struct loop *);
 static bool fix_bb_placement (struct loops *, basic_block);
 static void fix_bb_placements (struct loops *, basic_block);
@@ -432,7 +432,7 @@ alp_enum_p (basic_block bb, void *alp_he
 /* Given LOOP structure with filled header and latch, find the body of the
    corresponding loop and add it to LOOPS tree.  */
 static void
-add_loop (struct loops *loops, struct loop *loop)
+add_loop (struct loops *loops, struct loop *loop, int update_outer_loops_p)
 {
   basic_block *bbs;
   int i, n;
@@ -445,10 +445,18 @@ add_loop (struct loops *loops, struct lo
   bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
   n = dfs_enumerate_from (loop->latch, 1, alp_enum_p,
 			  bbs, n_basic_blocks, loop->header);
+  bbs[n++] = loop->header;
 
-  for (i = 0; i < n; i++)
-    add_bb_to_loop (bbs[i], loop);
-  add_bb_to_loop (loop->header, loop);
+  if (update_outer_loops_p)
+    for (i = 0; i < n; i++)
+      add_bb_to_loop (bbs[i], loop);
+  else
+    for (i = 0; i < n; i++)
+      {
+	bbs[i]->loop_father = loop;
+	bbs[i]->loop_depth = loop->depth;
+	loop->num_nodes++;
+      }
 
   free (bbs);
 }
@@ -525,7 +533,7 @@ loopify (struct loops *loops, edge latch
   set_immediate_dominator (CDI_DOMINATORS, succ_bb, switch_bb);
 
   /* Compute new loop.  */
-  add_loop (loops, loop);
+  add_loop (loops, loop, 1);
   flow_loop_tree_node_add (outer, loop);
 
   /* Add switch_bb to appropriate loop.  */
@@ -571,6 +579,24 @@ loopify (struct loops *loops, edge latch
 
   return loop;
 }
+
+struct loop *
+do_loopify (struct loops *loops, edge latch_edge)
+{
+  basic_block header = latch_edge->dest;
+  struct loop *loop = xcalloc (1, sizeof (struct loop));
+  struct loop *outer = header->loop_father;
+
+  loop->header = header;
+
+  loop->latch = loop_split_edge_with (latch_edge, NULL_RTX);
+
+  /* Compute new loop.  */
+  flow_loop_tree_node_add (outer, loop);
+  add_loop (loops, loop, 0);
+
+  return loop;
+}
 
 /* Remove the latch edge of a LOOP and update LOOPS tree to indicate that
    the LOOP was removed.  After this function, original loop latch will
--- loop-unroll.c@@/main/SH5GCC-CVS20040317.2300-int/renneckej-3.4-unroll/0	Mon Mar 22 12:43:00 2004
+++ loop-unroll.c	Fri Mar 26 16:52:05 2004
@@ -81,6 +81,7 @@ static void peel_loop_completely (struct
 static void unroll_loop_stupid (struct loops *, struct loop *);
 static void unroll_loop_constant_iterations (struct loops *, struct loop *);
 static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
+static void unroll_loop_runtime_iterations_mop_up (struct loops *, struct loop *);
 static void expand_bct (edge, int);
 static bool discard_increment (struct loop *);
 
@@ -130,6 +131,9 @@ unroll_and_peel_loops (struct loops *loo
 	case LPT_UNROLL_RUNTIME:
 	  unroll_loop_runtime_iterations (loops, loop);
 	  break;
+	case LPT_UNROLL_MOP_UP:
+	  unroll_loop_runtime_iterations_mop_up (loops, loop);
+	  break;
 	case LPT_UNROLL_STUPID:
 	  unroll_loop_stupid (loops, loop);
 	  break;
@@ -696,7 +700,7 @@ unroll_loop_constant_iterations (struct 
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, max_nunroll, i;
 
   if (!(flags & UAP_UNROLL))
     {
@@ -713,6 +717,7 @@ decide_unroll_runtime_iterations (struct
   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
   if (nunroll > nunroll_by_av)
     nunroll = nunroll_by_av;
+  max_nunroll = nunroll;
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 
@@ -749,8 +754,36 @@ decide_unroll_runtime_iterations (struct
   /* If we have profile feedback, check whether the loop rolls.  */
   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
     {
-      if (rtl_dump_file)
-	fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
+      int niter_scaled = (expected_loop_iterations_1 (loop, 16));
+      int niter = (niter_scaled) + 8 / 16;
+      HOST_WIDE_INT max_increment_factor;
+      if (niter < 3 || GET_CODE (loop->desc.stride) != CONST_INT)
+	{
+	  if (rtl_dump_file)
+	    fprintf (rtl_dump_file,
+		     ";; Not unrolling loop, doesn't roll enough\n");
+	  return;
+	}
+      max_increment_factor = INTVAL (loop->desc.stride);
+      max_increment_factor =
+	((HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)~0 >> 1)
+	 / (max_increment_factor > 0
+	    ? max_increment_factor : -max_increment_factor));
+      /* We are considering unrolling so that the loop rolls I times, using
+	 NUNROLL loop copies, followed a switch with NUNROLL-1 cases, each
+	 taking 1 .. NUNROLL-1 loop copies (all separate to allow maximum
+	 scheduling).  Thus, the switch needs NUNROLL * (NUNROLL - 1) / 2
+	 loop copies, and we have a total of NUNROLL * (NUNROLL) + 1 / 2
+	 loop copies.  */
+      for (i = 1;
+	   nunroll = niter / i,
+	   (nunroll * (nunroll + 1) > max_nunroll
+	    || nunroll - 1 > (unsigned) max_increment_factor);
+	   i++)
+	if (nunroll <= 1)
+	  return;
+      loop->lpt_decision.decision = LPT_UNROLL_MOP_UP;
+      loop->lpt_decision.times = nunroll;
       return;
     }
 
@@ -1030,6 +1063,228 @@ unroll_loop_runtime_iterations (struct l
 
   /* Remove the edges.  */
   for (i = 0; i < n_remove_edges; i++)
+    remove_path (loops, remove_edges[i]);
+  free (remove_edges);
+
+  if (rtl_dump_file)
+    fprintf (rtl_dump_file,
+	     ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
+	     max_unroll, num_loop_insns (loop));
+}
+
+/* Unroll LOOP for that we are able to count number of iterations in runtime
+   LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
+   extra care for case (n - 3) > n):
+
+   for (i = 0; i < n; i++)
+     body;
+
+   ==>
+
+   i = 0;
+
+   while (i < n - 3)
+     {
+       body; i++;
+       body; i++;
+       body; i++;
+       body; i++;
+     }
+   switch (n - i)
+     {
+       case 3:
+         body; i++; body; i++; body; i++; break;
+       case 2:
+         body; i++; body; i++; break;
+       case 1:
+         body; i++;
+       case 0: ;
+     }
+
+   */
+static void
+unroll_loop_runtime_iterations_mop_up (struct loops *loops, struct loop *loop)
+{
+  rtx init_code, label;
+  unsigned i, j;
+  basic_block preheader, *body, *dom_bbs, new_header, switch_header;
+  basic_block simple_preheader, bound_check_bb = NULL;
+  basic_block unrolled_end_bb, new_unrolled_end_bb, after_loop;
+  rtx unrolled_end, new_unrolled_end, vtop_code;
+  unsigned n_dom_bbs;
+  sbitmap wont_exit;
+  int may_exit_copy;
+  unsigned n_peel, n_remove_edges;
+  edge *remove_edges, unrolled_exit_edge, new_back_edge, e, unrolled_end_edge;
+  bool extra_zero_check, last_may_exit;
+  unsigned max_unroll = loop->lpt_decision.times;
+  struct loop_desc *desc = &loop->desc;
+  HOST_WIDE_INT adjust;
+  rtx new_lim, skip_unrolled;
+  int mop_up_unrolled = 0;
+
+  /* Remember blocks whose dominators will have to be updated.  */
+  dom_bbs = xcalloc (n_basic_blocks + 3, sizeof (basic_block));
+  n_dom_bbs = 0;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      unsigned nldom;
+      basic_block *ldom;
+
+      nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
+      for (j = 0; j < nldom; j++)
+	if (!flow_bb_inside_loop_p (loop, ldom[j]))
+	  dom_bbs[n_dom_bbs++] = ldom[j];
+
+      free (ldom);
+    }
+  free (body);
+
+  if (desc->postincr)
+    {
+      /* Leave exit in first copy (for explanation why see comment in
+	 unroll_loop_constant_iterations).  */
+      may_exit_copy = 0;
+      n_peel = max_unroll - 1;
+      extra_zero_check = true;
+      last_may_exit = false;
+    }
+  else
+    {
+      /* Leave exit in last copy (for explanation why see comment in
+	 unroll_loop_constant_iterations).  */
+      may_exit_copy = max_unroll;
+      n_peel = max_unroll;
+      extra_zero_check = false;
+      last_may_exit = true;
+    }
+
+/* FIXME */
+  /* Compute adjusted loop bound.  */
+  adjust = (max_unroll - 1) * INTVAL (desc->stride);
+  skip_unrolled = gen_label_rtx ();
+  if (GET_CODE (desc->lim) == CONST_INT)
+    {
+      /* This can't overflow, because then the unrolled loop would never
+	 be executed, but we have selected the unroll factor so that
+	 it will be at least sometimes executed.  */
+      new_lim = GEN_INT (INTVAL (desc->lim) - adjust);
+    }
+  else
+    {
+      new_lim = expand_simple_binop (GET_MODE (desc->var), MINUS,
+				     desc->lim, GEN_INT (adjust),
+				     NULL_RTX, 0, OPTAB_LIB_WIDEN);
+      /* Skip unrolled loop if loop bound adjustment overflows.  */
+      start_sequence ();
+      emit_cmp_and_jump_insns (new_lim, desc->lim,
+			       INTVAL (desc->stride) > 0 ? GT : LT,
+			       NULL_RTX, GET_MODE (desc->var),
+			       desc->cond != signed_condition (desc->cond),
+			       skip_unrolled);
+      init_code = get_insns ();
+      end_sequence ();
+      dom_bbs[n_dom_bbs++] = bound_check_bb = 
+        loop_split_edge_with (loop_preheader_edge (loop), init_code);
+    }
+  start_sequence ();
+  /* Skip unrolled loop if we don't have at least one unrolled iteration.
+     ??? We should find and delete any previously copied loop exit test.  */
+  emit_cmp_and_jump_insns (desc->var, new_lim,
+			   (desc->neg
+			    ? reverse_condition (desc->cond) : desc->cond),
+			   NULL_RTX, GET_MODE (desc->var),
+			   desc->cond != signed_condition (desc->cond),
+			   skip_unrolled);
+  init_code = get_insns ();
+  end_sequence ();
+  preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
+  /* The following call needs to be done before the jump is inserted into
+     PREHEADER, to avoid deleting the jump.  */
+  simple_preheader = loop_split_edge_with (loop_preheader_edge (loop),
+					   NULL_RTX);
+  emit_insn_after (init_code, BB_HEAD (preheader));
+  remove_edges = xcalloc (max_unroll, sizeof (edge));
+  n_remove_edges = 0;
+  wont_exit = sbitmap_alloc (max_unroll + 1);
+  sbitmap_ones (wont_exit);
+  if (! mop_up_unrolled)
+    RESET_BIT (wont_exit, 0);
+  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
+		loops, max_unroll,
+		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
+		0))
+    abort ();
+  new_header = simple_preheader->succ->dest;
+  unrolled_exit_edge = remove_edges[--n_remove_edges];
+  unrolled_end_bb = unrolled_exit_edge->src;
+  set_immediate_dominator (CDI_DOMINATORS, new_header, simple_preheader);
+  after_loop = unrolled_exit_edge->dest;
+  unrolled_end = BB_END (unrolled_end_bb);
+  if (!onlyjump_p (unrolled_end))
+    abort ();
+  start_sequence ();
+  label = block_label (new_header);
+  emit_cmp_and_jump_insns (desc->var, new_lim,
+			   (desc->neg
+			    ? reverse_condition (desc->cond) : desc->cond),
+			   NULL_RTX, GET_MODE (desc->var),
+			   desc->cond != signed_condition (desc->cond),
+			   label);
+  JUMP_LABEL (get_last_insn ()) = label;
+  LABEL_NUSES (label)++;
+  new_unrolled_end = get_last_insn ();
+  emit_label (skip_unrolled);
+  label = block_label (after_loop);
+  emit_cmp_and_jump_insns (desc->var, desc->lim,
+			   (desc->neg
+			    ? reverse_condition (desc->cond) : desc->cond),
+			   NULL_RTX, GET_MODE (desc->var),
+			   desc->cond != signed_condition (desc->cond),
+			   label);
+  JUMP_LABEL (get_last_insn ()) = label;
+  LABEL_NUSES (label)++;
+  vtop_code = get_insns ();
+  end_sequence ();
+  emit_insn_after (vtop_code, unrolled_end);
+  delete_insn (unrolled_end);
+  unrolled_end_edge = split_block (unrolled_end_bb, new_unrolled_end);
+  new_unrolled_end_bb = unrolled_end_edge->dest;
+  add_bb_to_loop (new_unrolled_end_bb, unrolled_end_bb->loop_father);
+  add_to_dominance_info (CDI_DOMINATORS, new_unrolled_end_bb);
+  set_immediate_dominator (CDI_DOMINATORS, new_unrolled_end_bb,
+			   unrolled_end_bb);
+  dom_bbs[n_dom_bbs++] = new_unrolled_end_bb->succ->dest;
+  dom_bbs[n_dom_bbs++] = new_unrolled_end_bb->succ->succ_next->dest;
+  if (new_unrolled_end_bb->succ->succ_next->succ_next)
+    abort ();
+  new_back_edge
+    = unchecked_make_edge (unrolled_end_bb, new_header,
+			   unrolled_end_edge->flags &  EDGE_IRREDUCIBLE_LOOP);
+  switch_header = new_back_edge->dest;
+  e = make_edge (preheader, switch_header,
+		 switch_header->succ->flags & EDGE_IRREDUCIBLE_LOOP);
+  set_immediate_dominator (CDI_DOMINATORS, switch_header, unrolled_end_bb);
+  if (bound_check_bb)
+    {
+      e = make_edge (bound_check_bb, switch_header,
+		     switch_header->succ->flags & EDGE_IRREDUCIBLE_LOOP);
+      set_immediate_dominator (CDI_DOMINATORS, switch_header, bound_check_bb);
+    }
+  /* FIXME: update probabilities.  */
+  /* FIXME: update frequencies.  */
+  
+  free (wont_exit);
+
+  do_loopify (loops, new_back_edge);
+
+  /* Recompute dominators for outer blocks.  */
+  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);
+
+  /* Remove the edges.  */
+  for (i = 0; i < n_remove_edges; i++)
     remove_path (loops, remove_edges[i]);
   free (remove_edges);
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]