This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)


This patch adds heuristics to limit unrolling in loops containing branches, where
unrolling may increase branch mispredictions. It affects loops that are not frequently
iterated and that are nested within a hot region of code that already contains many
branch instructions.

Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety
of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems.
This improves performance of an internal search indexing benchmark by close to 2% on
all the tested Intel platforms.  It also consistently improves 445.gobmk (with FDO feedback
where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are
neutral.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?

Thanks,
Teresa

2012-04-24   Teresa Johnson  <tejohnson@google.com>

	* loop-unroll.c (loop_has_call): New function.
	(loop_has_FP_comp): Ditto.
	(compute_weighted_branches): Ditto.
	(max_unroll_with_branches): Ditto.
	(decide_unroll_constant_iterations): Add heuristic to avoid
        increasing branch mispredicts when unrolling.
	(decide_unroll_runtime_iterations): Ditto.
	* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.

Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 186783)
+++ loop-unroll.c	(working copy)
@@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
 					     basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Determine whether LOOP contains call.  */
+static bool
+loop_has_call(struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (CALL_P (insn))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Determine whether LOOP contains floating-point computation.  */
+static bool
+loop_has_FP_comp(struct loop *loop)
+{
+  rtx set, dest;
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          set = single_set (insn);
+          if (!set)
+            continue;
+
+          dest = SET_DEST (set);
+          if (FLOAT_MODE_P (GET_MODE (dest)))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Compute the number of branches in LOOP, weighted by execution counts.  */
+static float
+compute_weighted_branches(struct loop *loop)
+{
+  int header_count = loop->header->count;
+  unsigned i;
+  float n;
+  basic_block * body;
+
+  /* If no profile feedback data exists, don't limit unrolling  */
+  if (header_count == 0)
+    return 0.0;
+
+  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+
+  body = get_loop_body (loop);
+  n = 0.0;
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (EDGE_COUNT (body[i]->succs) >= 2)
+        {
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on the ratio of times it is
+             executed compared to the header.  */
+          if (body[i]->count < header_count)
+            n += ((float)body[i]->count)/header_count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch at 1.0.  */
+          else
+            n += 1.0;
+        }
+    }
+  free (body);
+
+  return n;
+}
+
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct niter_desc *outer_desc;
+  int outer_niters = 1;
+  float weighted_outer_branches = 0.0;
+  float weighted_num_branches = compute_weighted_branches (loop);
+
+  /* If there was no profile feedback data, weighted_num_branches will be 0.0
+     and we won't limit unrolling. If the weighted_num_branches is at most 1.0,
+     also don't limit unrolling as the back-edge branch will not be duplicated.  */
+  if (weighted_num_branches <= 1.0)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer(loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+  /* Detect if this is the fake outermost loop (at which point we are done)
+     by checking its outer loop.  */
+  while (loop_outer(outer))
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      weighted_outer_branches = compute_weighted_branches (outer);
+
+      /* Should have been checked by caller.  */
+      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+        {
+          /* Assume that any call will cause the branch budget to be exceeded,
+             and that we can't unroll the current loop without increasing
+             mispredicts.  */
+          if (loop_has_call(outer))
+            return 0;
+
+          /* Otherwise, compute the maximum number of times current loop can be
+             unrolled without exceeding our branch budget. First we subtract
+             off the outer loop's weighted branch count from the budget. Note
+             that this includes the branches in the current loop. This yields
+             the number of branches left in the budget for the unrolled copies.
+             We divide this by the number of branches in the current loop that
+             must be duplicated when we unroll, which is the total weighted
+             number of branches minus the back-edge branch. This yields the
+             number of new loop body copies that can be created by unrolling
+             without exceeding the budget, to which we add 1 to get the unroll
+             factor.  */
+          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
+                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
+        }
+      outer = loop_outer(outer);
+    }
+
+  /* The current loop is not enclosed by a hot enough outer loop in this
+     procedure, since the hot outer loop is inter-procedural, assume that
+     it already contains a significant number of branches, so don't unroll.  */
+  return 0;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -522,6 +696,7 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
+  unsigned nunroll_branches;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+	    fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
Index: params.def
===================================================================
--- params.def	(revision 186783)
+++ params.def	(working copy)
@@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+	"min-iter-unroll-with-branches",
+	"Minimum iteration count to ignore branch effects when unrolling",
+	50, 0, 0)
+
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+	"unroll-outer-loop-branch-budget",
+	"Maximum number of branches allowed in hot outer loop region after unroll",
+	25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
 	"max-unswitch-insns",

--
This patch is available for review at http://codereview.appspot.com/6099055


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]