This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Fix profile updating after complette unrolling


Hi,
this patch fixes profile updating after complette unroll.  Present code expects
that unrolling is done only in the case there is exit which counts iteration
and the number of iterations is n_unroll.  This is no longer true because we
can derive upper bounds on number of iterations from array sizes and unroll
based on that.  In that case we will still adjust probabilities of one of the
exit edges from the loop in random ways.

It would pehraps make more sense to communicate the per-exit iteration count
analysis with the main loop duplication machinery, but I would like to do
things incrementally. There are many profile updating issues in loop code and
it seem to make sense to first fix low hanging fruit and add testcases ensuring
consistency.

Bootstrapped/regtested x86_64-linux, plan to commit after the benchmark machines
picks up my previous fix to predict.c.

Similar bugs exists in the peeling path, too, will fix that separately.

There are three main paths through the updating:
  - when we know number of iterations precisely
  - when we only have upper bound
  - when we have upper bound that is lower than number of iterations determined
    for specific exit.

I added testcases only for later two tests.  Testcase for first triggers
updating errors in loop header duplication. I will look into that, too.

Honza

	* predict.h (force_edge_cold): Declare.
	* predict.c (force_edge_cold): New function.
	* tree-ssa-loop-ivcanon.c (try_unroll_loop_completely): Fix profile
	updating.
	(canonicalize_loop_induction_variables): Fix formating.

	* gcc.dg/tree-ssa/cunroll-12.c: New testcase.
	* gcc.dg/tree-ssa/cunroll-13.c: New testcase.
Index: predict.h
===================================================================
--- predict.h	(revision 236850)
+++ predict.h	(working copy)
@@ -91,5 +91,6 @@ extern tree build_predict_expr (enum br_
 extern const char *predictor_name (enum br_predictor);
 extern void rebuild_frequencies (void);
 extern void report_predictor_hitrates (void);
+extern void force_edge_cold (edge, bool);
 
 #endif  /* GCC_PREDICT_H */
Index: predict.c
===================================================================
--- predict.c	(revision 236862)
+++ predict.c	(working copy)
@@ -3249,3 +3249,99 @@ report_predictor_hitrates (void)
   loop_optimizer_finalize ();
 }
 
+/* Force edge E to be cold.
+   If IMPOSSIBLE is true, for edge to have count and probability 0 otherwise
+   keep low probability to represent possible error in a guess.  This is used
+   i.e. in case we predict loop to likely iterate given number of times but
+   we are not 100% sure.
+
+   This function locally updates profile without attempt to keep global
+   consistency which can not be reached in full generality without full profile
+   rebuild from probabilities alone.  Doing so is not necessarily a good idea
+   because frequencies and counts may be more realistic then probabilities.
+
+   In some cases (such as for elimination of early exits during full loop
+   unrolling) the caller can ensure that profile will get consistent
+   afterwards.  */
+
+void
+force_edge_cold (edge e, bool impossible)
+{
+  gcov_type count_sum = 0;
+  int prob_sum = 0;
+  edge_iterator ei;
+  edge e2;
+  gcov_type old_count = e->count;
+  int old_probability = e->probability;
+  gcov_type gcov_scale = REG_BR_PROB_BASE;
+  int prob_scale = REG_BR_PROB_BASE;
+
+  /* If edge is already improbably or cold, just return.  */
+  if (e->probability <= impossible ? PROB_VERY_UNLIKELY : 0
+      && (!impossible || !e->count))
+    return;
+  FOR_EACH_EDGE (e2, ei, e->src->succs)
+    if (e2 != e)
+      {
+	count_sum += e2->count;
+	prob_sum += e2->probability;
+      }
+
+  /* If there are other edges out of e->src, redistribute probabilitity
+     there.  */
+  if (prob_sum)
+    {
+      e->probability
+	 = MIN (e->probability, impossible ? 0 : PROB_VERY_UNLIKELY);
+      if (old_probability)
+	e->count = RDIV (e->count * e->probability, old_probability);
+      else
+        e->count = MIN (e->count, impossible ? 0 : 1);
+
+      if (count_sum)
+	gcov_scale = RDIV ((count_sum + old_count - e->count) * REG_BR_PROB_BASE,
+			   count_sum);
+      prob_scale = RDIV ((REG_BR_PROB_BASE - e->probability) * REG_BR_PROB_BASE,
+			 prob_sum);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Making edge %i->%i %s by redistributing "
+		 "probability to other edges.\n",
+		 e->src->index, e->dest->index,
+		 impossible ? "imposisble" : "cold");
+      FOR_EACH_EDGE (e2, ei, e->src->succs)
+	if (e2 != e)
+	  {
+	    e2->count = RDIV (e2->count * gcov_scale, REG_BR_PROB_BASE);
+	    e2->probability = RDIV (e2->probability * prob_scale,
+				    REG_BR_PROB_BASE);
+	  }
+    }
+  /* If all edges out of e->src are unlikely, the basic block itself
+     is unlikely.  */
+  else
+    {
+      e->probability = REG_BR_PROB_BASE;
+
+      /* If we did not adjusting, the source basic block has no likely edeges
+ 	 leaving other direction. In that case force that bb cold, too.
+	 This in general is difficult task to do, but handle special case when
+	 BB has only one predecestor.  This is common case when we are updating
+	 after loop transforms.  */
+      if (!prob_sum && !count_sum && single_pred_p (e->src)
+	  && e->src->frequency > (impossible ? 0 : 1))
+	{
+	  int old_frequency = e->src->frequency;
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "Making bb %i %s.\n", e->src->index,
+		     impossible ? "imposisble" : "cold");
+	  e->src->frequency = MIN (e->src->frequency, impossible ? 0 : 1);
+	  e->src->count = e->count = RDIV (e->src->count * e->src->frequency,
+					   old_frequency);
+	  force_edge_cold (single_pred_edge (e->src), impossible);
+	}
+      else if (dump_file && (dump_flags & TDF_DETAILS)
+	       && maybe_hot_bb_p (cfun, e->src))
+	fprintf (dump_file, "Giving up on making bb %i %s.\n", e->src->index,
+		 impossible ? "imposisble" : "cold");
+    }
+}
Index: testsuite/gcc.dg/tree-ssa/cunroll-12.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/cunroll-12.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/cunroll-12.c	(working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Warray-bounds -fdump-tree-cunroll-blocks-details" } */
+struct a {int a[8];int b;};
+void
+t(struct a *a)
+{
+  for (int i=0;a->a[i];i++)
+    a->a[i]++;
+}
+/* { dg-final { scan-tree-dump-times "loop with 7 iterations completely unrolled" 1 "cunroll" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "cunroll" } } */
Index: testsuite/gcc.dg/tree-ssa/cunroll-13.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/cunroll-13.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/cunroll-13.c	(working copy)
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdisable-tree-cunrolli -fdisable-tree-vrp1 -fdump-tree-cunroll-blocks-details" } */
+struct a {int a[8];int b;};
+void
+t(struct a *a)
+{
+  for (int i=0;i<123456 && a->a[i];i++)
+    a->a[i]++;
+}
+/* This pass relies on the fact that we do not eliminate the redundant test for i early.
+   It is necessary to disable all passes that do so.  At the moment it is vrp1 and cunrolli.  */
+/* { dg-final { scan-tree-dump-times "Loop 1 iterates 123454 times" 1 "cunroll" } } */
+/* { dg-final { scan-tree-dump-times "Last iteration exit edge was proved true" 1 "cunroll" } } */
+/* { dg-final { scan-tree-dump-times "Exit condition of peeled iterations was eliminated" 1 "cunroll" } } */
+/* { dg-final { scan-tree-dump-times "loop with 7 iterations completely unrolled" 1 "cunroll" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "cunroll" } } */
Index: tree-ssa-loop-ivcanon.c
===================================================================
--- tree-ssa-loop-ivcanon.c	(revision 236850)
+++ tree-ssa-loop-ivcanon.c	(working copy)
@@ -730,8 +730,14 @@ try_unroll_loop_completely (struct loop
       if (ul == UL_SINGLE_ITER)
 	return false;
 
+      /* EXIT can be removed only if we are sure it passes first N_UNROLL
+	 iterations.  */
+      bool remove_exit = (exit && niter
+			  && TREE_CODE (niter) == INTEGER_CST
+			  && wi::leu_p (n_unroll, wi::to_widest (niter)));
+
       large = tree_estimate_loop_size
-		 (loop, exit, edge_to_cancel, &size,
+		 (loop, remove_exit ? exit : NULL, edge_to_cancel, &size,
 		  PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS));
       ninsns = size.overall;
       if (large)
@@ -837,8 +843,20 @@ try_unroll_loop_completely (struct loop
 
       initialize_original_copy_tables ();
       wont_exit = sbitmap_alloc (n_unroll + 1);
-      bitmap_ones (wont_exit);
-      bitmap_clear_bit (wont_exit, 0);
+      if (exit && niter
+	  && TREE_CODE (niter) == INTEGER_CST
+	  && wi::leu_p (n_unroll, wi::to_widest (niter)))
+	{
+	  bitmap_ones (wont_exit);
+	  if (wi::eq_p (wi::to_widest (niter), n_unroll)
+	      || edge_to_cancel)
+	    bitmap_clear_bit (wont_exit, 0);
+	}
+      else
+	{
+	  exit = NULL;
+	  bitmap_clear (wont_exit);
+	}
 
       if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 						 n_unroll, wont_exit,
@@ -869,6 +887,7 @@ try_unroll_loop_completely (struct loop
   if (edge_to_cancel)
     {
       gcond *cond = as_a <gcond *> (last_stmt (edge_to_cancel->src));
+      force_edge_cold (edge_to_cancel, true);
       if (edge_to_cancel->flags & EDGE_TRUE_VALUE)
 	gimple_cond_make_false (cond);
       else
@@ -951,7 +970,9 @@ try_peel_loop (struct loop *loop,
   if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
     return false;
 
-  /* Peel only innermost loops.  */
+  /* Peel only innermost loops.
+     While the code is perfectly capable of peeling non-innermost loops,
+     the heuristics would probably need some improvements. */
   if (loop->inner)
     {
       if (dump_file)
   /* Remove exits that are known to be never taken based on loop bound.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]