Enable loop peeling at -O3

Sat May 28 23:56:00 GMT 2016

Hello,
thanks for the feedback.  I updated the patch and also noticed that
-fpeel-all-loops gives up when the upper bound is known but large, and when
max-peeled-insns is too small to permit peeling max-peel-times copies.  This
patch also updates pr61743-2.c, whose loops are now peeled before
we manage to propagate the proper loop bound.

Bootstrapped/regtested x86_64-linux. OK?

Honza

	* common.opt (flag_peel_all_loops): New option.
	* doc/invoke.texi (-fpeel-loops): Update documentation.
	(-fpeel-all-loops): Document.
	* opts.c (default_options): Add OPT_fpeel_loops to -O3+.
	* toplev.c (process_options): flag_peel_all_loops implies
	flag_peel_loops.
	* tree-ssa-loop-ivcanon.c (try_peel_loop): Update comment; handle
	-fpeel-all-loops, use likely estimates.
	(canonicalize_loop_induction_variables): Fix wording of dump message.

	* gcc.dg/tree-ssa/peel1.c: New testcase.
	* gcc.dg/tree-ssa/peel2.c: New testcase.
	* gcc.dg/tree-ssa/pr61743-1.c: Pass -fno-peel-loops.
	* gcc.dg/tree-ssa/pr61743-2.c: Pass -fno-peel-loops.
Index: common.opt
===================================================================

--- common.opt	(revision 236815)
+++ common.opt	(working copy)
@@ -1840,6 +1840,10 @@ fpeel-loops
 Common Report Var(flag_peel_loops) Optimization
 Perform loop peeling.
 
+fpeel-all-loops
+Common Report Var(flag_peel_all_loops) Optimization
+Perform loop peeling of all loops.
+
 fpeephole
 Common Report Var(flag_no_peephole,0) Optimization
 Enable machine specific peephole optimizations.
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 236815)
+++ doc/invoke.texi	(working copy)
@@ -375,7 +375,7 @@ Objective-C and Objective-C++ Dialects}.
 -fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
 -fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
 -fomit-frame-pointer -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-loops -fpeel-all-loops -fpredictive-commoning @gol
 -fprefetch-loop-arrays @gol
 -fprofile-correction @gol
 -fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
@@ -6338,7 +6338,8 @@ by @option{-O2} and also turns on the @o
 @option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
 @option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
 @option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
-@option{-ftree-partial-pre} and @option{-fipa-cp-clone} options.
+@option{-ftree-partial-pre}, @option{-fpeel-loops}
+and @option{-fipa-cp-clone} options.
 
 @item -O0
 @opindex O0
@@ -8593,7 +8594,7 @@ data about values of expressions in the
 With @option{-fbranch-probabilities}, it reads back the data gathered
 from profiling values of expressions for usage in optimizations.
 
-Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
+Enabled with @option{-fprofile-generate} and/or @option{-fprofile-use}.
 
 @item -fprofile-reorder-functions
 @opindex fprofile-reorder-functions
@@ -8661,10 +8662,17 @@ the loop is entered.  This usually makes
 @item -fpeel-loops
 @opindex fpeel-loops
 Peels loops for which there is enough information that they do not
-roll much (from profile feedback).  It also turns on complete loop peeling
-(i.e.@: complete removal of loops with small constant number of iterations).
-
-Enabled with @option{-fprofile-use}.
+roll much (from profile feedback or static analysis).  It also turns on
+complete loop peeling (i.e.@: complete removal of loops with small constant
+number of iterations).
+
+Enabled with @option{-O3} and @option{-fprofile-use}.
+
+@item -fpeel-all-loops
+@opindex fpeel-all-loops
+Peel all loops, even if their number of iterations is uncertain when
+the loop is entered.  For loops with a large number of iterations this
+leads to wasted code size.
 
 @item -fmove-loop-invariants
 @opindex fmove-loop-invariants
Index: opts.c
===================================================================
--- opts.c	(revision 236815)
+++ opts.c	(working copy)
@@ -535,6 +535,7 @@ static const struct default_options defa
     { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC },
     { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
     { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
+    { OPT_LEVELS_3_PLUS, OPT_fpeel_loops, NULL, 1 },
 
     /* -Ofast adds optimizations to -O3.  */
     { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
Index: testsuite/gcc.dg/tree-ssa/peel1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/peel1.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/peel1.c	(working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
+struct foo {int b; int a[3];} foo;
+void add(struct foo *a,int l)
+{
+  int i;
+  for (i=0;i<l;i++)
+    a->a[i]++;
+}
+/* { dg-final { scan-tree-dump "Loop 1 likely iterates at most 3 times." "cunroll"} } */
+/* { dg-final { scan-tree-dump "Peeled loop 1, 4 times." "cunroll"} } */
Index: testsuite/gcc.dg/tree-ssa/peel2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/peel2.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/peel2.c	(working copy)
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fpeel-all-loops -fdump-tree-cunroll-details --param max-peel-times=16 --param max-peeled-insns=100" } */
+void add(int *a,int l)
+{
+  int i;
+  for (i=0;i<l;i++)
+    a[i]++;
+}
+/* { dg-final { scan-tree-dump "Peeled loop 1, 16 times." "cunroll"} } */
Index: testsuite/gcc.dg/tree-ssa/pr61743-1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr61743-1.c	(revision 236815)
+++ testsuite/gcc.dg/tree-ssa/pr61743-1.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
 
 #define N 8
 #define M 14
Index: testsuite/gcc.dg/tree-ssa/pr61743-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr61743-2.c	(revision 236815)
+++ testsuite/gcc.dg/tree-ssa/pr61743-2.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
 
 #define N 8
 #define M 14
Index: toplev.c
===================================================================
--- toplev.c	(revision 236815)
+++ toplev.c	(working copy)
@@ -1294,6 +1294,9 @@ process_options (void)
   if (flag_unroll_all_loops)
     flag_unroll_loops = 1;
 
+  if (flag_peel_all_loops)
+    flag_peel_loops = 1;
+
   /* web and rename-registers help when run after loop unrolling.  */
   if (flag_web == AUTODETECT_VALUE)
     flag_web = flag_unroll_loops || flag_peel_loops;
Index: tree-ssa-loop-ivcanon.c
===================================================================
--- tree-ssa-loop-ivcanon.c	(revision 236816)
+++ tree-ssa-loop-ivcanon.c	(working copy)
@@ -951,7 +951,9 @@ try_peel_loop (struct loop *loop,
   if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
     return false;
 
-  /* Peel only innermost loops.  */
+  /* Peel only innermost loops.
+     While the code is perfectly capable of peeling non-innermost loops,
+     the heuristics would probably need some improvements.  */
   if (loop->inner)
     {
       if (dump_file)
@@ -970,11 +972,22 @@ try_peel_loop (struct loop *loop,
   npeel = estimated_loop_iterations_int (loop);
   if (npeel < 0)
     {
+      npeel = likely_max_loop_iterations_int (loop);
+      if (flag_peel_all_loops
+	  && npeel >= PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
+	npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
+    }
+  if (npeel < 0 && flag_peel_all_loops)
+    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
+  if (npeel < 0)
+    {
       if (dump_file)
         fprintf (dump_file, "Not peeling: number of iterations is not "
 	         "estimated\n");
       return false;
     }
+  gcc_assert (maxiter < 0 || maxiter >= npeel
+	      || npeel <= max_loop_iterations_int (loop));
   if (maxiter >= 0 && maxiter <= npeel)
     {
       if (dump_file)
@@ -998,8 +1011,25 @@ try_peel_loop (struct loop *loop,
   /* Check peeled loops size.  */
   tree_estimate_loop_size (loop, exit, NULL, &size,
 			   PARAM_VALUE (PARAM_MAX_PEELED_INSNS));
-  if ((peeled_size = estimated_peeled_sequence_size (&size, (int) npeel))
-      > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
+  peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
+
+  /* When asked to peel all loops, try to reduce number of peeled copies to
+     fit in the size bound.  */
+  while (flag_peel_all_loops
+	 && peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS)
+	 && npeel > 1)
+    {
+      /* Number of peeled copies is capped by PARAM_MAX_PEEL_TIMES and thus this
+	 loop will converge quickly.
+	 Just be sure we won't get a compile time hog when user asks for
+	 insanely many copies by --param parameter.  */
+      if (npeel > 256)
+	npeel /= 2;
+      else
+	npeel--;
+      peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
+    }
+  if (peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
     {
       if (dump_file)
         fprintf (dump_file, "Not peeling: peeled sequence size is too large "
@@ -1112,8 +1142,8 @@ canonicalize_loop_induction_variables (s
   if (dump_file && (dump_flags & TDF_DETAILS)
       && likely_max_loop_iterations_int (loop) >= 0)
     {
-      fprintf (dump_file, "Loop likely %d iterates at most %i times.\n", loop->num,
-	       (int)likely_max_loop_iterations_int (loop));
+      fprintf (dump_file, "Loop %d likely iterates at most %i times.\n",
+	       loop->num, (int)likely_max_loop_iterations_int (loop));
     }
 
   /* Remove exits that are known to be never taken based on loop bound.