[PATCH] Avoid peeling in cunrolli

Wed Nov 29 14:34:00 GMT 2017

It turns out that we don't vectorize the 2nd testcase in PR83202
(or rather, we do so in weird ways during BB vectorization) because
cunrolli decides to peel the inner loop completely based on
the size of the accessed arrays.  That unfortunately leaves exit
tests in the outer loop body, which in turn prevents us from
vectorizing the loop.

We have a late unrolling pass for this kind of unrolling, so this
patch simply avoids doing it during cunrolli.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2017-11-29  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/83202
	* tree-ssa-loop-ivcanon.c (try_unroll_loop_completely): Add
	allow_peel argument and guard peeling.
	(canonicalize_loop_induction_variables): Likewise.
	(canonicalize_induction_variables): Pass false.
	(tree_unroll_loops_completely_1): Pass unroll_outer to disallow
	peeling from cunrolli.

	* gcc.dg/vect/pr83202-1.c: New testcase.

Index: gcc/tree-ssa-loop-ivcanon.c
===================================================================

--- gcc/tree-ssa-loop-ivcanon.c	(revision 255201)
+++ gcc/tree-ssa-loop-ivcanon.c	(working copy)
@@ -679,7 +679,7 @@ try_unroll_loop_completely (struct loop
 			    edge exit, tree niter,
 			    enum unroll_level ul,
 			    HOST_WIDE_INT maxiter,
-			    location_t locus)
+			    location_t locus, bool allow_peel)
 {
   unsigned HOST_WIDE_INT n_unroll = 0;
   bool n_unroll_found = false;
@@ -711,7 +711,8 @@ try_unroll_loop_completely (struct loop
     exit = NULL;
 
   /* See if we can improve our estimate by using recorded loop bounds.  */
-  if (maxiter >= 0
+  if (allow_peel
+      && maxiter >= 0
       && (!n_unroll_found || (unsigned HOST_WIDE_INT)maxiter < n_unroll))
     {
       n_unroll = maxiter;
@@ -1139,7 +1140,7 @@ try_peel_loop (struct loop *loop,
 static bool
 canonicalize_loop_induction_variables (struct loop *loop,
 				       bool create_iv, enum unroll_level ul,
-				       bool try_eval)
+				       bool try_eval, bool allow_peel)
 {
   edge exit = NULL;
   tree niter;
@@ -1207,7 +1208,8 @@ canonicalize_loop_induction_variables (s
      populates the loop bounds.  */
   modified |= remove_redundant_iv_tests (loop);
 
-  if (try_unroll_loop_completely (loop, exit, niter, ul, maxiter, locus))
+  if (try_unroll_loop_completely (loop, exit, niter, ul, maxiter, locus,
+				  allow_peel))
     return true;
 
   if (create_iv
@@ -1238,7 +1240,7 @@ canonicalize_induction_variables (void)
     {
       changed |= canonicalize_loop_induction_variables (loop,
 							true, UL_SINGLE_ITER,
-							true);
+							true, false);
     }
   gcc_assert (!need_ssa_update_p (cfun));
 
@@ -1353,7 +1355,7 @@ tree_unroll_loops_completely_1 (bool may
     ul = UL_NO_GROWTH;
 
   if (canonicalize_loop_induction_variables
-        (loop, false, ul, !flag_tree_loop_ivcanon))
+        (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer))
     {
       /* If we'll continue unrolling, we need to propagate constants
 	 within the new basic blocks to fold away induction variable
Index: gcc/testsuite/gcc.dg/vect/pr83202-1.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr83202-1.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/vect/pr83202-1.c	(working copy)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+void test(double data[8][8])
+{
+  for (int i = 0; i < 8; i++)
+    {
+      for (int j = 0; j < i; j+=4)
+	{
+	  data[i][j] *= data[i][j];
+	  data[i][j+1] *= data[i][j+1];
+	  data[i][j+2] *= data[i][j+2];
+	  data[i][j+3] *= data[i][j+3];
+	}
+    }
+}
+
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
+/* { dg-final { scan-tree-dump "ectorized 1 loops" "vect" } } */