[PATCH] Outer vs. inner loop ifcvt (PR tree-optimization/78899)

Jakub Jelinek jakub@redhat.com
Mon Jan 9 11:30:00 GMT 2017


On Mon, Jan 09, 2017 at 11:08:08AM +0100, Richard Biener wrote:
> > > > There is one thing my patch doesn't do but should for efficiency, if loop1
> > > > (outer loop) is not successfully outer-loop vectorized, then we should mark
> > > > loop2 (its inner loop) as dont_vectorize if the outer loop has been
> > > > LOOP_VECTORIZED guarded.  Then the gcc.dg/gomp/pr68128-1.c change
> > > > wouldn't be actually needed.
> 
> (*)
> Ok, I don't have too many spare cycles either so can you fix (*)?  Then
> we can go with the extra versionings for GCC 7 for the moment and if any
> of us has enough time to revisit this soon we can.

Here is an untested (except for the affected testcases) patch to do that,
ok if it passes bootstrap/regtest?  I'll file [8 Regression] with details
what we want to undo and do for GCC 8.

2017-01-09  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/78899
	* tree-if-conv.c (version_loop_for_if_conversion): Instead of
	returning bool return struct loop *, NULL for failure and the new
	loop on success.
	(versionable_outer_loop_p): Don't version outer loop if it has
	dont_vectorize bit set.
	(tree_if_conversion): When versioning outer loop, ensure
	tree_if_conversion is performed also on the inner loop of the
	non-vectorizable outer loop copy.
	* tree-vectorizer.c (set_uid_loop_bbs): Formatting fix.  Fold
	LOOP_VECTORIZED in inner loop of the scalar outer loop and
	prevent vectorization of it.
	(vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure
	the outer loop vectorization of the non-scalar version is attempted
	before vectorization of the inner loop in scalar version.  If
	outer LOOP_VECTORIZED guarded loop is not vectorized, prevent
	vectorization of its inner loop.
	* tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop
	has 2 inner loops, rename also on edges from bb whose single pred
	is outer_loop->header.  Fix typo in function comment.

	* gcc.target/i386/pr78899.c: New test.
	* gcc.dg/pr71077.c: New test.

--- gcc/tree-if-conv.c.jj	2017-01-06 19:34:04.052560851 +0100
+++ gcc/tree-if-conv.c	2017-01-09 11:52:59.154806369 +0100
@@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop)
    loop to execute.  The vectorizer pass will fold this
    internal call into either true or false.  */
 
-static bool
+static struct loop *
 version_loop_for_if_conversion (struct loop *loop)
 {
   basic_block cond_bb;
@@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l
     ifc_bbs[i]->aux = saved_preds[i];
 
   if (new_loop == NULL)
-    return false;
+    return NULL;
 
   new_loop->dont_vectorize = true;
   new_loop->force_vectorize = false;
@@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l
   gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
   update_ssa (TODO_update_ssa);
-  return true;
+  return new_loop;
 }
 
 /* Return true when LOOP satisfies the follow conditions that will
@@ -2594,6 +2594,7 @@ static bool
 versionable_outer_loop_p (struct loop *loop)
 {
   if (!loop_outer (loop)
+      || loop->dont_vectorize
       || !loop->inner
       || loop->inner->next
       || !single_exit (loop)
@@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l
       || !single_pred_p (loop->latch)
       || !single_pred_p (loop->inner->latch))
     return false;
-  
+
   basic_block outer_exit = single_pred (loop->latch);
   basic_block inner_exit = single_pred (loop->inner->latch);
 
@@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop)
 {
   unsigned int todo = 0;
   bool aggressive_if_conv;
+  struct loop *rloop;
 
+ again:
+  rloop = NULL;
   ifc_bbs = NULL;
   any_pred_load_store = false;
   any_complicated_phi = false;
@@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop)
       struct loop *vloop
 	= (versionable_outer_loop_p (loop_outer (loop))
 	   ? loop_outer (loop) : loop);
-      if (!version_loop_for_if_conversion (vloop))
+      struct loop *nloop = version_loop_for_if_conversion (vloop);
+      if (nloop == NULL)
 	goto cleanup;
+      if (vloop != loop)
+	{
+	  /* If versionable_outer_loop_p decided to version the
+	     outer loop, version also the inner loop of the non-vectorized
+	     loop copy.  So we transform:
+	      loop1
+		loop2
+	     into:
+	      if (LOOP_VECTORIZED (1, 3))
+		{
+		  loop1
+		    loop2
+		}
+	      else
+		loop3 (copy of loop1)
+		  if (LOOP_VECTORIZED (4, 5))
+		    loop4 (copy of loop2)
+		  else
+		    loop5 (copy of loop4)  */
+	  gcc_assert (nloop->inner && nloop->inner->next == NULL);
+	  rloop = nloop->inner;
+	}
     }
 
   /* Now all statements are if-convertible.  Combine all the basic
@@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop)
       free (ifc_bbs);
       ifc_bbs = NULL;
     }
+  if (rloop != NULL)
+    {
+      loop = rloop;
+      goto again;
+    }
 
   return todo;
 }
--- gcc/tree-vectorizer.c.jj	2017-01-06 19:34:04.064560697 +0100
+++ gcc/tree-vectorizer.c	2017-01-09 12:04:08.721053174 +0100
@@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr
       update_stmt (use_stmt);
     }
 }
+
 /* Set the uids of all the statements in basic blocks inside loop
    represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal
    call guarding the loop which has been if converted.  */
@@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin
   struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
 
   LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop;
-  gcc_checking_assert (vect_loop_vectorized_call
-		       (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+  gcc_checking_assert (vect_loop_vectorized_call (scalar_loop)
 		       == loop_vectorized_call);
+  /* If we are going to vectorize outer loop, prevent vectorization
+     of the inner loop in the scalar loop - either the scalar loop is
+     thrown away, so it is a wasted work, or is used only for
+     a few iterations.  */
+  if (scalar_loop->inner)
+    {
+      gimple *g = vect_loop_vectorized_call (scalar_loop->inner);
+      if (g)
+	{
+	  arg = gimple_call_arg (g, 0);
+	  get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true;
+	  fold_loop_vectorized_call (g, boolean_false_node);
+	}
+    }
   bbs = get_loop_body (scalar_loop);
   for (i = 0; i < scalar_loop->num_nodes; i++)
     {
@@ -534,14 +548,59 @@ vectorize_loops (void)
      only over initial loops skipping newly generated ones.  */
   FOR_EACH_LOOP (loop, 0)
     if (loop->dont_vectorize)
-      any_ifcvt_loops = true;
-    else if ((flag_tree_loop_vectorize
-	      && optimize_loop_nest_for_speed_p (loop))
-	     || loop->force_vectorize)
       {
-	loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
-	gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
-vectorize_epilogue:
+	any_ifcvt_loops = true;
+	/* If-conversion sometimes versions both the outer loop
+	   (for the case when outer loop vectorization might be
+	   desirable) as well as the inner loop in the scalar version
+	   of the loop.  So we have:
+	    if (LOOP_VECTORIZED (1, 3))
+	      {
+		loop1
+		  loop2
+	      }
+	    else
+	      loop3 (copy of loop1)
+		if (LOOP_VECTORIZED (4, 5))
+		  loop4 (copy of loop2)
+		else
+		  loop5 (copy of loop4)
+	   If FOR_EACH_LOOP gives us loop3 first (which has
+	   dont_vectorize set), make sure to process loop1 before loop4;
+	   so that we can prevent vectorization of loop4 if loop1
+	   is successfully vectorized.  */
+	if (loop->inner)
+	  {
+	    gimple *loop_vectorized_call
+	      = vect_loop_vectorized_call (loop);
+	    if (loop_vectorized_call
+		&& vect_loop_vectorized_call (loop->inner))
+	      {
+		tree arg = gimple_call_arg (loop_vectorized_call, 0);
+		struct loop *vector_loop
+		  = get_loop (cfun, tree_to_shwi (arg));
+		if (vector_loop && vector_loop != loop)
+		  {
+		    loop = vector_loop;
+		    /* Make sure we don't vectorize it twice.  */
+		    loop->dont_vectorize = true;
+		    goto try_vectorize;
+		  }
+	      }
+	  }
+      }
+    else
+      {
+	loop_vec_info loop_vinfo, orig_loop_vinfo;
+	gimple *loop_vectorized_call;
+       try_vectorize:
+	if (!((flag_tree_loop_vectorize
+	       && optimize_loop_nest_for_speed_p (loop))
+	      || loop->force_vectorize))
+	  continue;
+	orig_loop_vinfo = NULL;
+	loop_vectorized_call = vect_loop_vectorized_call (loop);
+       vectorize_epilogue:
 	vect_location = find_loop_location (loop);
         if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
 	    && dump_enabled_p ())
@@ -595,6 +654,12 @@ vectorize_epilogue:
 		    ret |= TODO_cleanup_cfg;
 		  }
 	      }
+	    /* If outer loop vectorization fails for LOOP_VECTORIZED guarded
+	       loop, don't vectorize its inner loop; we'll attempt to
+	       vectorize LOOP_VECTORIZED guarded inner loop of the scalar
+	       loop version.  */
+	    if (loop_vectorized_call && loop->inner)
+	      loop->inner->dont_vectorize = true;
 	    continue;
 	  }
 
--- gcc/tree-vect-loop-manip.c.jj	2017-01-06 19:34:04.040561006 +0100
+++ gcc/tree-vect-loop-manip.c	2017-01-09 11:52:59.155806356 +0100
@@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p)
 }
 
 
-/* Renames the variables in basic block BB.  Allow renaming  of PHI argumnets
+/* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
    true.  */
 
@@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb,
 
   FOR_EACH_EDGE (e, ei, bb->preds)
     {
-      if (!flow_bb_inside_loop_p (loop, e->src)
-	  && (!rename_from_outer_loop || e->src != outer_loop->header))
-	continue;
+      if (!flow_bb_inside_loop_p (loop, e->src))
+	{
+	  if (!rename_from_outer_loop)
+	    continue;
+	  if (e->src != outer_loop->header)
+	    {
+	      if (outer_loop->inner->next)
+		{
+		  /* If outer_loop has 2 inner loops, allow there to
+		     be an extra basic block which decides which of the
+		     two loops to use using LOOP_VECTORIZED.  */
+		  if (!single_pred_p (e->src)
+		      || single_pred (e->src) != outer_loop->header)
+		    continue;
+		}
+	      else
+		continue;
+	    }
+	}
       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 	   gsi_next (&gsi))
         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
--- gcc/testsuite/gcc.target/i386/pr78899.c.jj	2017-01-09 11:52:59.156806343 +0100
+++ gcc/testsuite/gcc.target/i386/pr78899.c	2017-01-09 11:52:59.156806343 +0100
@@ -0,0 +1,27 @@
+/* PR tree-optimization/78899 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */
+
+#define N 1024
+#define M 4
+int p1[N], p2[N], p3[N], c[N];
+
+void
+foo (int n)
+{
+  int i, k;
+  for (k = 0; k < n / M; k++)
+    {
+    #pragma omp simd
+      for (i = 0; i < M; i++)
+	if (c[k * M + i])
+	  {
+	    p1[k * M + i] += 1;
+	    p2[k * M + i] = p3[k * M + i] + 2;
+	  }
+    }
+}
+
+/* Ensure the loop is vectorized.  */
+/* { dg-final { scan-assembler "vpmaskmov" } } */
+/* { dg-final { scan-assembler "vpadd" } } */
--- gcc/testsuite/gcc.dg/pr71077.c.jj	2017-01-09 11:52:59.179806042 +0100
+++ gcc/testsuite/gcc.dg/pr71077.c	2017-01-09 11:52:59.179806042 +0100
@@ -0,0 +1,14 @@
+/* PR c++/71077  */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
+
+void
+foo (int *a, int n)
+{
+  int b, c;
+  for (b = 0; b < n; b++)
+    for (c = 0; c < 32; c++)
+      if ((b & 1U) << c)
+	a[b + c] = 0;
+}


	Jakub



More information about the Gcc-patches mailing list