[PATCH] Outer vs. inner loop ifcvt (PR tree-optimization/78899)

Richard Biener <rguenther@suse.de>
Sat Jan 7 11:46:00 GMT 2017


On January 6, 2017 7:56:10 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>If-conversion can't easily predict whether outer loop vectorization
>will be successful or not.  In GCC 6, we'd version and guard with
>LOOP_VECTORIZED only the inner loop, which is beneficial if outer loop
>vectorization isn't successful but inner loop vectorization is.
>This changed last year, and now we sometimes version the outer loop
>instead.  The problem with that is that, according to the PR, it
>regresses some important benchmarks where the outer loop can't really
>be vectorized (though if-conversion doesn't know that), and, because
>the inner loop needs masked loads/stores, the inner loop isn't
>vectorized either.
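
For concreteness, a minimal sketch of the kind of loop nest at issue,
loosely modeled on the gcc.target/i386/pr78899.c test further down
(array names and bounds are illustrative, not taken from the PR): the
inner loop stores under a condition, so vectorizing it needs masked
loads/stores, while the outer loop may not be profitably vectorizable.

    #define N 1024
    #define M 4
    int a[N], b[N], c[N];

    void
    foo (int n)
    {
      for (int k = 0; k < n / M; k++)	/* outer loop */
	for (int i = 0; i < M; i++)	/* inner loop */
	  if (c[k * M + i])		/* conditional store: needs masked
					   load/store to vectorize */
	    a[k * M + i] = b[k * M + i] + 1;
    }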
>
>The following patch tweaks tree-if-conv.c so that when it wants to
>version
>an outer loop, it actually transforms:
>	      loop1
>		loop2
>into:
>	      if (LOOP_VECTORIZED (1, 3))
>		{
>		  loop1
>		    loop2
>		}
>	      else
>		loop3 (copy of loop1)
>		  if (LOOP_VECTORIZED (4, 5))
>		    loop4 (copy of loop2)
>		  else
>		    loop5 (copy of loop4)
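
For illustration, the same structure written out as plain C (a sketch
only: loop_vectorized () is a hypothetical stand-in for the
LOOP_VECTORIZED internal function, which the vectorizer later folds to
a constant, and the loop bodies are made up):

    extern int loop_vectorized (int, int);

    void
    bar (int *a, const int *b, const int *c, int n, int m)
    {
      if (loop_vectorized (1, 3))
	{
	  for (int k = 0; k < n; k++)		/* loop1 */
	    for (int i = 0; i < m; i++)		/* loop2 */
	      if (c[k * m + i])
		a[k * m + i] = b[k * m + i];
	}
      else
	for (int k = 0; k < n; k++)		/* loop3 (copy of loop1) */
	  {
	    if (loop_vectorized (4, 5))
	      {
		for (int i = 0; i < m; i++)	/* loop4 (copy of loop2) */
		  if (c[k * m + i])
		    a[k * m + i] = b[k * m + i];
	      }
	    else
	      for (int i = 0; i < m; i++)	/* loop5 (copy of loop4) */
		if (c[k * m + i])
		  a[k * m + i] = b[k * m + i];
	  }
    }

Versioning the inner loop inside the scalar copy as well is what keeps
the GCC 6 behaviour available: if outer loop vectorization fails, inner
loop vectorization can still be attempted on loop4.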

Huh.  Why isn't the else case equal to the if case for the vectorizer?  That is, we have the inner loop if-converted, and thus for the if case either outer or inner loop vectorization should be possible.

So - why doesn't it work that way?

Richard.

>and tweaks the vectorizer so that in this case it always attempts to
>vectorize the outer loop (i.e. loop1) first.  If that is successful,
>it marks loop4 as non-vectorizable and folds the latter
>LOOP_VECTORIZED, so that the inner loop isn't vectorized in the scalar
>copy of the loop.  If outer loop vectorization fails, loop1/loop2 are
>thrown away and we get loop3 non-vectorized, with either loop4
>vectorized inside it or, if even that fails, with loop5 inside it.
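
Spelled out, the intended decision flow is roughly the following
(pseudocode sketch, not GCC source; vectorize_outer_loop () is a
hypothetical stand-in for the vectorizer's attempt on loop1):

    extern int vectorize_outer_loop (void);

    void
    choose (void)
    {
      if (vectorize_outer_loop ())
	{
	  /* Success: keep vectorized loop1/loop2, mark loop4 as
	     dont_vectorize and fold LOOP_VECTORIZED (4, 5) to false,
	     so the scalar copy loop3 keeps the plain scalar loop5.  */
	}
      else
	{
	  /* Failure: loop1/loop2 are thrown away (LOOP_VECTORIZED (1, 3)
	     folds to false); we keep loop3 and try to vectorize loop4 on
	     its own, falling back to loop5 if that fails too.  */
	}
    }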
>
>I think vectorizing the inner loop inside the scalar copy of the outer
>loop is a waste of code size; that copy will only run for a very small
>number of iterations of the outer loop.  If you think we should
>vectorize even there, we'd need other changes - handle the
>LOOP_VECTORIZED ifn during copying of loops in tree-vect-loop-manip.c
>and ensure we run the vectorizer even on the copied loops - right now
>the vectorizer runs only on the loops created before vectorization
>starts (except for epilogue vectorization).
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
>2017-01-06  Jakub Jelinek  <jakub@redhat.com>
>
>	PR tree-optimization/78899
>	* tree-if-conv.c (version_loop_for_if_conversion): Instead of
>	returning bool return struct loop *, NULL for failure and the new
>	loop on success.
>	(versionable_outer_loop_p): Don't version outer loop if it has
>	dont_vectorize bit set.
>	(tree_if_conversion): When versioning outer loop, ensure
>	tree_if_conversion is performed also on the inner loop of the
>	non-vectorizable outer loop copy.
>	* tree-vectorizer.c (set_uid_loop_bbs): Formatting fix.  Fold
>	LOOP_VECTORIZED in inner loop of the scalar outer loop and
>	prevent vectorization of it.
>	(vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure
>	the outer loop vectorization of the non-scalar version is attempted
>	before vectorization of the inner loop in scalar version.
>	* tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop
>	has 2 inner loops, rename also on edges from bb whose single pred
>	is outer_loop->header.  Fix typo in function comment.
>
>	* gcc.target/i386/pr78899.c: New test.
>	* gcc.dg/pr71077.c: New test.
>	* gcc.dg/gomp/pr68128-1.c: Adjust allowed number of vectorized
>	loops.
>
>--- gcc/tree-if-conv.c.jj	2017-01-04 18:12:50.180878924 +0100
>+++ gcc/tree-if-conv.c	2017-01-06 09:24:08.410557481 +0100
>@@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop)
>    loop to execute.  The vectorizer pass will fold this
>    internal call into either true or false.  */
> 
>-static bool
>+static struct loop *
> version_loop_for_if_conversion (struct loop *loop)
> {
>   basic_block cond_bb;
>@@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l
>     ifc_bbs[i]->aux = saved_preds[i];
> 
>   if (new_loop == NULL)
>-    return false;
>+    return NULL;
> 
>   new_loop->dont_vectorize = true;
>   new_loop->force_vectorize = false;
>@@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l
>   gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num));
>   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
>   update_ssa (TODO_update_ssa);
>-  return true;
>+  return new_loop;
> }
> 
> /* Return true when LOOP satisfies the follow conditions that will
>@@ -2594,6 +2594,7 @@ static bool
> versionable_outer_loop_p (struct loop *loop)
> {
>   if (!loop_outer (loop)
>+      || loop->dont_vectorize
>       || !loop->inner
>       || loop->inner->next
>       || !single_exit (loop)
>@@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l
>       || !single_pred_p (loop->latch)
>       || !single_pred_p (loop->inner->latch))
>     return false;
>-  
>+
>   basic_block outer_exit = single_pred (loop->latch);
>   basic_block inner_exit = single_pred (loop->inner->latch);
> 
>@@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop)
> {
>   unsigned int todo = 0;
>   bool aggressive_if_conv;
>+  struct loop *rloop;
> 
>+ again:
>+  rloop = NULL;
>   ifc_bbs = NULL;
>   any_pred_load_store = false;
>   any_complicated_phi = false;
>@@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop)
>       struct loop *vloop
> 	= (versionable_outer_loop_p (loop_outer (loop))
> 	   ? loop_outer (loop) : loop);
>-      if (!version_loop_for_if_conversion (vloop))
>+      struct loop *nloop = version_loop_for_if_conversion (vloop);
>+      if (nloop == NULL)
> 	goto cleanup;
>+      if (vloop != loop)
>+	{
>+	  /* If versionable_outer_loop_p decided to version the
>+	     outer loop, version also the inner loop of the non-vectorized
>+	     loop copy.  So we transform:
>+	      loop1
>+		loop2
>+	     into:
>+	      if (LOOP_VECTORIZED (1, 3))
>+		{
>+		  loop1
>+		    loop2
>+		}
>+	      else
>+		loop3 (copy of loop1)
>+		  if (LOOP_VECTORIZED (4, 5))
>+		    loop4 (copy of loop2)
>+		  else
>+		    loop5 (copy of loop4)  */
>+	  gcc_assert (nloop->inner && nloop->inner->next == NULL);
>+	  rloop = nloop->inner;
>+	}
>     }
> 
>   /* Now all statements are if-convertible.  Combine all the basic
>@@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop)
>       free (ifc_bbs);
>       ifc_bbs = NULL;
>     }
>+  if (rloop != NULL)
>+    {
>+      loop = rloop;
>+      goto again;
>+    }
> 
>   return todo;
> }
>--- gcc/tree-vectorizer.c.jj	2017-01-04 18:12:49.960881733 +0100
>+++ gcc/tree-vectorizer.c	2017-01-06 09:24:08.411557468 +0100
>@@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr
>       update_stmt (use_stmt);
>     }
> }
>+
> /* Set the uids of all the statements in basic blocks inside loop
>    represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal
>    call guarding the loop which has been if converted.  */
>@@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin
>   struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
> 
>   LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop;
>-  gcc_checking_assert (vect_loop_vectorized_call
>-		       (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
>+  gcc_checking_assert (vect_loop_vectorized_call (scalar_loop)
> 		       == loop_vectorized_call);
>+  /* If we are going to vectorize outer loop, prevent vectorization
>+     of the inner loop in the scalar loop - either the scalar loop is
>+     thrown away, so it is a wasted work, or is used only for
>+     a few iterations.  */
>+  if (scalar_loop->inner)
>+    {
>+      gimple *g = vect_loop_vectorized_call (scalar_loop->inner);
>+      if (g)
>+	{
>+	  arg = gimple_call_arg (g, 0);
>+	  get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true;
>+	  fold_loop_vectorized_call (g, boolean_false_node);
>+	}
>+    }
>   bbs = get_loop_body (scalar_loop);
>   for (i = 0; i < scalar_loop->num_nodes; i++)
>     {
>@@ -534,14 +548,59 @@ vectorize_loops (void)
>      only over initial loops skipping newly generated ones.  */
>   FOR_EACH_LOOP (loop, 0)
>     if (loop->dont_vectorize)
>-      any_ifcvt_loops = true;
>-    else if ((flag_tree_loop_vectorize
>-	      && optimize_loop_nest_for_speed_p (loop))
>-	     || loop->force_vectorize)
>       {
>-	loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
>-	gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
>-vectorize_epilogue:
>+	any_ifcvt_loops = true;
>+	/* If-conversion sometimes versions both the outer loop
>+	   (for the case when outer loop vectorization might be
>+	   desirable) as well as the inner loop in the scalar version
>+	   of the loop.  So we have:
>+	    if (LOOP_VECTORIZED (1, 3))
>+	      {
>+		loop1
>+		  loop2
>+	      }
>+	    else
>+	      loop3 (copy of loop1)
>+		if (LOOP_VECTORIZED (4, 5))
>+		  loop4 (copy of loop2)
>+		else
>+		  loop5 (copy of loop4)
>+	   If FOR_EACH_LOOP gives us loop3 first (which has
>+	   dont_vectorize set), make sure to process loop1 before loop4;
>+	   so that we can prevent vectorization of loop4 if loop1
>+	   is successfully vectorized.  */
>+	if (loop->inner)
>+	  {
>+	    gimple *loop_vectorized_call
>+	      = vect_loop_vectorized_call (loop);
>+	    if (loop_vectorized_call
>+		&& vect_loop_vectorized_call (loop->inner))
>+	      {
>+		tree arg = gimple_call_arg (loop_vectorized_call, 0);
>+		struct loop *vector_loop
>+		  = get_loop (cfun, tree_to_shwi (arg));
>+		if (vector_loop && vector_loop != loop)
>+		  {
>+		    loop = vector_loop;
>+		    /* Make sure we don't vectorize it twice.  */
>+		    loop->dont_vectorize = true;
>+		    goto try_vectorize;
>+		  }
>+	      }
>+	  }
>+      }
>+    else
>+      {
>+	loop_vec_info loop_vinfo, orig_loop_vinfo;
>+	gimple *loop_vectorized_call;
>+       try_vectorize:
>+	if (!((flag_tree_loop_vectorize
>+	       && optimize_loop_nest_for_speed_p (loop))
>+	      || loop->force_vectorize))
>+	  continue;
>+	orig_loop_vinfo = NULL;
>+	loop_vectorized_call = vect_loop_vectorized_call (loop);
>+       vectorize_epilogue:
> 	vect_location = find_loop_location (loop);
>         if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
> 	    && dump_enabled_p ())
>--- gcc/tree-vect-loop-manip.c.jj	2017-01-01 12:45:36.000000000 +0100
>+++ gcc/tree-vect-loop-manip.c	2017-01-06 10:22:43.248168286 +0100
>@@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p)
> }
> 
> 
>-/* Renames the variables in basic block BB.  Allow renaming  of PHI argumnets
>+/* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
>    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
>    true.  */
> 
>@@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb,
> 
>   FOR_EACH_EDGE (e, ei, bb->preds)
>     {
>-      if (!flow_bb_inside_loop_p (loop, e->src)
>-	  && (!rename_from_outer_loop || e->src != outer_loop->header))
>-	continue;
>+      if (!flow_bb_inside_loop_p (loop, e->src))
>+	{
>+	  if (!rename_from_outer_loop)
>+	    continue;
>+	  if (e->src != outer_loop->header)
>+	    {
>+	      if (outer_loop->inner->next)
>+		{
>+		  /* If outer_loop has 2 inner loops, allow there to
>+		     be an extra basic block which decides which of the
>+		     two loops to use using LOOP_VECTORIZED.  */
>+		  if (!single_pred_p (e->src)
>+		      || single_pred (e->src) != outer_loop->header)
>+		    continue;
>+		}
>+	      else
>+		continue;
>+	    }
>+	}
>       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
> 	   gsi_next (&gsi))
>         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
>--- gcc/testsuite/gcc.target/i386/pr78899.c.jj	2017-01-06
>09:24:08.411557468 +0100
>+++ gcc/testsuite/gcc.target/i386/pr78899.c	2017-01-06
>09:24:08.411557468 +0100
>@@ -0,0 +1,27 @@
>+/* PR tree-optimization/78899 */
>+/* { dg-do compile } */
>+/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */
>+
>+#define N 1024
>+#define M 4
>+int p1[N], p2[N], p3[N], c[N];
>+
>+void
>+foo (int n)
>+{
>+  int i, k;
>+  for (k = 0; k < n / M; k++)
>+    {
>+    #pragma omp simd
>+      for (i = 0; i < M; i++)
>+	if (c[k * M + i])
>+	  {
>+	    p1[k * M + i] += 1;
>+	    p2[k * M + i] = p3[k * M + i] + 2;
>+	  }
>+    }
>+}
>+
>+/* Ensure the loop is vectorized.  */
>+/* { dg-final { scan-assembler "vpmaskmov" } } */
>+/* { dg-final { scan-assembler "vpadd" } } */
>--- gcc/testsuite/gcc.dg/gomp/pr68128-1.c.jj	2017-01-04
>18:12:50.033880801 +0100
>+++ gcc/testsuite/gcc.dg/gomp/pr68128-1.c	2017-01-06 09:24:08.419557365
>+0100
>@@ -29,4 +29,4 @@ foo (float *u, float v, float w, float x
>     }
> }
> 
>-/* { dg-final { scan-tree-dump "note: vectorized 1 loops in function"
>"vect" { target i?86-*-* x86_64-*-* } } } */
>+/* { dg-final { scan-tree-dump "note: vectorized \[12] loops in
>function" "vect" { target i?86-*-* x86_64-*-* } } } */
>--- gcc/testsuite/gcc.dg/pr71077.c.jj	2017-01-06 09:24:08.420557352
>+0100
>+++ gcc/testsuite/gcc.dg/pr71077.c	2017-01-06 09:24:08.419557365 +0100
>@@ -0,0 +1,14 @@
>+/* PR c++/71077  */
>+/* { dg-do compile } */
>+/* { dg-options "-O3" } */
>+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } }
>} */
>+
>+void
>+foo (int *a, int n)
>+{
>+  int b, c;
>+  for (b = 0; b < n; b++)
>+    for (c = 0; c < 32; c++)
>+      if ((b & 1U) << c)
>+	a[b + c] = 0;
>+}
>
>	Jakub
