Fix PR43432, vectorize backwards-stepping loads

Michael Matz <matz@suse.de>
Thu Sep 9 15:01:00 GMT 2010


Hi,

On Wed, 8 Sep 2010, Ira Rosen wrote:

> > Regstrapped on x86_64-linux.  I've had to add x86_64-*-* to the
> > target-supports.exp vect_perm recognition, which in turn makes
> > vect/slp-perm-8.c and vect/slp-perm-9.c fail.  That's most probably
> > because while x86_64 supports permutation of 4- and 8-byte elements, it
> > doesn't do so for char (slp-perm-8.c) or short (slp-perm-9.c).  I'm
> > going to investigate this, but seek feedback on the patch itself in the
> > meantime.
> >
> 
> Looks good to me.

Okay, so here's the complete patch.  Functionality is unchanged, but I 
extended the testsuite with vect_perm_byte and vect_perm_short 
predicates (used in slp-perm-8.c and slp-perm-9.c), so that I can 
enable vect_perm on x86_64.  With this version I get no regressions on 
x86_64-linux (all default languages).  Okay for trunk?
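
For illustration (not part of the patch), here's roughly what the
transformation amounts to for the pr43432.c testcase below, written by
hand with SSE intrinsics.  The function name, the epilogue loop and the
use of _mm_shuffle_ps are my own stand-ins for whatever permute the
target's builtin_vec_perm hook actually selects:

#include <xmmintrin.h>

/* Hand-written sketch of a vectorized backwards-stepping float load,
   VF = 4.  Iterations i..i+3 read src1[-i] down to src1[-i-3], so we
   load that range once, starting at the lowest address (offset
   -(VF-1)), and reverse the elements in-register.  */
void
vector_fmul_reverse_sse (float *dst, const float *src0, const float *src1,
                         int len)
{
  int i;
  src1 += len - 1;
  for (i = 0; i + 4 <= len; i += 4)
    {
      __m128 rev = _mm_loadu_ps (src1 - i - 3);
      /* Reverse the element order: {a,b,c,d} -> {d,c,b,a}.  */
      rev = _mm_shuffle_ps (rev, rev, _MM_SHUFFLE (0, 1, 2, 3));
      __m128 fwd = _mm_loadu_ps (src0 + i);
      _mm_storeu_ps (dst + i, _mm_mul_ps (fwd, rev));
    }
  for (; i < len; i++)      /* scalar epilogue */
    dst[i] = src0[i] * src1[-i];
}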


Ciao,
Michael.
-- 
	PR tree-optimization/43432
	* tree-vect-data-refs.c (vect_analyze_data_ref_access):
	Accept backwards consecutive accesses.
	(vect_create_data_ref_ptr): If the step is negative, generate
	decreasing IVs.
	* tree-vect-stmts.c (vectorizable_store): Reject negative steps.
	(perm_mask_for_reverse, reverse_vec_elements): New functions.
	(vectorizable_load): Handle loads with negative steps when easily
	possible.

testsuite/
	PR tree-optimization/43432
	* lib/target-supports.exp (check_effective_target_vect_perm_byte,
	check_effective_target_vect_perm_short): New predicates.
	(check_effective_target_vect_perm): Include x86_64.
	* gcc.dg/vect/pr43432.c: New test.
	* gcc.dg/vect/vect-114.c: Adjust.
	* gcc.dg/vect/vect-15.c: Ditto.
	* gcc.dg/vect/slp-perm-8.c: Use new predicate.
	* gcc.dg/vect/slp-perm-9.c: Ditto.
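
A note on the new vect_perm_byte / vect_perm_short predicates: they
deliberately don't include x86_64.  Presumably that's because baseline
x86_64 (SSE2) only has whole-element shuffles for 4- and 8-byte
elements (shufps/shufpd/pshufd), while a general byte or 16-bit permute
needs SSSE3's pshufb.  A hand-written illustration (the function name
is mine; compile with -mssse3):

#include <tmmintrin.h>   /* SSSE3 */

/* Reverse the 16 bytes of a 128-bit vector with PSHUFB:
   result[i] = x[idx[i]], with idx[i] = 15 - i.  */
static __m128i
reverse_bytes (__m128i x)
{
  const __m128i idx = _mm_setr_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
                                     7, 6, 5, 4, 3, 2, 1, 0);
  return _mm_shuffle_epi8 (x, idx);
}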

Index: tree-vect-data-refs.c
===================================================================
--- tree-vect-data-refs.c	(revision 163997)
+++ tree-vect-data-refs.c	(working copy)
@@ -2282,7 +2282,9 @@ vect_analyze_data_ref_access (struct dat
     }
 
   /* Consecutive?  */
-  if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
+  if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
+      || (dr_step < 0
+	  && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
     {
       /* Mark that it is not interleaving.  */
       DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL;
@@ -2954,6 +2956,7 @@ vect_create_data_ref_ptr (gimple stmt, s
   tree vptr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
+  bool negative;
   tree indx_before_incr, indx_after_incr;
   gimple incr;
   tree step;
@@ -2986,6 +2989,7 @@ vect_create_data_ref_ptr (gimple stmt, s
     *inv_p = true;
   else
     *inv_p = false;
+  negative = tree_int_cst_compare (step, size_zero_node) < 0;
 
   /* Create an expression for the first address accessed by this load
      in LOOP.  */
@@ -3145,6 +3149,8 @@ vect_create_data_ref_ptr (gimple stmt, s
 	 LOOP is zero. In this case the step here is also zero.  */
       if (*inv_p)
 	step = size_zero_node;
+      else if (negative)
+	step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
 
       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 163999)
+++ tree-vect-stmts.c	(working copy)
@@ -3134,6 +3134,13 @@ vectorizable_store (gimple stmt, gimple_
   if (!STMT_VINFO_DATA_REF (stmt_info))
     return false;
 
+  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "negative step for store.");
+      return false;
+    }
+
   if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
     {
       strided_store = true;
@@ -3412,6 +3419,68 @@ vectorizable_store (gimple stmt, gimple_
   return true;
 }
 
+/* Given a vector type VECTYPE, return a builtin DECL to be used for
+   vector permutation, and store into *MASK (if non-NULL) a mask that
+   reverses the vector elements.  If that is impossible, return NULL
+   and leave *MASK unchanged.  */
+
+static tree
+perm_mask_for_reverse (tree vectype, tree *mask)
+{
+  tree builtin_decl;
+  tree mask_element_type, mask_type;
+  tree mask_vec = NULL;
+  int i;
+  int nunits;
+  if (!targetm.vectorize.builtin_vec_perm)
+    return NULL;
+
+  builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
+                                                     &mask_element_type);
+  if (!builtin_decl || !mask_element_type)
+    return NULL;
+
+  mask_type = get_vectype_for_scalar_type (mask_element_type);
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  if (TYPE_VECTOR_SUBPARTS (vectype) != TYPE_VECTOR_SUBPARTS (mask_type))
+    return NULL;
+
+  for (i = 0; i < nunits; i++)
+    mask_vec = tree_cons (NULL, build_int_cst (mask_element_type, i), mask_vec);
+  mask_vec = build_vector (mask_type, mask_vec);
+
+  if (!targetm.vectorize.builtin_vec_perm_ok (vectype, mask_vec))
+    return NULL;
+  if (mask)
+    *mask = mask_vec;
+  return builtin_decl;
+}
+
+/* Given a vector variable X that was generated for the scalar LHS of
+   STMT, generate instructions to reverse the vector elements of X,
+   insert them at *GSI and return the permuted vector variable.  */
+
+static tree
+reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi)
+{
+  tree vectype = TREE_TYPE (x);
+  tree mask_vec, builtin_decl;
+  tree perm_dest, data_ref;
+  gimple perm_stmt;
+
+  builtin_decl = perm_mask_for_reverse (vectype, &mask_vec);
+
+  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+
+  /* Generate the permute statement.  */
+  perm_stmt = gimple_build_call (builtin_decl, 3, x, x, mask_vec);
+  data_ref = make_ssa_name (perm_dest, perm_stmt);
+  gimple_call_set_lhs (perm_stmt, data_ref);
+  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+
+  return data_ref;
+}
+
 /* vectorizable_load.
 
    Check if STMT reads a non scalar data-ref (array/pointer/structure) that
@@ -3454,6 +3523,7 @@ vectorizable_load (gimple stmt, gimple_s
   gimple first_stmt;
   tree scalar_type;
   bool inv_p;
+  bool negative;
   bool compute_in_loop = false;
   struct loop *at_loop;
   int vec_num;
@@ -3516,6 +3586,14 @@ vectorizable_load (gimple stmt, gimple_s
   if (!STMT_VINFO_DATA_REF (stmt_info))
     return false;
 
+  negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
+  if (negative && ncopies > 1)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "multiple types with negative step.");
+      return false;
+    }
+
   scalar_type = TREE_TYPE (DR_REF (dr));
   mode = TYPE_MODE (vectype);
 
@@ -3550,6 +3628,25 @@ vectorizable_load (gimple stmt, gimple_s
 	return false;
     }
 
+  if (negative)
+    {
+      gcc_assert (!strided_load);
+      alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+      if (alignment_support_scheme != dr_aligned
+	  && alignment_support_scheme != dr_unaligned_supported)
+	{
+	  if (vect_print_dump_info (REPORT_DETAILS))
+	    fprintf (vect_dump, "negative step but alignment required.");
+	  return false;
+	}
+      if (!perm_mask_for_reverse (vectype, NULL))
+	{
+	  if (vect_print_dump_info (REPORT_DETAILS))
+	    fprintf (vect_dump, "negative step and reversing not supported.");
+	  return false;
+	}
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
@@ -3724,6 +3821,9 @@ vectorizable_load (gimple stmt, gimple_s
   else
     at_loop = loop;
 
+  if (negative)
+    offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
+
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
@@ -3912,6 +4012,12 @@ vectorizable_load (gimple stmt, gimple_s
 		gcc_unreachable (); /* FORNOW. */
 	    }
 
+	  if (negative)
+	    {
+	      new_temp = reverse_vec_elements (new_temp, stmt, gsi);
+	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
+	    }
+
 	  /* Collect vector loads and later create their permutation in
 	     vect_transform_strided_load ().  */
           if (strided_load || slp_perm)
Index: testsuite/gcc.dg/vect/vect-114.c
===================================================================
--- testsuite/gcc.dg/vect/vect-114.c	(revision 163997)
+++ testsuite/gcc.dg/vect/vect-114.c	(working copy)
@@ -34,6 +34,7 @@ int main (void)
   return main1 ();
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! vect_perm } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
Index: testsuite/gcc.dg/vect/slp-perm-8.c
===================================================================
--- testsuite/gcc.dg/vect/slp-perm-8.c	(revision 163997)
+++ testsuite/gcc.dg/vect/slp-perm-8.c	(working copy)
@@ -53,7 +53,7 @@ int main (int argc, const char* argv[])
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm_byte } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm_byte } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
Index: testsuite/gcc.dg/vect/pr43432.c
===================================================================
--- testsuite/gcc.dg/vect/pr43432.c	(revision 0)
+++ testsuite/gcc.dg/vect/pr43432.c	(revision 0)
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-vect-details" } */
+
+
+void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1,
+int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = src0[i] * src1[-i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/gcc.dg/vect/slp-perm-9.c
===================================================================
--- testsuite/gcc.dg/vect/slp-perm-9.c	(revision 163997)
+++ testsuite/gcc.dg/vect/slp-perm-9.c	(working copy)
@@ -54,7 +54,7 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm_short } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
Index: testsuite/gcc.dg/vect/vect-15.c
===================================================================
--- testsuite/gcc.dg/vect/vect-15.c	(revision 163997)
+++ testsuite/gcc.dg/vect/vect-15.c	(working copy)
@@ -35,5 +35,5 @@ int main (void)
   return main1 ();
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
Index: testsuite/lib/target-supports.exp
===================================================================
--- testsuite/lib/target-supports.exp	(revision 163997)
+++ testsuite/lib/target-supports.exp	(working copy)
@@ -2366,7 +2366,8 @@ proc check_effective_target_vect_perm {
     } else {
         set et_vect_perm_saved 0
         if { [istarget powerpc*-*-*]
-             || [istarget spu-*-*] } {
+             || [istarget spu-*-*]
+             || [istarget x86_64-*-*] } {
             set et_vect_perm_saved 1
         }
     }
@@ -2374,6 +2375,48 @@ proc check_effective_target_vect_perm {
     return $et_vect_perm_saved
 }
 
+# Return 1 if the target plus current options supports vector permutation
+# on byte-sized elements, 0 otherwise.
+#
+# This won't change for different subtargets, so cache the result.
+
+proc check_effective_target_vect_perm_byte { } {
+    global et_vect_perm_byte_saved
+
+    if [info exists et_vect_perm_byte_saved] {
+        verbose "check_effective_target_vect_perm_byte: using cached result" 2
+    } else {
+        set et_vect_perm_byte_saved 0
+        if { [istarget powerpc*-*-*]
+             || [istarget spu-*-*] } {
+            set et_vect_perm_byte_saved 1
+        }
+    }
+    verbose "check_effective_target_vect_perm_byte: returning $et_vect_perm_byte_saved" 2
+    return $et_vect_perm_byte_saved
+}
+
+# Return 1 if the target plus current options supports vector permutation
+# on short-sized elements, 0 otherwise.
+#
+# This won't change for different subtargets, so cache the result.
+
+proc check_effective_target_vect_perm_short { } {
+    global et_vect_perm_short_saved
+
+    if [info exists et_vect_perm_short_saved] {
+        verbose "check_effective_target_vect_perm_short: using cached result" 2
+    } else {
+        set et_vect_perm_short_saved 0
+        if { [istarget powerpc*-*-*]
+             || [istarget spu-*-*] } {
+            set et_vect_perm_short_saved 1
+        }
+    }
+    verbose "check_effective_target_vect_perm_short: returning $et_vect_perm_short_saved" 2
+    return $et_vect_perm_short_saved
+}
+
 # Return 1 if the target plus current options supports a vector
 # widening summation of *short* args into *int* result, 0 otherwise.
 #


