[PATCH] Fix PR66510

Richard Biener rguenther@suse.de
Thu Jun 18 09:41:00 GMT 2015


The testcase shows a series of interesting issues in the vectorizer.
First of all there is a disconnect between the number of expected
vector statements generated for the load and the permutation result,
second, the vectorizer alignment code apparently didn't expect a
non-vector-size-multiple step for loop vectorization.  To fix the
second I needed to move alignment check and enhancement after
we discover the final vectorization factor (with the SLP decision
factored in).  In theory this would now also allow to defer the
decision on whether we have to do epilogue peeling (I put that on
my TODO).

The testcase also shows that it might be profitable to have an
additional path for vectorizing this kind of permutations
(as it is just "shifting"), namely by performing a load with
adjusted address (I put that on my TODO).

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-06-18  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/66510
	* tree-vect-stmts.c (vectorizable_load): Properly compute the
	number of vector loads for SLP permuted loads.
	* tree-vect-data-refs.c (vect_compute_data_ref_alignment): Also
	check the stride for loop vectorization.
	(vect_enhance_data_refs_alignment): Deal with SLP adjusted
	vectorization factor.
	(vect_analyze_group_access): If the group size is not a power
	of two require an epilogue loop.
	* tree-vect-loop.c (vect_analyze_loop_2): Move alignment
	computation and optimization and alias test pruning after final
	vectorization factor computation.
	* tree-vect-slp.c (vect_build_slp_tree_1): Remove check on
	vector alignment.
	(vect_transform_slp_perm_load): Properly compute the original
	number of vector load stmts.

	* gcc.dg/vect/slp-perm-12.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 224551)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -6424,7 +6424,13 @@ vectorizable_load (gimple stmt, gimple_s
       if (slp)
 	{
 	  grouped_load = false;
-	  vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+	  /* For SLP permutation support we need to load the whole group,
+	     not only the number of vector stmts the permutation result
+	     fits in.  */
+	  if (slp_perm)
+	    vec_num = (group_size * vf + nunits - 1) / nunits;
+	  else
+	    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
 	  group_gap_adj = vf * group_size - nunits * vec_num;
     	}
       else
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 224551)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -693,21 +693,22 @@ vect_compute_data_ref_alignment (struct
 	}
     }
 
-  /* Similarly, if we're doing basic-block vectorization, we can only use
-     base and misalignment information relative to an innermost loop if the
-     misalignment stays the same throughout the execution of the loop.
-     As above, this is the case if the stride of the dataref evenly divides
-     by the vector size.  */
-  if (!loop)
+  /* Similarly we can only use base and misalignment information relative to
+     an innermost loop if the misalignment stays the same throughout the
+     execution of the loop.  As above, this is the case if the stride of
+     the dataref evenly divides by the vector size.  */
+  else
     {
       tree step = DR_STEP (dr);
+      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
 
       if (tree_fits_shwi_p (step)
-	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
+	  && ((tree_to_shwi (step) * vf)
+	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-	                     "SLP: step doesn't divide the vector-size.\n");
+	                     "step doesn't divide the vector-size.\n");
 	  misalign = NULL_TREE;
 	}
     }
@@ -1442,7 +1443,13 @@ vect_enhance_data_refs_alignment (loop_v
                  We do this automtically for cost model, since we calculate cost
                  for every peeling option.  */
               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
-                possible_npeel_number = vf /nelements;
+		{
+		  if (STMT_SLP_TYPE (stmt_info))
+		    possible_npeel_number
+		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
+		  else
+		    possible_npeel_number = vf / nelements;
+		}
 
               /* Handle the aligned case. We may decide to align some other
                  access, making DR unaligned.  */
@@ -1455,7 +1462,6 @@ vect_enhance_data_refs_alignment (loop_v
 
               for (j = 0; j < possible_npeel_number; j++)
                 {
-                  gcc_assert (npeel_tmp <= vf);
                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
                   npeel_tmp += nelements;
                 }
@@ -2232,8 +2238,13 @@ vect_analyze_group_access (struct data_r
             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
         }
 
-      /* There is a gap in the end of the group.  */
-      if (groupsize - last_accessed_element > 0 && loop_vinfo)
+      /* If there is a gap in the end of the group or the group size cannot
+         be made a multiple of the vector element count then we access excess
+	 elements in the last iteration and thus need to peel that off.  */
+      if (loop_vinfo
+	  && (groupsize - last_accessed_element > 0
+	      || exact_log2 (groupsize) == -1))
+
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	(revision 224551)
+++ gcc/tree-vect-loop.c	(working copy)
@@ -1793,6 +1793,22 @@ vect_analyze_loop_2 (loop_vec_info loop_
       return false;
     }
 
+  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
+  ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
+  if (!ok)
+    return false;
+
+  /* If there are any SLP instances mark them as pure_slp.  */
+  bool slp = vect_make_slp_decision (loop_vinfo);
+  if (slp)
+    {
+      /* Find stmts that need to be both vectorized and SLPed.  */
+      vect_detect_hybrid_slp (loop_vinfo);
+
+      /* Update the vectorization factor based on the SLP decision.  */
+      vect_update_vf_for_slp (loop_vinfo);
+    }
+
   /* Analyze the alignment of the data-refs in the loop.
      Fail if a data reference is found that cannot be vectorized.  */
 
@@ -1832,31 +1848,17 @@ vect_analyze_loop_2 (loop_vec_info loop_
       return false;
     }
 
-  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
-  ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
-  if (ok)
+  if (slp)
     {
-      /* If there are any SLP instances mark them as pure_slp.  */
-      if (vect_make_slp_decision (loop_vinfo))
-	{
-	  /* Find stmts that need to be both vectorized and SLPed.  */
-	  vect_detect_hybrid_slp (loop_vinfo);
-
-	  /* Update the vectorization factor based on the SLP decision.  */
-	  vect_update_vf_for_slp (loop_vinfo);
-
-	  /* Analyze operations in the SLP instances.  Note this may
-	     remove unsupported SLP instances which makes the above
-	     SLP kind detection invalid.  */
-	  unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
-	  vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
-				       LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
-	  if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
-	    return false;
-	}
+      /* Analyze operations in the SLP instances.  Note this may
+	 remove unsupported SLP instances which makes the above
+	 SLP kind detection invalid.  */
+      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
+      vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
+				   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
+      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
+	return false;
     }
-  else
-    return false;
 
   /* Scan all the remaining operations in the loop that are not subject
      to SLP and make sure they are vectorizable.  */
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	(revision 224551)
+++ gcc/tree-vect-slp.c	(working copy)
@@ -487,9 +487,8 @@ vect_build_slp_tree_1 (loop_vec_info loo
   int icode;
   machine_mode optab_op2_mode;
   machine_mode vec_mode;
-  struct data_reference *first_dr;
   HOST_WIDE_INT dummy;
-  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+  gimple first_load = NULL, prev_first_load = NULL;
   tree cond;
 
   /* For every stmt in NODE find its def stmt/s.  */
@@ -787,7 +786,6 @@ vect_build_slp_tree_1 (loop_vec_info loo
                   return false;
                 }
 
-	      old_first_load = first_load;
               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
               if (prev_first_load)
                 {
@@ -811,30 +809,6 @@ vect_build_slp_tree_1 (loop_vec_info loo
                 }
               else
                 prev_first_load = first_load;
-
-	      /* In some cases a group of loads is just the same load
-		 repeated N times.  Only analyze its cost once.  */
-              if (first_load == stmt && old_first_load != first_load)
-                {
-                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-                  if (vect_supportable_dr_alignment (first_dr, false)
-                      == dr_unaligned_unsupported)
-                    {
-                      if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-					   vect_location, 
-					   "Build SLP failed: unsupported "
-					   "unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-					    stmt, 0);
-                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                        }
-		      /* Fatal mismatch.  */
-		      matches[0] = false;
-                      return false;
-                    }
-                }
            }
         } /* Grouped access.  */
       else
@@ -3203,6 +3177,11 @@ vect_transform_slp_perm_load (slp_tree n
   bool needs_first_vector = false;
   machine_mode mode;
 
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+
   mode = TYPE_MODE (vectype);
 
   if (!can_vec_perm_p (mode, false, NULL))
@@ -3228,8 +3207,10 @@ vect_transform_slp_perm_load (slp_tree n
 
   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
      unrolling factor.  */
-  orig_vec_stmts_num = group_size *
-                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  orig_vec_stmts_num
+    = (STMT_VINFO_GROUP_SIZE (stmt_info)
+       * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+       + nunits - 1) / nunits;
   if (orig_vec_stmts_num == 1)
     only_one_vec = true;
 
@@ -3237,11 +3218,6 @@ vect_transform_slp_perm_load (slp_tree n
      relatively to SLP_NODE_INSTANCE unrolling factor.  */
   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
 
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
-
   /* Generate permutation masks for every NODE. Number of masks for each NODE
      is equal to GROUP_SIZE.
      E.g., we have a group of three nodes with three loads from the same
Index: gcc/testsuite/gcc.dg/vect/slp-perm-12.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/slp-perm-12.c	(revision 0)
+++ gcc/testsuite/gcc.dg/vect/slp-perm-12.c	(revision 0)
@@ -0,0 +1,52 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+extern void abort (void);
+
+unsigned char a[64];
+short b[88];
+
+void __attribute__((noinline))
+test(unsigned char * __restrict__ dst, short * __restrict__ tptr)
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    {
+      dst[0] = (tptr[0] - tptr[0 + 3]);
+      dst[1] = (tptr[1] - tptr[1 + 3]);
+      dst[2] = (tptr[2] - tptr[2 + 3]);
+      dst[3] = (tptr[3] - tptr[3 + 3]);
+      dst[4] = (tptr[4] - tptr[4 + 3]);
+      dst[5] = (tptr[5] - tptr[5 + 3]);
+      dst[6] = (tptr[6] - tptr[6 + 3]);
+      dst[7] = (tptr[7] - tptr[7 + 3]);
+      dst += 8;
+      tptr += 11;
+    }
+}
+
+int main()
+{
+  int i;
+
+  check_vect ();
+
+  for (i = 0; i < 88; ++i)
+    {
+      b[i] = i;
+      __asm__ volatile ("");
+    }
+
+  test (a, b);
+
+  for (i = 0; i < 64; ++i)
+    if (a[i] != 253)
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */



More information about the Gcc-patches mailing list