[PATCH] Remove strided grouped store restrictions

Richard Biener <rguenther@suse.de>
Tue Jun 14 14:06:00 GMT 2016


The following patch is similar to the strided grouped load case I fixed
recently; it removes the remaining restrictions and handles all the
missing cases.  The testcase requires the previous dependence fix.
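
As a minimal illustration of the kind of loop this enables, here is a
sketch distilled from the new testcase below (foo is a hypothetical
reduction of the FOO(int, 3) instantiation):

  /* Strided grouped store: the three copies form an SLP group of
     size 3, stored with a runtime stride of s*3.  Because the group
     size does not divide the number of vector lanes, this used to be
     rejected as an "unhandled strided group store".  */
  void
  foo (int *__restrict in, int *__restrict out, int s)
  {
    for (int i = 0; i < 16; i++)
      {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        in += 3;
        out += s * 3;
      }
  }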

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2016-06-14  Richard Biener  <rguenther@suse.de>

	* tree-vect-stmts.c (vectorizable_store): Remove strided grouped
	store restrictions.

	* gcc.dg/vect/slp-45.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 237428)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_store (gimple *stmt, gimple
*** 5234,5239 ****
--- 5297,5303 ----
    enum vect_def_type scatter_idx_dt = vect_unknown_def_type;
    enum vect_def_type scatter_src_dt = vect_unknown_def_type;
    gimple *new_stmt;
+   int vf;
  
    if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
      return false;
*************** vectorizable_store (gimple *stmt, gimple
*** 5270,5276 ****
    unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
    if (loop_vinfo)
!     loop = LOOP_VINFO_LOOP (loop_vinfo);
  
    /* Multiple types in SLP are handled by creating the appropriate number of
       vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
--- 5334,5345 ----
    unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
    if (loop_vinfo)
!     {
!       loop = LOOP_VINFO_LOOP (loop_vinfo);
!       vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
!     }
!   else
!     vf = 1;
  
    /* Multiple types in SLP are handled by creating the appropriate number of
       vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
*************** vectorizable_store (gimple *stmt, gimple
*** 5365,5380 ****
  	    return false;
  	}
  
-       if (STMT_VINFO_STRIDED_P (stmt_info)
- 	  && slp
- 	  && (group_size > nunits
- 	      || nunits % group_size != 0))
- 	{
- 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- 			   "unhandled strided group store\n");
- 	  return false;
- 	}
- 
        if (first_stmt == stmt)
  	{
            /* STMT is the leader of the group. Check the operands of all the
--- 5434,5439 ----
*************** vectorizable_store (gimple *stmt, gimple
*** 5653,5675 ****
           */
  
        unsigned nstores = nunits;
        tree ltype = elem_type;
        if (slp)
  	{
! 	  nstores = nunits / group_size;
! 	  if (group_size < nunits)
! 	    ltype = build_vector_type (elem_type, group_size);
! 	  else
! 	    ltype = vectype;
  	  ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
  	  ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
- 	  group_size = 1;
  	}
  
        ivstep = stride_step;
        ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
! 			    build_int_cst (TREE_TYPE (ivstep),
! 					   ncopies * nstores));
  
        standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  
--- 5712,5742 ----
           */
  
        unsigned nstores = nunits;
+       unsigned lnel = 1;
        tree ltype = elem_type;
        if (slp)
  	{
! 	  if (group_size < nunits
! 	      && nunits % group_size == 0)
! 	    {
! 	      nstores = nunits / group_size;
! 	      lnel = group_size;
! 	      ltype = build_vector_type (elem_type, group_size);
! 	    }
! 	  else if (group_size >= nunits
! 		   && group_size % nunits == 0)
! 	    {
! 	      nstores = 1;
! 	      lnel = nunits;
! 	      ltype = vectype;
! 	    }
  	  ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
  	  ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  	}
  
        ivstep = stride_step;
        ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
! 			    build_int_cst (TREE_TYPE (ivstep), vf));
  
        standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  
*************** vectorizable_store (gimple *stmt, gimple
*** 5700,5705 ****
--- 5767,5775 ----
  	      vect_finish_stmt_generation (stmt, incr, gsi);
  	      running_off = newoff;
  	    }
+ 	  unsigned int group_el = 0;
+ 	  unsigned HOST_WIDE_INT
+ 	    elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  	  for (j = 0; j < ncopies; j++)
  	    {
  	      /* We've set op and dt above, from gimple_assign_rhs1(stmt),
*************** vectorizable_store (gimple *stmt, gimple
*** 5745,5763 ****
  						   NULL_TREE, true,
  						   GSI_SAME_STMT);
  
  		  newref = build2 (MEM_REF, ltype,
! 				   running_off, alias_off);
  
  		  /* And store it to *running_off.  */
  		  assign = gimple_build_assign (newref, elem);
  		  vect_finish_stmt_generation (stmt, assign, gsi);
  
! 		  newoff = copy_ssa_name (running_off, NULL);
! 		  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
! 					      running_off, stride_step);
! 		  vect_finish_stmt_generation (stmt, incr, gsi);
  
! 		  running_off = newoff;
  		  if (g == group_size - 1
  		      && !slp)
  		    {
--- 5815,5841 ----
  						   NULL_TREE, true,
  						   GSI_SAME_STMT);
  
+ 		  tree this_off = build_int_cst (TREE_TYPE (alias_off),
+ 						 group_el * elsz);
  		  newref = build2 (MEM_REF, ltype,
! 				   running_off, this_off);
  
  		  /* And store it to *running_off.  */
  		  assign = gimple_build_assign (newref, elem);
  		  vect_finish_stmt_generation (stmt, assign, gsi);
  
! 		  group_el += lnel;
! 		  if (! slp
! 		      || group_el == group_size)
! 		    {
! 		      newoff = copy_ssa_name (running_off, NULL);
! 		      incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
! 						  running_off, stride_step);
! 		      vect_finish_stmt_generation (stmt, incr, gsi);
  
! 		      running_off = newoff;
! 		      group_el = 0;
! 		    }
  		  if (g == group_size - 1
  		      && !slp)
  		    {
*************** vectorizable_store (gimple *stmt, gimple
*** 5771,5776 ****
--- 5849,5856 ----
  		}
  	    }
  	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+ 	  if (slp)
+ 	    break;
  	}
        return true;
      }
Index: gcc/testsuite/gcc.dg/vect/slp-45.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/slp-45.c	(revision 0)
--- gcc/testsuite/gcc.dg/vect/slp-45.c	(working copy)
***************
*** 0 ****
--- 1,78 ----
+ /* { dg-do run } */
+ /* { dg-require-effective-target vect_int } */
+ /* { dg-additional-options "-O3" } */
+ 
+ #include <string.h>
+ #include "tree-vect.h"
+ 
+ #define FOO(T,N) \
+ void __attribute__((noinline,noclone)) \
+ foo_ ## T ## _ ## N (T * __restrict__ in_, T * __restrict__ out_, int s) \
+ { \
+   T *in = __builtin_assume_aligned (in_, __BIGGEST_ALIGNMENT__); \
+   T *out = __builtin_assume_aligned (out_, __BIGGEST_ALIGNMENT__); \
+   for (int i = 0; i < 16; i++) \
+     { \
+       for (int j = 0; j < N; ++j) \
+         out[j] = in[j]; \
+       in += N; \
+       out += s*N; \
+     } \
+ }
+ 
+ #define TEST(T,N) \
+  do { \
+   memset (out, 0, 4096); \
+   foo_ ## T ## _ ## N ((T *)in, (T *)out, 1); \
+   if (memcmp (in, out, sizeof (T) * 16 * N) != 0) \
+     __builtin_abort (); \
+   for (int i = sizeof (T) * 16 * N; i < 4096; ++i) \
+     if (out[i] != 0) \
+       __builtin_abort (); \
+  } while (0)
+ 
+ FOO(char, 1)
+ FOO(char, 2)
+ FOO(char, 3)
+ FOO(char, 4)
+ FOO(char, 6)
+ FOO(char, 8)
+ FOO(int, 1)
+ FOO(int, 2)
+ FOO(int, 3)
+ FOO(int, 4)
+ FOO(int, 6)
+ FOO(int, 8)
+ FOO(int, 16)
+ 
+ char in[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
+ char out[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
+ 
+ int main()
+ {
+   check_vect ();
+ 
+   for (int i = 0; i < 4096; ++i)
+     {
+       in[i] = i;
+       __asm__ volatile ("" : : : "memory");
+     }
+ 
+   TEST(char, 1);
+   TEST(char, 2);
+   TEST(char, 3);
+   TEST(char, 4);
+   TEST(char, 6);
+   TEST(char, 8);
+   TEST(int, 1);
+   TEST(int, 2);
+   TEST(int, 3);
+   TEST(int, 4);
+   TEST(int, 6);
+   TEST(int, 8);
+   TEST(int, 16);
+ 
+   return 0;
+ }
+ 
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 13 "vect" } } */


