This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Remove strided SLP load vectorization restriction


Currently we only handle strided SLP loads with
group_size <= nunits && nunits % group_size == 0.  That's overly
restrictive, as we can chunk loads with
group_size > nunits && group_size % nunits == 0 and handle all
other cases by constructing the vector from scalars (as we'd do for
non-SLP).

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2016-06-08  Richard Biener  <rguenther@suse.de>

	* tree-vect-stmts.c (vectorizable_load): Remove restrictions
	on strided SLP loads and fall back to scalar loads in case
	we can't chunk them.

	* gcc.dg/vect/slp-43.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 237205)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple *stmt, gimple_
*** 6440,6456 ****
  	}
      }
    else if (STMT_VINFO_STRIDED_P (stmt_info))
!     {
!       if (grouped_load
! 	  && slp
! 	  && (group_size > nunits
! 	      || nunits % group_size != 0))
! 	{
! 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 			   "unhandled strided group load\n");
! 	  return false;
! 	}
!     }
    else
      {
        negative = tree_int_cst_compare (nested_in_vect_loop
--- 6440,6446 ----
  	}
      }
    else if (STMT_VINFO_STRIDED_P (stmt_info))
!     ;
    else
      {
        negative = tree_int_cst_compare (nested_in_vect_loop
*************** vectorizable_load (gimple *stmt, gimple_
*** 6744,6759 ****
        running_off = offvar;
        alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
        int nloads = nunits;
        tree ltype = TREE_TYPE (vectype);
        auto_vec<tree> dr_chain;
        if (slp)
  	{
! 	  nloads = nunits / group_size;
! 	  if (group_size < nunits)
! 	    ltype = build_vector_type (TREE_TYPE (vectype), group_size);
! 	  else
! 	    ltype = vectype;
! 	  ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
  	  /* For SLP permutation support we need to load the whole group,
  	     not only the number of vector stmts the permutation result
  	     fits in.  */
--- 6734,6762 ----
        running_off = offvar;
        alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
        int nloads = nunits;
+       int lnel = 1;
        tree ltype = TREE_TYPE (vectype);
        auto_vec<tree> dr_chain;
        if (slp)
  	{
! 	  if (group_size < nunits
! 	      && nunits % group_size == 0)
! 	    {
! 	      nloads = nunits / group_size;
! 	      lnel = group_size;
! 	      ltype = build_vector_type (TREE_TYPE (vectype), group_size);
! 	      ltype = build_aligned_type (ltype,
! 					  TYPE_ALIGN (TREE_TYPE (vectype)));
! 	    }
! 	  else if (group_size >= nunits
! 		   && group_size % nunits == 0)
! 	    {
! 	      nloads = 1;
! 	      lnel = nunits;
! 	      ltype = vectype;
! 	      ltype = build_aligned_type (ltype,
! 					  TYPE_ALIGN (TREE_TYPE (vectype)));
! 	    }
  	  /* For SLP permutation support we need to load the whole group,
  	     not only the number of vector stmts the permutation result
  	     fits in.  */
*************** vectorizable_load (gimple *stmt, gimple_
*** 6765,6812 ****
  	  else
  	    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  	}
        for (j = 0; j < ncopies; j++)
  	{
- 	  tree vec_inv;
- 
  	  if (nloads > 1)
  	    {
! 	      vec_alloc (v, nloads);
! 	      for (i = 0; i < nloads; i++)
  		{
! 		  tree newref, newoff;
! 		  gimple *incr;
! 		  newref = build2 (MEM_REF, ltype, running_off, alias_off);
! 
! 		  newref = force_gimple_operand_gsi (gsi, newref, true,
! 						     NULL_TREE, true,
! 						     GSI_SAME_STMT);
! 		  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
! 		  newoff = copy_ssa_name (running_off);
! 		  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
! 					      running_off, stride_step);
  		  vect_finish_stmt_generation (stmt, incr, gsi);
  
  		  running_off = newoff;
  		}
- 
- 	      vec_inv = build_constructor (vectype, v);
- 	      new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
- 	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
  	    }
! 	  else
  	    {
! 	      new_stmt = gimple_build_assign (make_ssa_name (ltype),
! 					      build2 (MEM_REF, ltype,
! 						      running_off, alias_off));
! 	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
! 
! 	      tree newoff = copy_ssa_name (running_off);
! 	      gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
! 					  running_off, stride_step);
! 	      vect_finish_stmt_generation (stmt, incr, gsi);
! 
! 	      running_off = newoff;
  	    }
  
  	  if (slp)
--- 6768,6810 ----
  	  else
  	    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  	}
+       int group_el = 0;
+       unsigned HOST_WIDE_INT
+ 	elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
        for (j = 0; j < ncopies; j++)
  	{
  	  if (nloads > 1)
+ 	    vec_alloc (v, nloads);
+ 	  for (i = 0; i < nloads; i++)
  	    {
! 	      tree this_off = build_int_cst (TREE_TYPE (alias_off),
! 					     group_el * elsz);
! 	      new_stmt = gimple_build_assign (make_ssa_name (ltype),
! 					      build2 (MEM_REF, ltype,
! 						      running_off, this_off));
! 	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
! 	      if (nloads > 1)
! 		CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
! 					gimple_assign_lhs (new_stmt));
! 
! 	      group_el += lnel;
! 	      if (! slp
! 		  || group_el == group_size)
  		{
! 		  tree newoff = copy_ssa_name (running_off);
! 		  gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
! 						      running_off, stride_step);
  		  vect_finish_stmt_generation (stmt, incr, gsi);
  
  		  running_off = newoff;
+ 		  group_el = 0;
  		}
  	    }
! 	  if (nloads > 1)
  	    {
! 	      tree vec_inv = build_constructor (vectype, v);
! 	      new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
! 	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
  	    }
  
  	  if (slp)
Index: gcc/testsuite/gcc.dg/vect/slp-43.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/slp-43.c	(revision 0)
--- gcc/testsuite/gcc.dg/vect/slp-43.c	(revision 0)
***************
*** 0 ****
--- 1,78 ----
+ /* { dg-do run } */
+ /* { dg-require-effective-target vect_int } */
+ /* { dg-additional-options "-O3" } */
+ 
+ #include <string.h>
+ #include "tree-vect.h"
+ 
+ #define FOO(T,N) \
+ void __attribute__((noinline,noclone)) \
+ foo_ ## T ## _ ## N (T * __restrict__ in_, T * __restrict__ out_, int s) \
+ { \
+   T *in = __builtin_assume_aligned (in_, __BIGGEST_ALIGNMENT__); \
+   T *out = __builtin_assume_aligned (out_, __BIGGEST_ALIGNMENT__); \
+   for (int i = 0; i < 16; i++) \
+     { \
+       for (int j = 0; j < N; ++j) \
+         out[j] = in[j]; \
+       in += s*N; \
+       out += N; \
+     } \
+ }
+ 
+ #define TEST(T,N) \
+  do { \
+   memset (out, 0, 4096); \
+   foo_ ## T ## _ ## N ((T *)in, (T *)out, 1); \
+   if (memcmp (in, out, sizeof (T) * 16 * N) != 0) \
+     __builtin_abort (); \
+   for (int i = sizeof (T) * 16 * N; i < 4096; ++i) \
+     if (out[i] != 0) \
+       __builtin_abort (); \
+  } while (0)
+ 
+ FOO(char, 1)
+ FOO(char, 2)
+ FOO(char, 3)
+ FOO(char, 4)
+ FOO(char, 6)
+ FOO(char, 8)
+ FOO(int, 1)
+ FOO(int, 2)
+ FOO(int, 3)
+ FOO(int, 4)
+ FOO(int, 6)
+ FOO(int, 8)
+ FOO(int, 16)
+ 
+ char in[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
+ char out[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
+ 
+ int main()
+ {
+   check_vect ();
+ 
+   for (int i = 0; i < 4096; ++i)
+     {
+       in[i] = i;
+       __asm__ volatile ("" : : : "memory");
+     }
+ 
+   TEST(char, 1);
+   TEST(char, 2);
+   TEST(char, 3);
+   TEST(char, 4);
+   TEST(char, 6);
+   TEST(char, 8);
+   TEST(int, 1);
+   TEST(int, 2);
+   TEST(int, 3);
+   TEST(int, 4);
+   TEST(int, 6);
+   TEST(int, 8);
+   TEST(int, 16);
+ 
+   return 0;
+ }
+ 
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 13 "vect" } } */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]