[Vectorizer] Add SLP support for masked loads

Fri Apr 26 13:17:00 GMT 2019

Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com> writes:
> Hi,
>
> Current vectorizer doesn't support masked loads for SLP. We should add that, to
> allow things like:
>
> void
> f (int *restrict x, int *restrict y, int *restrict z, int n)
> {
>   for (int i = 0; i < n; i += 2)
>     {
>       x[i] = y[i] ? z[i] : 1;
>       x[i + 1] = y[i + 1] ? z[i + 1] : 2;
>     }
> }
>
> to be vectorized using contiguous loads rather than LD2 and ST2.
>
> This patch was motivated by SVE, but it is completely generic and should apply
> to any architecture with masked loads.
>
> After the patch is applied, the above code generates this output
> (-march=armv8.2-a+sve -O2 -ftree-vectorize):
>
> 0000000000000000 <f>:
>    0:	7100007f 	cmp	w3, #0x0
>    4:	540002cd 	b.le	5c <f+0x5c>
>    8:	51000464 	sub	w4, w3, #0x1
>    c:	d2800003 	mov	x3, #0x0                   	// #0
>   10:	90000005 	adrp	x5, 0 <f>
>   14:	25d8e3e0 	ptrue	p0.d
>   18:	53017c84 	lsr	w4, w4, #1
>   1c:	910000a5 	add	x5, x5, #0x0
>   20:	11000484 	add	w4, w4, #0x1
>   24:	85c0e0a1 	ld1rd	{z1.d}, p0/z, [x5]
>   28:	2598e3e3 	ptrue	p3.s
>   2c:	d37ff884 	lsl	x4, x4, #1
>   30:	25a41fe2 	whilelo	p2.s, xzr, x4
>   34:	d503201f 	nop
>   38:	a5434820 	ld1w	{z0.s}, p2/z, [x1, x3, lsl #2]
>   3c:	25808c11 	cmpne	p1.s, p3/z, z0.s, #0
>   40:	25808810 	cmpne	p0.s, p2/z, z0.s, #0
>   44:	a5434040 	ld1w	{z0.s}, p0/z, [x2, x3, lsl #2]
>   48:	05a1c400 	sel	z0.s, p1, z0.s, z1.s
>   4c:	e5434800 	st1w	{z0.s}, p2, [x0, x3, lsl #2]
>   50:	04b0e3e3 	incw	x3
>   54:	25a41c62 	whilelo	p2.s, x3, x4
>   58:	54ffff01 	b.ne	38 <f+0x38>  // b.any
>   5c:	d65f03c0 	ret
>
>
> I tested this patch in an aarch64 machine bootstrapping the compiler and
> running the checks.
>
> Alejandro
>
> gcc/Changelog:
>
> 2019-01-16  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
> 	* config/aarch64/aarch64-sve.md (copysign<mode>3): New define_expand.
> 	(xorsign<mode>3): Likewise.
> 	internal-fn.c: Marked mask_load_direct and mask_store_direct as
> 	vectorizable.
> 	tree-data-ref.c (data_ref_compare_tree): Fixed comment typo.
> 	tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to be
> 	combined even if masks different.
> 	(slp_vect_only_p): New function to detect masked loads that are only
> 	vectorizable using SLP.
> 	(vect_analyze_data_ref_accesses): Mark SLP only vectorizable groups.
> 	tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to
> 	dissolve SLP-only vectorizable groups when SLP has been discarded.
> 	(vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when needed.
> 	tree-vect-slp.c (vect_get_and_check_slp_defs): Check masked loads
> 	masks.
> 	(vect_build_slp_tree_1): Fixed comment typo.
> 	(vect_build_slp_tree_2): Include masks from masked loads in SLP tree.
> 	tree-vect-stmts.c (vect_get_vec_defs_for_operand): New function to get
> 	vec_defs for operand with optional SLP and vectype.
> 	(vectorizable_load): Allow vectorizaion of masked loads for SLP only.
> 	tree-vectorizer.h (_stmt_vec_info): Added flag for SLP-only
> 	vectorizable.
> 	tree-vectorizer.c (vec_info::new_stmt_vec_info): Likewise.
>
> gcc/testsuite/Changelog:
>  
> 2019-01-16  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
> 	* gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP
> 	vectorized masked loads.
>
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 4f2ef45..67eee59 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -100,11 +100,11 @@ init_internal_fns ()
>  /* Create static initializers for the information returned by
>     direct_internal_fn.  */
>  #define not_direct { -2, -2, false }
> -#define mask_load_direct { -1, 2, false }
> +#define mask_load_direct { -1, 2, true }
>  #define load_lanes_direct { -1, -1, false }
>  #define mask_load_lanes_direct { -1, -1, false }
>  #define gather_load_direct { -1, -1, false }
> -#define mask_store_direct { 3, 2, false }
> +#define mask_store_direct { 3, 2, true }
>  #define store_lanes_direct { 0, 0, false }
>  #define mask_store_lanes_direct { 0, 0, false }
>  #define scatter_store_direct { 3, 3, false }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> new file mode 100644
> index 0000000..b106cae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> @@ -0,0 +1,74 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +#define MASK_SLP_2(TYPE_COND, ALT_VAL)					\
> +void __attribute__ ((noinline, noclone))				\
> +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y,	\
> +				    TYPE_COND *restrict z, int n)	\
> +{									\
> +  for (int i = 0; i < n; i += 2)					\
> +    {									\
> +      x[i] = y[i] ? z[i] : 1;						\
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;				\
> +    }									\
> +}
> +
> +#define MASK_SLP_4(TYPE_COND, ALT_VAL)					\
> +void __attribute__ ((noinline, noclone))				\
> +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y,	\
> +				    TYPE_COND *restrict z, int n)	\
> +{									\
> +  for (int i = 0; i < n; i += 4)					\
> +    {									\
> +      x[i] = y[i] ? z[i] : 1;						\
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;				\
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;				\
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;				\
> +    }									\
> +}
> +
> +#define MASK_SLP_8(TYPE_COND, ALT_VAL)					\
> +void __attribute__ ((noinline, noclone))				\
> +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y,	\
> +				    TYPE_COND *restrict z, int n)	\
> +{									\
> +  for (int i = 0; i < n; i += 8)					\
> +    {									\
> +      x[i] = y[i] ? z[i] : 1;						\
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;				\
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;				\
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;				\
> +      x[i + 4] = y[i + 4] ? z[i + 4] : 1;				\
> +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;				\
> +      x[i + 6] = y[i + 6] ? z[i + 6] : 1;				\
> +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;				\
> +    }									\
> +}
> +
> +MASK_SLP_2(int8_t, 1)
> +MASK_SLP_2(int8_t, 2)
> +MASK_SLP_2(int, 1)
> +MASK_SLP_2(int, 2)
> +MASK_SLP_2(int64_t, 1)
> +MASK_SLP_2(int64_t, 2)
> +
> +MASK_SLP_4(int8_t, 1)
> +MASK_SLP_4(int8_t, 2)
> +MASK_SLP_4(int, 1)
> +MASK_SLP_4(int, 2)
> +MASK_SLP_4(int64_t, 1)
> +MASK_SLP_4(int64_t, 2)
> +
> +MASK_SLP_8(int8_t, 1)
> +MASK_SLP_8(int8_t, 2)
> +MASK_SLP_8(int, 1)
> +MASK_SLP_8(int, 2)
> +MASK_SLP_8(int64_t, 1)
> +MASK_SLP_8(int64_t, 2)
> +
> +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
> +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
> +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
> +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
> diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
> index 7d1f03c..1833a5f 100644
> --- a/gcc/tree-data-ref.c
> +++ b/gcc/tree-data-ref.c
> @@ -1272,7 +1272,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
>    return dr;
>  }
>  
> -/*  A helper function computes order between two tree epxressions T1 and T2.
> +/*  A helper function computes order between two tree expressions T1 and T2.
>      This is used in comparator functions sorting objects based on the order
>      of tree expressions.  The function returns -1, 0, or 1.  */
>  
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 7bbd47f..8a82147 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -2837,22 +2837,72 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>        if (ifn != gimple_call_internal_fn (call2))
>  	return false;
>  
> -      /* Check that the masks are the same.  Cope with casts of masks,
> +      /* Check that the masks can be combined.  */
> +      tree mask1 = gimple_call_arg (call1, 2);
> +      tree mask2 = gimple_call_arg (call2, 2);
> +      if (!operand_equal_p (mask1, mask2, 0))
> +	{
> +	  /* Stores need identical masks.  */
> +	  if (ifn == IFN_MASK_STORE)
> +	    {
> +	      mask1 = strip_conversion (mask1);
> +	      if (!mask1)
> +		return false;
> +	      mask2 = strip_conversion (mask2);
> +	      if (!mask2)
> +		return false;
> +	      if (!operand_equal_p (mask1, mask2, 0))
> +	      	return false;
> +	    }
> +	  /* Loads are allowed different masks under SLP only.
> +	     (See slp_vect_only_p () below).  */
> +	}
> +      return true;
> +    }
> +
> +  return false;
> +}
> +
> +/* Return true if vectorizable_* routines can handle statements STMT1_INFO
> +   and STMT2_INFO being in a single group for SLP only.  */
> +
> +static bool
> +slp_vect_only_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> +{
> +  if (gimple_assign_single_p (stmt1_info->stmt))
> +    {
> +      gcc_assert (gimple_assign_single_p (stmt2_info->stmt));
> +      return false;
> +    }
> +
> +  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
> +  if (call1 && gimple_call_internal_p (call1))
> +    {
> +      /* Check for two masked loads or two masked stores.  */
> +      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
> +      gcc_assert (call2 && gimple_call_internal_p (call2));
> +      internal_fn ifn = gimple_call_internal_fn (call1);
> +      if (ifn != IFN_MASK_LOAD)
> +	return false;
> +      gcc_assert (ifn == gimple_call_internal_fn (call2));
> +
> +      /* Check if the masks are the same.  Cope with casts of masks,
>  	 like those created by build_mask_conversion.  */
>        tree mask1 = gimple_call_arg (call1, 2);
>        tree mask2 = gimple_call_arg (call2, 2);
>        if (!operand_equal_p (mask1, mask2, 0))
>  	{
> +	  /* This is the only case that is just for SLP: non-identical but
> +	     otherwise slp-compatible masks.  */
>  	  mask1 = strip_conversion (mask1);
>  	  if (!mask1)
> -	    return false;
> +	    return true;
>  	  mask2 = strip_conversion (mask2);
>  	  if (!mask2)
> -	    return false;
> +	    return true;
>  	  if (!operand_equal_p (mask1, mask2, 0))
> -	    return false;
> +	    return true;
>  	}
> -      return true;
>      }
>  
>    return false;

Normally I'd say it would be better to add a bool argument to
can_group_stmts_p that says whether we want non-SLP-only rules,
or perhaps convert the return type to an enum.  But given that the
non-SLP path is going away soon anyway, I guess separate functions
are better despite the cut-&-paste.

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index afbf9a9..754a2e4 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1755,6 +1755,49 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
>    return opt_result::success ();
>  }
>  
> +/* Look for SLP-only access groups and turn each individual access into its own
> +   group.  */
> +static void
> +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
> +{
> +  unsigned int i;
> +  struct data_reference *dr;
> +
> +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
> +
> +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
> +  FOR_EACH_VEC_ELT (datarefs, i, dr)
> +    {
> +      gcc_assert (DR_REF (dr));
> +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
> +
> +      /* Check if the load is a part of an interleaving chain.  */
> +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +	{
> +	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
> +	  unsigned int group_size = DR_GROUP_SIZE (first_element);
> +
> +	  /* Check if SLP-only groups.  */
> +	  if (STMT_VINFO_SLP_VECT_ONLY (first_element))
> +	    {
> +		/* Dissolve the group.  */
> +		STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
> +
> +		stmt_vec_info vinfo = first_element;
> +		while (vinfo)
> +		  {
> +		    stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
> +		    DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
> +		    DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
> +		    DR_GROUP_SIZE (vinfo) = 1;
> +		    DR_GROUP_GAP (vinfo) = group_size - 1;
> +		    vinfo = next;

I think DR_GROUP_FIRST_ELEMENT should be vinfo here, so that it remains
a grouped access with only one element.

> +		  }

Formatting nit: this block is indented two columns too far.

> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 0e15087d..ebc4432 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
>  	{
>  	  internal_fn ifn = gimple_call_internal_fn (stmt);
>  	  commutative_op = first_commutative_argument (ifn);
> +
> +	  /* Masked load, only look at mask.  */
> +	  if (ifn == IFN_MASK_LOAD)
> +	    {
> +	      number_of_oprnds = 1;
> +	      /* Mask operand index.  */
> +	      first_op_idx = 5;
> +	    }
>  	}
>      }
>    else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
> @@ -624,7 +632,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
>     is false then this indicates the comparison could not be
>     carried out or the stmts will never be vectorized by SLP.
>  
> -   Note COND_EXPR is possibly ismorphic to another one after swapping its
> +   Note COND_EXPR is possibly isomorphic to another one after swapping its
>     operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
>     the first stmt by swapping the two operands of comparison; set SWAP[i]
>     to 2 if stmt I is isormorphic to the first stmt by inverting the code
> @@ -1146,13 +1154,23 @@ vect_build_slp_tree_2 (vec_info *vinfo,
>  			      &this_max_nunits, matches, &two_operators))
>      return NULL;
>  
> -  /* If the SLP node is a load, terminate the recursion.  */
> +  /* If the SLP node is a load, terminate the recursion unless masked.  */
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
>        && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>      {
> -      *max_nunits = this_max_nunits;
> -      node = vect_create_new_slp_node (stmts);
> -      return node;
> +      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
> +	{
> +	  /* Masked load.  */
> +	  gcc_assert (gimple_call_internal_p (stmt));
> +	  gcc_assert (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD);

Normal GCC style is to have a single assert with '&&'.

Looks good to me otherwise, but Richard B should have the final say.

Thanks,
Richard