This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64

From: James Greenhalgh <james dot greenhalgh at arm dot com>
To: Pawel Kupidura <pawel dot kupidura at arm dot com>
Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, rguenther at suse dot de, ook at ucw dot cz
Date: Fri, 18 Sep 2015 11:43:51 +0100
Subject: Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64
Authentication-results: sourceware.org; auth=none
References: <55FBE672 dot 2000905 at arm dot com>
On Fri, Sep 18, 2015 at 11:24:50AM +0100, Pawel Kupidura wrote:
> This patch uses max reductions to emulate aligned masked loads on AArch64.
> It reduces the mask to a scalar that is nonzero if any mask element is true,
> then uses that scalar to select between the real address and a scratchpad
> address.
> 
> The idea is that if the vector load is aligned, it cannot cross a page
> boundary and so cannot partially fault.  It is safe to load from the
> address (and use only some of the result) if any mask element is true.
> 
> The patch provided a 15% speed improvement for simple microbenchmarks.
> 
> There were several spec2k6 benchmarks affected by the patch: 400.perlbench,
> 403.gcc, 436.cactusADM, 454.calculix and 464.h264.  However, the changes
> had no measurable effect on performance.
> 
> Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and 
> arm-linux-gnueabi.

Hi Pawel, this patch doesn't look AArch64-specific to me. You will probably
get more traction with reviews if you post it tagged appropriately and
with the relevant maintainers on CC - in this case, as it is an
auto-vectorizer patch, Richard Biener and Zdenek Dvorak.

It is also customary to include a ChangeLog in your submissions; this can
be useful for seeing at a glance what your patch modifies.

Thanks,
James

> diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> index 73f2729..066d133 100644
> --- a/gcc/optabs-query.h
> +++ b/gcc/optabs-query.h
> @@ -134,5 +134,6 @@ bool can_vec_mask_load_store_p (machine_mode, bool);
>  bool can_compare_and_swap_p (machine_mode, bool);
>  bool can_atomic_exchange_p (machine_mode, bool);
>  bool lshift_cheap_p (bool);
> +bool supports_umax_reduction ();
>  
>  #endif
> diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
> index 254089f..23a85a4 100644
> --- a/gcc/optabs-query.c
> +++ b/gcc/optabs-query.c
> @@ -463,6 +463,21 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
>    return 0;
>  }
>  
> +/* Return true if target supports unsigned max reduction for any mode.  */
> +
> +bool
> +supports_umax_reduction ()
> +{
> +  machine_mode mode;
> +
> +  for (mode = MIN_MODE_VECTOR_INT; mode <= MAX_MODE_VECTOR_INT;
> +       mode = (machine_mode) (mode + 1))
> +    if (optab_handler (reduc_umax_scal_optab, mode) != CODE_FOR_nothing)
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Return true if target supports vector masked load/store for mode.  */
>  
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-4.c b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> new file mode 100644
> index 0000000..98db8e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> @@ -0,0 +1,65 @@
> +/* { dg-require-effective-target umax_reduction } */
> +
> +#define N 512
> +#define K 32
> +
> +extern void abort (void) __attribute__((noreturn));
> +
> +int a[N] __attribute__ ((aligned (16)));
> +int b[N] __attribute__ ((aligned (16)));
> +int c[N] __attribute__ ((aligned (16)));
> +
> +__attribute__ ((noinline)) void
> +init_arrays () {
> +  int i;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    a[i] = K + 1;
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> +
> +  for (i = N / 2; i < N; ++i)
> +    a[i] = K - 1;
> +
> +  for (i = 0; i < N; ++i)
> +    b[i] = i;
> +}
> +
> +__attribute__ ((noinline)) void
> +check_array () {
> +  int i = 0;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    if (c[i] != a[i])
> +      abort ();
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    if (c[i] != (i % 2 == 0) ? b[i] : a[i])
> +      abort ();
> +
> +  for (i = N / 2; i < N; ++i)
> +    if (c[i] != b[i])
> +      abort ();
> +}
> +
> +__attribute__ ((noinline)) void
> +main1 (int* bp) {
> +  int i;
> +
> +  for (i = 0; i < N; ++i)
> +    c[i] = a[i] < K ? bp[i] : a[i];
> +
> +  check_array ();
> +}
> +
> +int main (void) {
> +  init_arrays ();
> +
> +  main1 (b);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-5.c b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> new file mode 100644
> index 0000000..93bfaa1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> @@ -0,0 +1,65 @@
> +/* { dg-require-effective-target umax_reduction } */
> +
> +#define N 512
> +#define K 32
> +
> +extern void abort (void) __attribute__((noreturn));
> +
> +int a[N] __attribute__ ((aligned (16)));
> +int b[N];
> +int c[N] __attribute__ ((aligned (16)));
> +
> +__attribute__ ((noinline)) void
> +init_arrays () {
> +  int i;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    a[i] = K + 1;
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> +
> +  for (i = N / 2; i < N; ++i)
> +    a[i] = K - 1;
> +
> +  for (i = 0; i < N; ++i)
> +    b[i] = i;
> +}
> +
> +__attribute__ ((noinline)) void
> +check_array () {
> +  int i = 0;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    if (c[i] != a[i])
> +      abort ();
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    if (c[i] != (i % 2 == 0) ? b[i] : a[i])
> +      abort ();
> +
> +  for (i = N / 2; i < N; ++i)
> +    if (c[i] != b[i])
> +      abort ();
> +}
> +
> +__attribute__ ((noinline)) void
> +main1 (int* bp) {
> +  int i;
> +
> +  for (i = 0; i < N; ++i)
> +    c[i] = a[i] < K ? bp[i] : a[i];
> +
> +  check_array ();
> +}
> +
> +int main (void) {
> +  init_arrays ();
> +
> +  main1 (b);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index a465eb1..9b1c338 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -6449,3 +6449,14 @@ proc check_effective_target_comdat_group {} {
>  	int (*fn) () = foo;
>      }]
>  }
> +
> +# Return 1 if the target supports unsigned max vector reduction.
> +
> +proc check_effective_target_umax_reduction { } {
> +    if { [istarget aarch64*-*-*] } {
> +	return 1;
> +    } else {
> +	return 0;
> +    }
> +}
> +
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index 0987884..4f84705 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -811,7 +811,8 @@ ifcvt_can_use_mask_load_store (gimple stmt)
>        || VECTOR_MODE_P (mode))
>      return false;
>  
> -  if (can_vec_mask_load_store_p (mode, is_load))
> +  if (can_vec_mask_load_store_p (mode, is_load)
> +      || (is_load && supports_umax_reduction ()))
>      return true;
>  
>    return false;
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 671e613..4f8c2c5 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -5749,10 +5749,19 @@ vect_supportable_dr_alignment (struct data_reference *dr,
>    /* For now assume all conditional loads/stores support unaligned
>       access without any special code.  */
>    if (is_gimple_call (stmt)
> -      && gimple_call_internal_p (stmt)
> -      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> -	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
> -    return dr_unaligned_supported;
> +      && gimple_call_internal_p (stmt))
> +    {
> +      if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
> +	return (can_vec_mask_load_store_p (mode, true)
> +		? dr_unaligned_supported
> +		: dr_unaligned_unsupported);
> +      else if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> +	{
> +	  gcc_checking_assert (can_vec_mask_load_store_p (
> +				TYPE_MODE (TREE_TYPE (vectype)), false));
> +	  return dr_unaligned_supported;
> +	}
> +    }
>  
>    if (loop_vinfo)
>      {
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index d4a436d..2a8c231 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1840,7 +1840,9 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>  				 : DR_STEP (dr), size_zero_node) <= 0)
>      return false;
>    else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
> -	   || !can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> +	   || !(can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store)
> +		|| (optab_handler (reduc_umax_scal_optab,
> +				   TYPE_MODE (vectype)) != CODE_FOR_nothing)))
>      return false;
>  
>    if (TREE_CODE (mask) != SSA_NAME)
> @@ -2140,12 +2142,43 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>  	    misalign = DR_MISALIGNMENT (dr);
>  	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
>  				  misalign);
> -	  new_stmt
> -	    = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> -					  gimple_call_arg (stmt, 1),
> -					  vec_mask);
> -	  gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> -	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +
> +	  if (can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> +	    {
> +	      new_stmt
> +		= gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> +					      gimple_call_arg (stmt, 1),
> +					      vec_mask);
> +	      gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +	    }
> +	  else
> +	    {
> +	      tree anytrue = make_temp_ssa_name (TREE_TYPE (
> +						  TREE_TYPE (vec_mask)),
> +						 NULL, "_anytrue");
> +	      tree reduction = build1 (REDUC_MAX_EXPR, TREE_TYPE (anytrue),
> +				       vec_mask);
> +	      gimple anytrue_init = gimple_build_assign (anytrue, reduction);
> +	      vect_finish_stmt_generation (stmt, anytrue_init, gsi);
> +
> +	      tree temp_addr = build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
> +				       create_tmp_var (vectype, "safevec"));
> +	      tree vec_cond_expr = build3 (COND_EXPR, vectype, anytrue,
> +					   dataref_ptr, temp_addr);
> +
> +	      tree safeb = make_temp_ssa_name (TREE_TYPE (dataref_ptr),
> +					       NULL, "_safeb");
> +	      gimple safeb_init = gimple_build_assign (safeb, vec_cond_expr);
> +	      vect_finish_stmt_generation (stmt, safeb_init, gsi);
> +
> +	      tree load = build2 (MEM_REF, vectype, safeb,
> +				  build_int_cst (ptr_type_node, 0));
> +	      new_stmt
> +		= gimple_build_assign (make_ssa_name (vec_dest), load);
> +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +	    }
> +
>  	  if (i == 0)
>  	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>  	  else
Follow-Ups:
- Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64
  - From: Richard Biener
References:
- [PATCH][AARCH64] Emulating aligned mask loads on AArch64
  - From: Pawel Kupidura
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]