This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64


On Fri, Sep 18, 2015 at 11:24:50AM +0100, Pawel Kupidura wrote:
> This patch uses max reductions to emulate aligned masked loads on AArch64.
> It reduces the mask to a scalar that is nonzero if any mask element is true,
> then uses that scalar to select between the real address and a scratchpad
> address.
> 
> The idea is that if the vector load is aligned, it cannot cross a page
> boundary and so cannot partially fault.  It is safe to load from the
> address (and use only some of the result) if any mask element is true.
> 
> The patch provided a 15% speed improvement for simple microbenchmarks.
> 
> There were several spec2k6 benchmarks affected by the patch: 400.perlbench,
> 403.gcc, 436.cactusADM, 454.calculix and 464.h264.  However, the changes
> had no measurable effect on performance.
> 
> Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and 
> arm-linux-gnueabi.

Hi Pawel, this patch doesn't look AArch64-specific to me. You will probably
get more traction with reviews if you post it tagged appropriately and with
the relevant maintainers on CC. As this is an auto-vectorizer patch, that
means Richard Biener and Zdenek Dvorak.

It is also customary to include a ChangeLog in your submissions; this can
be useful for seeing at a glance what your patch modifies.

Thanks,
James

> diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> index 73f2729..066d133 100644
> --- a/gcc/optabs-query.h
> +++ b/gcc/optabs-query.h
> @@ -134,5 +134,6 @@ bool can_vec_mask_load_store_p (machine_mode, bool);
>  bool can_compare_and_swap_p (machine_mode, bool);
>  bool can_atomic_exchange_p (machine_mode, bool);
>  bool lshift_cheap_p (bool);
> +bool supports_umax_reduction ();
>  
>  #endif
> diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
> index 254089f..23a85a4 100644
> --- a/gcc/optabs-query.c
> +++ b/gcc/optabs-query.c
> @@ -463,6 +463,21 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
>    return 0;
>  }
>  
> +/* Return true if target supports unsigned max reduction for any mode.  */
> +
> +bool
> +supports_umax_reduction ()
> +{
> +  machine_mode mode;
> +
> +  for (mode = MIN_MODE_VECTOR_INT; mode <= MAX_MODE_VECTOR_INT;
> +       mode = (machine_mode) (mode + 1))
> +    if (optab_handler (reduc_umax_scal_optab, mode) != CODE_FOR_nothing)
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Return true if target supports vector masked load/store for mode.  */
>  
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-4.c b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> new file mode 100644
> index 0000000..98db8e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> @@ -0,0 +1,65 @@
> +/* { dg-require-effective-target umax_reduction } */
> +
> +#define N 512
> +#define K 32
> +
> +extern void abort (void) __attribute__((noreturn));
> +
> +int a[N] __attribute__ ((aligned (16)));
> +int b[N] __attribute__ ((aligned (16)));
> +int c[N] __attribute__ ((aligned (16)));
> +
> +__attribute__ ((noinline)) void
> +init_arrays () {
> +  int i;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    a[i] = K + 1;
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> +
> +  for (i = N / 2; i < N; ++i)
> +    a[i] = K - 1;
> +
> +  for (i = 0; i < N; ++i)
> +    b[i] = i;
> +}
> +
> +__attribute__ ((noinline)) void
> +check_array () {
> +  int i = 0;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    if (c[i] != a[i])
> +      abort ();
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    if (c[i] != (i % 2 == 0) ? b[i] : a[i])
> +      abort ();
> +
> +  for (i = N / 2; i < N; ++i)
> +    if (c[i] != b[i])
> +      abort ();
> +}
> +
> +__attribute__ ((noinline)) void
> +main1 (int* bp) {
> +  int i;
> +
> +  for (i = 0; i < N; ++i)
> +    c[i] = a[i] < K ? bp[i] : a[i];
> +
> +  check_array ();
> +}
> +
> +int main (void) {
> +  init_arrays ();
> +
> +  main1 (b);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-5.c b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> new file mode 100644
> index 0000000..93bfaa1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> @@ -0,0 +1,65 @@
> +/* { dg-require-effective-target umax_reduction } */
> +
> +#define N 512
> +#define K 32
> +
> +extern void abort (void) __attribute__((noreturn));
> +
> +int a[N] __attribute__ ((aligned (16)));
> +int b[N];
> +int c[N] __attribute__ ((aligned (16)));
> +
> +__attribute__ ((noinline)) void
> +init_arrays () {
> +  int i;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    a[i] = K + 1;
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> +
> +  for (i = N / 2; i < N; ++i)
> +    a[i] = K - 1;
> +
> +  for (i = 0; i < N; ++i)
> +    b[i] = i;
> +}
> +
> +__attribute__ ((noinline)) void
> +check_array () {
> +  int i = 0;
> +
> +  for (i = 0; i < N / 4; ++i)
> +    if (c[i] != a[i])
> +      abort ();
> +
> +  for (i = N / 4; i < N / 2; ++i)
> +    if (c[i] != (i % 2 == 0) ? b[i] : a[i])
> +      abort ();
> +
> +  for (i = N / 2; i < N; ++i)
> +    if (c[i] != b[i])
> +      abort ();
> +}
> +
> +__attribute__ ((noinline)) void
> +main1 (int* bp) {
> +  int i;
> +
> +  for (i = 0; i < N; ++i)
> +    c[i] = a[i] < K ? bp[i] : a[i];
> +
> +  check_array ();
> +}
> +
> +int main (void) {
> +  init_arrays ();
> +
> +  main1 (b);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index a465eb1..9b1c338 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -6449,3 +6449,14 @@ proc check_effective_target_comdat_group {} {
>  	int (*fn) () = foo;
>      }]
>  }
> +
> +# Return 1 if the target supports unsigned max vector reduction.
> +
> +proc check_effective_target_umax_reduction { } {
> +    if { [istarget aarch64*-*-*] } {
> +	return 1;
> +    } else {
> +	return 0;
> +    }
> +}
> +
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index 0987884..4f84705 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -811,7 +811,8 @@ ifcvt_can_use_mask_load_store (gimple stmt)
>        || VECTOR_MODE_P (mode))
>      return false;
>  
> -  if (can_vec_mask_load_store_p (mode, is_load))
> +  if (can_vec_mask_load_store_p (mode, is_load)
> +      || (is_load && supports_umax_reduction ()))
>      return true;
>  
>    return false;
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 671e613..4f8c2c5 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -5749,10 +5749,19 @@ vect_supportable_dr_alignment (struct data_reference *dr,
>    /* For now assume all conditional loads/stores support unaligned
>       access without any special code.  */
>    if (is_gimple_call (stmt)
> -      && gimple_call_internal_p (stmt)
> -      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> -	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
> -    return dr_unaligned_supported;
> +      && gimple_call_internal_p (stmt))
> +    {
> +      if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
> +	return (can_vec_mask_load_store_p (mode, true)
> +		? dr_unaligned_supported
> +		: dr_unaligned_unsupported);
> +      else if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> +	{
> +	  gcc_checking_assert (can_vec_mask_load_store_p (
> +				TYPE_MODE (TREE_TYPE (vectype)), false));
> +	  return dr_unaligned_supported;
> +	}
> +    }
>  
>    if (loop_vinfo)
>      {
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index d4a436d..2a8c231 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1840,7 +1840,9 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>  				 : DR_STEP (dr), size_zero_node) <= 0)
>      return false;
>    else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
> -	   || !can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> +	   || !(can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store)
> +		|| (optab_handler (reduc_umax_scal_optab,
> +				   TYPE_MODE (vectype)) != CODE_FOR_nothing)))
>      return false;
>  
>    if (TREE_CODE (mask) != SSA_NAME)
> @@ -2140,12 +2142,43 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>  	    misalign = DR_MISALIGNMENT (dr);
>  	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
>  				  misalign);
> -	  new_stmt
> -	    = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> -					  gimple_call_arg (stmt, 1),
> -					  vec_mask);
> -	  gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> -	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +
> +	  if (can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> +	    {
> +	      new_stmt
> +		= gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> +					      gimple_call_arg (stmt, 1),
> +					      vec_mask);
> +	      gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +	    }
> +	  else
> +	    {
> +	      tree anytrue = make_temp_ssa_name (TREE_TYPE (
> +						  TREE_TYPE (vec_mask)),
> +						 NULL, "_anytrue");
> +	      tree reduction = build1 (REDUC_MAX_EXPR, TREE_TYPE (anytrue),
> +				       vec_mask);
> +	      gimple anytrue_init = gimple_build_assign (anytrue, reduction);
> +	      vect_finish_stmt_generation (stmt, anytrue_init, gsi);
> +
> +	      tree temp_addr = build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
> +				       create_tmp_var (vectype, "safevec"));
> +	      tree vec_cond_expr = build3 (COND_EXPR, vectype, anytrue,
> +					   dataref_ptr, temp_addr);
> +
> +	      tree safeb = make_temp_ssa_name (TREE_TYPE (dataref_ptr),
> +					       NULL, "_safeb");
> +	      gimple safeb_init = gimple_build_assign (safeb, vec_cond_expr);
> +	      vect_finish_stmt_generation (stmt, safeb_init, gsi);
> +
> +	      tree load = build2 (MEM_REF, vectype, safeb,
> +				  build_int_cst (ptr_type_node, 0));
> +	      new_stmt
> +		= gimple_build_assign (make_ssa_name (vec_dest), load);
> +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +	    }
> +
>  	  if (i == 0)
>  	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>  	  else


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]