This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [AArch64][SVE] Utilize ASRD instruction for division and remainder

From: Richard Sandiford <richard dot sandiford at arm dot com>
To: Yuliang Wang <Yuliang dot Wang at arm dot com>
Cc: "gcc-patches\@gcc.gnu.org" <gcc-patches at gcc dot gnu dot org>, nd <nd at arm dot com>
Date: Tue, 24 Sep 2019 17:12:13 +0100
Subject: Re: [AArch64][SVE] Utilize ASRD instruction for division and remainder
References: <AM0PR08MB3716B89766EA131BF8A442A69B840@AM0PR08MB3716.eurprd08.prod.outlook.com>
Yuliang Wang <Yuliang.Wang@arm.com> writes:
> Hi,
>
> The C snippets below (signed division/modulo by a power-of-2 immediate value):
>
> #define P ...
>
> void foo_div (int *a, int *b, int N)
> {
>     for (int i = 0; i < N; i++)
>         a[i] = b[i] / (1 << P);
> }
> void foo_mod (int *a, int *b, int N)
> {
>     for (int i = 0; i < N; i++)
>         a[i] = b[i] % (1 << P);
> }
>
> Vectorize to the following on AArch64 + SVE:
>
> foo_div:
>     mov     x0, 0
>     mov     w2, N
>     ptrue   p1.b, all
>     whilelo p0.s, wzr, w2
>     .p2align 3,,7
> .L2:
>     ld1w    z1.s, p0/z, [x3, x0, lsl 2]
>     cmplt   p2.s, p1/z, z1.s, #0    //
>     mov     z0.s, p2/z, #7    //
>     add     z0.s, z0.s, z1.s    //
>     asr     z0.s, z0.s, #3    //
>     st1w    z0.s, p0, [x1, x0, lsl 2]
>     incw    x0
>     whilelo p0.s, w0, w2
>     b.any   .L2
>     ret
>
> foo_mod:
>     ...
> .L2:
>     ld1w    z0.s, p0/z, [x3, x0, lsl 2]
>     cmplt   p2.s, p1/z, z0.s, #0    //
>     mov     z1.s, p2/z, #-1    //
>     lsr     z1.s, z1.s, #29    //
>     add     z0.s, z0.s, z1.s    //
>     and     z0.s, z0.s, #{2^P-1}    //
>     sub     z0.s, z0.s, z1.s    //
>     st1w    z0.s, p0, [x1, x0, lsl 2]
>     incw    x0
>     whilelo p0.s, w0, w2
>     b.any   .L2
>     ret
>
> This patch utilizes the special-purpose ASRD (arithmetic shift-right for divide by immediate) instruction:
>
> foo_div:
>     ...
> .L2:
>     ld1w    z0.s, p0/z, [x3, x0, lsl 2]
>     asrd    z0.s, p1/m, z0.s, #{P}    //
>     st1w    z0.s, p0, [x1, x0, lsl 2]
>     incw    x0
>     whilelo p0.s, w0, w2
>     b.any   .L2
>     ret
>
> foo_mod:
>     ...
> .L2:
>     ld1w    z0.s, p0/z, [x3, x0, lsl 2]
>     movprfx z1, z0    //
>     asrd    z1.s, p1/m, z1.s, #{P}    //
>     lsl     z1.s, z1.s, #{P}    //
>     sub     z0.s, z0.s, z1.s    //
>     st1w    z0.s, p0, [x1, x0, lsl 2]
>     incw    x0
>     whilelo p0.s, w0, w2
>     b.any   .L2
>     ret
>
> Added new tests. Built and regression tested on aarch64-none-elf.
>
> Best Regards,
> Yuliang Wang
>
>
> gcc/ChangeLog:
>
> 2019-09-23  Yuliang Wang  <yuliang.wang@arm.com>
>
> * config/aarch64/aarch64-sve.md (asrd<mode>3): New pattern for ASRD.
> * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec.
> (ASRDIV): New int iterator.
> * internal-fn.def (IFN_ASHR_DIV): New internal function.
> * optabs.def (ashr_div_optab): New optab.
> * tree-vect-patterns.c (vect_recog_divmod_pattern):
> Modify pattern to support new operation.
> * doc/md.texi (asrd@var{m3}): Documentation for the above.
> * doc/sourcebuild.texi (vect_asrdiv_si): Document new target selector.

This looks good to me.  My only real question is about naming:
maybe IFN_DIV_POW2 would be a better name for the internal function
and sdiv_pow2_optab/"div_pow2$a3" for the optab?  But I'm useless at
naming things, so maybe others would prefer your names.

Thanks,
Richard

>
> gcc/testsuite/ChangeLog:
>
> 2019-09-23  Yuliang Wang  <yuliang.wang@arm.com>
>
> * gcc.dg/vect/vect-asrdiv-1.c: New test.
> * gcc.target/aarch64/sve/asrdiv_1.c: As above.
> * lib/target-supports.exp (check_effective_target_vect_asrdiv_si):
> Return true for AArch64 with SVE.
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index f58353e9c6dc0df97ce4074db6bb22181f426e5b..607440b7ba16d5616695f29a9cf7c4c277a4a502 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -71,6 +71,7 @@
>  ;; ---- [INT] Binary logical operations
>  ;; ---- [INT] Binary logical operations (inverted second input)
>  ;; ---- [INT] Shifts
> +;; ---- [INT] Shifts (rounding towards 0)
>  ;; ---- [FP] General binary arithmetic corresponding to rtx codes
>  ;; ---- [FP] General binary arithmetic corresponding to unspecs
>  ;; ---- [FP] Addition
> @@ -2563,6 +2564,46 @@
>    [(set_attr "movprfx" "yes")]
>  )
>  
> +;; -------------------------------------------------------------------------
> +;; ---- [INT] Shifts (rounding towards 0)
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - ASRD
> +;; -------------------------------------------------------------------------
> +
> +;; Unpredicated arithmetic right shift for division by power-of-2.
> +(define_expand "asrd<mode>3"
> +  [(set (match_operand:SVE_I 0 "register_operand" "")
> +	(unspec:SVE_I
> +	  [(match_dup 3)
> +	   (unspec:SVE_I
> +	     [(match_operand:SVE_I 1 "register_operand" "")
> +	      (match_operand 2 "aarch64_simd_rshift_imm")]
> +	    UNSPEC_ASRD)]
> +	 UNSPEC_PRED_X))]
> +  "TARGET_SVE"
> +  {
> +    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
> +  }
> +)
> +
> +;; Predicated ASRD with PTRUE.
> +(define_insn "*asrd<mode>3"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +	(unspec:SVE_I
> +	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
> +	   (unspec:SVE_I
> +	     [(match_operand:SVE_I 2 "register_operand" "0, w")
> +	      (match_operand 3 "aarch64_simd_rshift_imm")]
> +	    UNSPEC_ASRD)]
> +	 UNSPEC_PRED_X))]
> +  "TARGET_SVE"
> +  "@
> +  asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
> +  movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3"
> +  [(set_attr "movprfx" "*,yes")]
> +)
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [FP] General binary arithmetic corresponding to rtx codes
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 03b3ce363021a71578803e07b3548d3dd9c9de32..1e321af710bfe80606eedee7e0d191f36c70355b 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -538,6 +538,7 @@
>      UNSPEC_SMULHRS	; Used in aarch64-sve2.md.
>      UNSPEC_UMULHS	; Used in aarch64-sve2.md.
>      UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
> +    UNSPEC_ASRD		; Used in aarch64-sve.md.
>  ])
>  
>  ;; ------------------------------------------------------------------
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index f35fd2b1b19cef1deb41566d7614d80d449d69fc..2c0396c0d2ba14ef942db6dcdfea8250819bebfd 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5414,6 +5414,17 @@ op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1);
>  where the sign of @samp{narrow} determines whether this is a signed
>  or unsigned operation, and @var{N} is the size of @samp{wide} in bits.
>  
> +@cindex @code{asrd@var{m3}} instruction pattern
> +@item @samp{asrd@var{m3}}
> +@cindex @code{asrd@var{m3}} instruction pattern
> +@itemx @samp{asrd@var{m3}}
> +Arithmetic shift right for division by power-of-2 immediate. Equivalent to:
> +@smallexample
> +signed op0, op1;
> +@dots{}
> +op0 = op1 / (1 << imm);
> +@end smallexample
> +
>  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
>  @item @samp{vec_shl_insert_@var{m}}
>  Shift the elements in vector input operand 1 left one element (i.e.@:
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
> index 4ace224a8ff5ed4fafed10a69ef00ffb2d7d8c39..b01ba570bfb44b6316c3f391b7be92f1244e2030 100644
> --- a/gcc/doc/sourcebuild.texi
> +++ b/gcc/doc/sourcebuild.texi
> @@ -1446,6 +1446,10 @@ of bytes.
>  Target supports both signed and unsigned multiply-high-with-round-and-scale
>  operations on vectors of half-words.
>  
> +@item vect_asrdiv_si
> +Target supports arithmetic shift-right division by constant power-of-2
> +operations on vectors of words.
> +
>  @item vect_condition
>  Target supports vector conditional operations.
>  
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 49f57978c88a3a8c1a0206d983e1720ed09b0385..f994129747854b5921bd72cc3ec7105fb6a061b7 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -140,6 +140,8 @@ DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>  DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
>  		       vec_shl_insert, binary)
>  
> +DEF_INTERNAL_OPTAB_FN (ASHR_DIV, ECF_CONST | ECF_NOTHROW, ashr_div, binary)
> +
>  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
>  DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
>  DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 308696846d4926fdd94133b87f4f59b8d1cc7f20..bdf0e5ccc68f0809aeb4d949d290d88074af0f9b 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -347,6 +347,7 @@ OPTAB_D (smulhs_optab, "smulhs$a3")
>  OPTAB_D (smulhrs_optab, "smulhrs$a3")
>  OPTAB_D (umulhs_optab, "umulhs$a3")
>  OPTAB_D (umulhrs_optab, "umulhrs$a3")
> +OPTAB_D (ashr_div_optab, "asrd$a3")
>  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
>  OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
>  OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c b/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..961c8b0d05659fcbd70083982169af9a584a6ad3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c
> @@ -0,0 +1,79 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include "tree-vect.h"
> +
> +#define DIV(x,y) ((x)/(y))
> +#define MOD(x,y) ((x)%(y))
> +
> +#define TEMPLATE(PO2,OP)						\
> +void __attribute__ ((noipa))						\
> +f_##PO2##_##OP (int *restrict a, int *restrict b, __INTPTR_TYPE__ n)	\
> +{									\
> +  for (__INTPTR_TYPE__ i = 0; i < n; ++i)				\
> +    a[i] = OP (b[i], (1 << PO2));					\
> +}
> +#define TEMPLATES(PO2)	\
> +TEMPLATE (PO2,DIV);	\
> +TEMPLATE (PO2,MOD);
> +
> +TEMPLATES (1);
> +TEMPLATES (2);
> +TEMPLATES (3);
> +TEMPLATES (7);
> +TEMPLATES (8);
> +TEMPLATES (10);
> +TEMPLATES (15);
> +TEMPLATES (16);
> +TEMPLATES (20);
> +
> +typedef void (*func_t) (int *, int *, __INTPTR_TYPE__);
> +typedef struct {
> +  int po2;
> +  func_t div;
> +  func_t mod;
> +} fn_t;
> +const fn_t fns[] = {
> +#define FN_PAIR(PO2) { PO2, f_##PO2##_DIV, f_##PO2##_MOD }
> +  FN_PAIR (1),
> +  FN_PAIR (2),
> +  FN_PAIR (3),
> +  FN_PAIR (7),
> +  FN_PAIR (8),
> +  FN_PAIR (10),
> +  FN_PAIR (15),
> +  FN_PAIR (16),
> +  FN_PAIR (20),
> +};
> +
> +int __attribute__ ((noipa, noinline))
> +power2 (int x)
> +{
> +  return 1 << x;
> +}
> +
> +#define N 50
> +
> +int
> +main (void)
> +{
> +  int a[N], b[N], c[N];
> +
> +  for (int i = 0; i < (sizeof(fns)/sizeof(fns[0])); i++)
> +    {
> +      int p = power2 (fns[i].po2);
> +      for (int j = 0; j < N; j++)
> +        a[j] = ((p << 4) * j) / (N - 1) - (p << 5);
> +
> +      fns[i].div (b, a, N);
> +      fns[i].mod (c, a, N);
> +
> +      for (int j = 0; j < N; j++)
> +	if (a[j] != (b[j] * p + c[j]))
> +          __builtin_abort ();
> +    }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump {\.ASHR_DIV} "vect" { target vect_asrdiv_si } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 18 "vect" { target vect_asrdiv_si } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..692df2012a4544f1cc36c3e6f671c121c5e550ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
> +
> +#include <stdint.h>
> +
> +#define SIGNED(S) int##S##_t
> +
> +#define DIV(x,y) ((x)/(y))
> +#define MOD(x,y) ((x)%(y))
> +
> +#define TEMPLATE(OP,SIZE)						\
> +void __attribute__ ((noinline, noclone))				\
> +f_##OP##_##SIZE (SIGNED(SIZE) *restrict a, SIGNED(SIZE) *restrict b,	\
> +		 __INTPTR_TYPE__ n)					\
> +{									\
> +  for (__INTPTR_TYPE__ i = 0; i < n; ++i)				\
> +    a[i] = OP (b[i], ((SIGNED(SIZE))1 << ((SIZE)/2+1)));		\
> +}
> +#define DIVMOD(SIZE)	\
> +TEMPLATE (DIV,SIZE);	\
> +TEMPLATE (MOD,SIZE);
> +
> +DIVMOD (8);
> +DIVMOD (16);
> +DIVMOD (32);
> +DIVMOD (64);
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-9]+/m, z[0-9]+\.b, #5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-9]+/m, z[0-9]+\.h, #9\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #9\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-9]+/m, z[0-9]+\.s, #17\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #17\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.d, p[0-9]+/m, z[0-9]+\.d, #33\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #33\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-not {\tasr\t%} } } */
> +/* { dg-final { scan-assembler-not {\tlsr\t%} } } */
> +/* { dg-final { scan-assembler-not {\tcmplt\t%} } } */
> +/* { dg-final { scan-assembler-not {\tand\t%} } } */
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 414bf80003b9192806f79afed9393f9ef4750a7d..8d6956471c7dc4e5a3147bd2958ea37e5183f408 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -6192,6 +6192,14 @@ proc check_effective_target_vect_mulhrs_hi {} {
>  		   && [check_effective_target_aarch64_sve2] }]
>  }
>  
> +# Return 1 if the target plus current options supports arithmetic
> +# shift-right division by power-of-2 operations on vectors of words.
> +
> +proc check_effective_target_vect_asrdiv_si {} {
> +    return [expr { [istarget aarch64*-*-*]
> +		   && [check_effective_target_aarch64_sve] }]
> +}
> +
>  # Return 1 if the target plus current options supports a vector
>  # demotion (packing) of shorts (to chars) and ints (to shorts) 
>  # using modulo arithmetic, 0 otherwise.
> diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> index 2f86f9e4fc7039add1b1d7b82574cb8262eb4ba4..8dbe8d9ea3a5c3e9db37cf15b651fa6aa1bea567 100644
> --- a/gcc/tree-vect-patterns.c
> +++ b/gcc/tree-vect-patterns.c
> @@ -2925,6 +2925,38 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
>        /* Pattern detected.  */
>        vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
>  
> +      *type_out = vectype;
> +
> +      /* Check if the target supports this internal function.  */
> +      internal_fn ifn = IFN_ASHR_DIV;
> +      if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> +	{
> +	  tree shift = build_int_cst (itype, tree_log2 (oprnd1));
> +
> +	  tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> +	  gimple *div_stmt = gimple_build_call_internal (ifn, 2, oprnd0, shift);
> +	  gimple_call_set_lhs (div_stmt, var_div);
> +
> +	  if (rhs_code == TRUNC_MOD_EXPR)
> +	    {
> +	      append_pattern_def_seq (stmt_vinfo, div_stmt);
> +	      def_stmt
> +		= gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
> +				       LSHIFT_EXPR, var_div, shift);
> +	      append_pattern_def_seq (stmt_vinfo, def_stmt);
> +	      pattern_stmt
> +		= gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
> +				       MINUS_EXPR, oprnd0,
> +				       gimple_assign_lhs (def_stmt));
> +	    }
> +	  else
> +	    {
> +	      pattern_stmt = div_stmt;
> +	      gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> +	    }
> +          return pattern_stmt;
> +	}
> +
>        cond = build2 (LT_EXPR, boolean_type_node, oprnd0,
>  		     build_int_cst (itype, 0));
>        if (rhs_code == TRUNC_DIV_EXPR
> @@ -3001,7 +3033,6 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
>  				   signmask);
>  	}
>  
> -      *type_out = vectype;
>        return pattern_stmt;
>      }
>
Follow-Ups:
- RE: [AArch64][SVE] Utilize ASRD instruction for division and remainder
  - From: Yuliang Wang
References:
- [AArch64][SVE] Utilize ASRD instruction for division and remainder
  - From: Yuliang Wang
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]