[Aarch64][SVE] Vectorise sum-of-absolute-differences

Richard Sandiford richard.sandiford@arm.com
Tue May 7 16:17:00 GMT 2019


Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com> writes:
> Thanks for your comments Richard. I think this patch addresses them.

Yeah, this is OK to install, thanks.

Richard

>
> Alejandro
>
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: 07 May 2019 15:46
>> To: Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com>
>> Cc: James Greenhalgh <James.Greenhalgh@arm.com>; GCC Patches <gcc-
>> patches@gcc.gnu.org>; nd <nd@arm.com>; Richard Biener
>> <richard.guenther@gmail.com>
>> Subject: Re: [Aarch64][SVE] Vectorise sum-of-absolute-differences
>> 
>> Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com> writes:
>> > +;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers
>> > +;; the hassle of constructing the other arm of the MINUS.
>> > +(define_expand "<su>abd<mode>_3"
>> > +  [(use (match_operand:SVE_I 0 "register_operand"))
>> > +   (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand")
>> > +		(match_operand:SVE_I 2 "register_operand"))]
>> > +  "TARGET_SVE"
>> > +  {
>> > +    rtx other_arm
>> > +      = simplify_gen_binary (<MAX_OPP>, <MODE>mode, operands[1], operands[2]);
>> 
>> I realise this is just copied from the Advanced SIMD version, but
>> simplify_gen_binary is a bit dangerous here, since we explicitly want an
>> unsimplified <MAX_OPP> with the two operands given.  Probably better as:
>> 
>>   gen_rtx_<MAX_OPP> (<MODE>mode, ...)
>> 
>> > +    emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], operands[1],
>> > +	       operands[2], other_arm));
>> > +    DONE;
>> > +  }
>> > +)
>> > +
>> > +;; Unpredicated integer absolute difference.
>> > +(define_expand "aarch64_<su>abd<mode>_3"
>> > +  [(set (match_operand:SVE_I 0 "register_operand")
>> > +	(unspec:SVE_I
>> > +	  [(match_dup 4)
>> > +	   (minus:SVE_I
>> > +	     (USMAX:SVE_I
>> > +	       (match_operand:SVE_I 1 "register_operand" "w")
>> > +	       (match_operand:SVE_I 2 "register_operand" "w"))
>> > +	     (match_operator 3 "aarch64_<max_opp>"
>> > +	       [(match_dup 1)
>> > +		(match_dup 2)]))]
>> > +	  UNSPEC_MERGE_PTRUE))]
>> > +  "TARGET_SVE"
>> > +  {
>> > +    operands[4] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
>> > +  }
>> > +)
>> 
>> I think we should go directly from <su>abd<mode>_3 to the final pattern, so
>> that <su>abd<mode>_3 does the force_reg too.  This would make...
>> 
>> > +;; Predicated integer absolute difference.
>> > +(define_insn "*aarch64_<su>abd<mode>_3"
>> 
>> ...this the named pattern, instead of starting with "*".
>> 
>> > +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
>> > +	(unspec:SVE_I
>> > +	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
>> > +	   (minus:SVE_I
>> > +	     (USMAX:SVE_I
>> > +	       (match_operand:SVE_I 2 "register_operand" "w, w")
>> 
>> Should be "0, w", so that the first alternative ties the input to the output.
>> 
>> > +	       (match_operand:SVE_I 3 "register_operand" "w, w"))
>> > +	     (match_operator 4 "aarch64_<max_opp>"
>> > +	       [(match_dup 2)
>> > +		(match_dup 3)]))]
>> > +	  UNSPEC_MERGE_PTRUE))]
>> > +  "TARGET_SVE"
>> > +  "@
>> > +   <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
>> > +   movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
>> > +  [(set_attr "movprfx" "*,yes")]
>> > +)
>> > +
>> > +;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in
>> > +;; operands 1 and 2.  The sequence also has to perform a widening reduction of
>> > +;; the difference into a vector and accumulate that into operand 3 before
>> > +;; copying that into the result operand 0.
>> > +;; Perform that with a sequence of:
>> > +;; MOV		ones.b, #1
>> > +;; UABD		diff.b, p0/m, op1.b, op2.b
>> > +;; UDOT		op3.s, diff.b, ones.b
>> > +;; MOV		op0, op3  // should be eliminated in later passes.
>> > +;; The signed version just uses the signed variants of the above
>> instructions.
>> 
>> Think it would be clearer if we removed the last line and just used [SU]ABD
>> instead of UABD, since that's the only sign-dependent part of the operation.
>> Also think we should SVEize it with MOVPRFX, since a separate MOV should
>> never be needed:
>> 
>> ;; MOV		ones.b, #1
>> ;; [SU]ABD	diff.b, ptrue/m, op1.b, op2.b
>> ;; MOVPRFX	op0, op3		// If necessary
>> ;; UDOT		op0.s, diff.b, ones.b
>> 
>> > +(define_expand "<sur>sad<vsi2qi>"
>> > +  [(use (match_operand:SVE_SDI 0 "register_operand"))
>> > +   (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand"))
>> > +		    (use (match_operand:<VSI2QI> 2 "register_operand"))] ABAL)
>> > +   (use (match_operand:SVE_SDI 3 "register_operand"))]
>> > +  "TARGET_SVE"
>> > +  {
>> > +    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
>> > +    rtx diff = gen_reg_rtx (<VSI2QI>mode);
>> > +    emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2]));
>> > +    emit_insn (gen_udot_prod<vsi2qi> (operands[3], diff, ones, operands[3]));
>> > +    emit_move_insn (operands[0], operands[3]);
>> 
>> It would be better to make operands[0] the destination of the UDOT and
>> drop the move.
>> 
>> Thanks,
>> Richard
>> 
>> > +    DONE;
>> > +  }
>> > +)
>> > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> > index b3b2d6e..20aa0e9 100644
>> > --- a/gcc/config/aarch64/iterators.md
>> > +++ b/gcc/config/aarch64/iterators.md
>> > @@ -1060,6 +1060,9 @@
>> >  ;; Map smax to smin and umax to umin.
>> >  (define_code_attr max_opp [(smax "smin") (umax "umin")])
>> >
>> > +;; Same as above, but louder.
>> > +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
>> 
>> :-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 02d33b7..e94801d 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3148,3 +3148,64 @@
>     movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
>    [(set_attr "movprfx" "*,yes")]
>  )
> +
> +;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers
> +;; the hassle of constructing the other arm of the MINUS.
> +(define_expand "<su>abd<mode>_3"
> +  [(use (match_operand:SVE_I 0 "register_operand"))
> +   (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand")
> +		(match_operand:SVE_I 2 "register_operand"))]
> +  "TARGET_SVE"
> +  {
> +    rtx pred = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
> +    rtx other_arm = gen_rtx_<MAX_OPP> (<MODE>mode, operands[1], operands[2]);
> +    emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], pred, operands[1],
> +					    operands[2], other_arm));
> +    DONE;
> +  }
> +)
> +
> +;; Predicated integer absolute difference.
> +(define_insn "aarch64_<su>abd<mode>_3"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +	(unspec:SVE_I
> +	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
> +	   (minus:SVE_I
> +	     (USMAX:SVE_I
> +	       (match_operand:SVE_I 2 "register_operand" "0, w")
> +	       (match_operand:SVE_I 3 "register_operand" "w, w"))
> +	     (match_operator 4 "aarch64_<max_opp>"
> +	       [(match_dup 2)
> +		(match_dup 3)]))]
> +	  UNSPEC_MERGE_PTRUE))]
> +  "TARGET_SVE"
> +  "@
> +   <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> +   movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
> +  [(set_attr "movprfx" "*,yes")]
> +)
> +
> +;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in
> +;; operands 1 and 2.  The sequence also has to perform a widening reduction of
> +;; the difference into a vector and accumulate that into operand 3 before
> +;; copying that into the result operand 0.
> +;; Perform that with a sequence of:
> +;; MOV		ones.b, #1
> +;; [SU]ABD	diff.b, p0/m, op1.b, op2.b
> +;; MOVPRFX	op0, op3	// If necessary
> +;; UDOT		op0.s, diff.b, ones.b
> +
> +(define_expand "<sur>sad<vsi2qi>"
> +  [(use (match_operand:SVE_SDI 0 "register_operand"))
> +   (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand"))
> +		    (use (match_operand:<VSI2QI> 2 "register_operand"))] ABAL)
> +   (use (match_operand:SVE_SDI 3 "register_operand"))]
> +  "TARGET_SVE"
> +  {
> +    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> +    rtx diff = gen_reg_rtx (<VSI2QI>mode);
> +    emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2]));
> +    emit_insn (gen_udot_prod<vsi2qi> (operands[0], diff, ones, operands[3]));
> +    DONE;
> +  }
> +)
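For reference, a scalar model of what this expander's [SU]ABD + UDOT sequence computes: each wide accumulator lane gathers the absolute differences of four narrow lanes, i.e. a dot product of the absolute differences with a vector of ones.  The function and test values below are illustrative only, not part of the patch.

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the [SU]ABD + UDOT sequence: each 32-bit lane of
   acc accumulates the absolute differences of four byte lanes of x
   and y, matching UABD followed by UDOT with a vector of ones.  */
static void
sad_step (uint32_t *acc, const uint8_t *x, const uint8_t *y, int nlanes)
{
  for (int lane = 0; lane < nlanes; lane++)
    for (int j = 0; j < 4; j++)
      {
	int diff = x[lane * 4 + j] - y[lane * 4 + j];	/* UABD */
	acc[lane] += (uint32_t) abs (diff);		/* UDOT with ones */
      }
}
```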
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b3b2d6e..20aa0e9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1060,6 +1060,9 @@
>  ;; Map smax to smin and umax to umin.
>  (define_code_attr max_opp [(smax "smin") (umax "umin")])
>  
> +;; Same as above, but louder.
> +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
> +
>  ;; The number of subvectors in an SVE_STRUCT.
>  (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
>  				(VNx8SI  "2") (VNx4DI  "2")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
> new file mode 100644
> index 0000000..e7bf64a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +#define DEF_SAD(TYPE1, TYPE2)						\
> +TYPE1 __attribute__ ((noinline, noclone))				\
> +sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n)	\
> +{									\
> +  TYPE1 sum = 0;							\
> +  for (int i = 0; i < n; i++)						\
> +    {									\
> +      sum += __builtin_abs (x[i] - y[i]);				\
> +    }									\
> +  return sum;								\
> +}
> +
> +DEF_SAD(int32_t, uint8_t)
> +DEF_SAD(int32_t, int8_t)
> +DEF_SAD(int64_t, uint16_t)
> +DEF_SAD(int64_t, int16_t)
> +
> +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 493c1ab..057a874 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5973,6 +5973,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
>    switch (code)
>      {
>      case DOT_PROD_EXPR:
> +    case SAD_EXPR:
>        return true;
>  
>      default:
> @@ -6002,6 +6003,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
>  	break;
>        }
>  
> +    case SAD_EXPR:
> +      {
> +	tree vectype = TREE_TYPE (vop[1]);
> +	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
> +	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
> +					       mask, vop[1], vop[0]);
> +	gsi_insert_before (gsi, select, GSI_SAME_STMT);
> +	vop[1] = masked_op1;
> +	break;
> +      }
> +
>      default:
>        gcc_unreachable ();
>      }
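The SAD_EXPR masking works because the fallback value of the VEC_COND_EXPR is the other SAD input: wherever the mask is false, the lane computes |a - a| = 0 and contributes nothing to the sum.  A scalar model of that idea (the function and values below are illustrative, not from the patch):

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the SAD_EXPR masking: rather than masking the result,
   the second input is replaced by the first wherever the mask is false,
   so inactive lanes add |a - a| = 0 to the reduction.  */
static uint32_t
masked_sad (const uint8_t *a, const uint8_t *b, const int *mask, int n)
{
  uint32_t sum = 0;
  for (int i = 0; i < n; i++)
    {
      /* VEC_COND_EXPR (mask, vop[1], vop[0]): fall back to a[i].  */
      uint8_t b_masked = mask[i] ? b[i] : a[i];
      sum += (uint32_t) abs (a[i] - b_masked);
    }
  return sum;
}
```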
