[GCC][PATCH][AArch64]Add ACLE intrinsics for bfdot for ARMv8.6 Extension

Fri Dec 20 14:36:00 GMT 2019

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> Hi all,
>
> This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot 
> operation.
>
> The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option 
> as required.
>
> RTL patterns are defined to generate assembler.
>
> Tests added to verify expected assembly and perform adequate lane checks.
>
> This patch depends on:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
>
> for testuite effective_target update and on:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html
>
> for back-end Bfloat enablement.
>
> Cheers,
> Stam
>
>
> gcc/ChangeLog:
>
> 2019-11-04  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
>
> 	* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
>            aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
> 	* config/aarch64/aarch64-simd.md
>            (aarch64_bfdot, aarch64_bfdot_lane): New.
> 	* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32,
>            vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New.
>   	* config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype,
>            VBFMLA_W): New.

Changelog nit: the continuation lines should be indened by a tab only.

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7027,3 +7027,37 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
> +		    (unspec:VDQSF [(match_operand:<VBFMLA_W> 2
> +						"register_operand" "w")
> +				   (match_operand:<VBFMLA_W> 3
> +						"register_operand" "w")]
> +				   UNSPEC_BFDOT)))]

The operands to the plus should be the other way around, so that
the more complicated operand comes first,

> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
> +		    (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2
> +						"register_operand" "w")
> +				   (match_operand: VBF 3

Nit: should be no space before "VBF".

> +						"register_operand" "w")
> +				   (match_operand:SI 4
> +						"const_int_operand" "n")]
> +				   UNSPEC_BFDOT)))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] =  gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);

Should only be one space after "=".

> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a)
>  
>  #include "arm_bf16.h"
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
> +{
> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_lane_f32 \
> +      (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index)

Stray backslash (same comment as for the USDOT/SUDOT review
just posted).

> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,80 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "--save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**	...
> +**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
> +**	...
> +**	ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}

Same comments as for SUDOT and USDOT here too.

Thanks,
Richard