[GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]

Richard Sandiford richard.sandiford@arm.com
Thu Dec 19 10:07:00 GMT 2019


Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> [...]
> @@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
>        return float_type_node;
>      case E_DFmode:
>        return double_type_node;
> +    case E_BFmode:
> +      return aarch64_bf16_type_node;
>      default:
>        gcc_unreachable ();
>      }
> @@ -750,6 +759,11 @@ aarch64_init_simd_builtin_types (void)
>    aarch64_simd_types[Float64x1_t].eltype = double_type_node;
>    aarch64_simd_types[Float64x2_t].eltype = double_type_node;
>  
> +
> +/* Init Bfloat vector types with underlying uint types.  */
> +  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
> +  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;

Formatting nits: too many blank lines, comment should be indented
to match the code.

> +
>    for (i = 0; i < nelts; i++)
>      {
>        tree eltype = aarch64_simd_types[i].eltype;
> @@ -1059,6 +1073,19 @@ aarch64_init_fp16_types (void)
>    aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
>  }
>  
> +/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
> +static void
> +aarch64_init_bf16_types (void)
> +{
> +  aarch64_bf16_type_node = make_node (REAL_TYPE);
> +  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
> +  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
> +  layout_type (aarch64_bf16_type_node);
> +
> +  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, "__bf16");

This style is mostly a carry-over from pre-ANSI days.  New code
can just use "lang_hooks.types.register_builtin_type (...)".

> +  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
> +}
> +
>  /* Pointer authentication builtins that will become NOP on legacy platform.
>     Currently, these builtins are for internal use only (libgcc EH unwinder).  */
>  
> [...]
> diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
> index b015694293c..3b387377f38 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
> @@ -50,3 +50,5 @@
>    ENTRY (Float32x4_t, V4SF, none, 13)
>    ENTRY (Float64x1_t, V1DF, none, 13)
>    ENTRY (Float64x2_t, V2DF, none, 13)
> +  ENTRY (Bfloat16x4_t, V4BF, none, 15)
> +  ENTRY (Bfloat16x8_t, V8BF, none, 15)

Should be 14 (number of characters + 2 for "__").  Would be good to have
a test for correct C++ mangling.

> [...]
> @@ -101,10 +101,10 @@
>    [(set_attr "type" "neon_dup<q>")]
>  )
>  
> -(define_insn "*aarch64_simd_mov<VD:mode>"
> -  [(set (match_operand:VD 0 "nonimmediate_operand"
> +(define_insn "*aarch64_simd_mov<VDMOV:mode>"
> +  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
>  		"=w, m,  m,  w, ?r, ?w, ?r, w")
> -	(match_operand:VD 1 "general_operand"
> +	(match_operand:VDMOV 1 "general_operand"
>  		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
>     && (register_operand (operands[0], <MODE>mode)
> @@ -126,13 +126,14 @@
>  }
>    [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
>  		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
> -		     mov_reg, neon_move<q>")]
> +		     mov_reg, neon_move<q>")
> +    (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
>  )

Together with the changes to the arch attribute:

> @@ -378,6 +378,12 @@
>  	(and (eq_attr "arch" "fp16")
>  	     (match_test "TARGET_FP_F16INST"))
>  
> +	(and (eq_attr "arch" "fp16_notbf16")
> +	     (match_test "TARGET_FP_F16INST && !TARGET_BF16_FP"))
> +
> +	(and (eq_attr "arch" "notbf16")
> +	     (match_test "!TARGET_BF16_SIMD"))
> +
>  	(and (eq_attr "arch" "sve")
>  	     (match_test "TARGET_SVE")))
>      (const_string "yes")

this will disable the second and final alternatives for all VDMOV modes
when bf16 is enabled.  E.g. enabling bf16 will disable those alternatives
for V4HI as well as V4BF.

If you want to disable some alternatives for V4BF then it'd be better to
use define_mode_attr instead.  But are you sure we need to disable them?
The m<-Dz alternative should work for V4BF as well.  The w<-Dn alternative
should work too -- it's up to aarch64_simd_valid_immediate to decide
which immediates are valid.

> [...]
> @@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>  extern tree aarch64_fp16_type_node;
>  extern tree aarch64_fp16_ptr_type_node;
>  
> +/* This type is the user-visible __bf16, and a pointer to that type.  We
> +   need it in many places in the backend.  Defined in aarch64-builtins.c.  */

Not sure the number of places in this patch counts as "many" :-)
Probably best just to drop that sentence.

> +extern tree aarch64_bf16_type_node;
> +extern tree aarch64_bf16_ptr_type_node;
> +
>  /* The generic unwind code in libgcc does not initialize the frame pointer.
>     So in order to unwind a function using a frame pointer, the very first
>     function that is unwound must save the frame pointer.  That way the frame
> [...]
> @@ -1321,11 +1327,11 @@
>    }
>  )
>  
> -(define_insn "*movhf_aarch64"
> -  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
> -	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
> -    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
> +(define_insn "*mov<mode>_aarch64"
> +  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
> +	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
> +  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> +    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
>    "@
>     movi\\t%0.4h, #0
>     fmov\\t%h0, %w1
> @@ -1341,7 +1347,7 @@
>     mov\\t%w0, %w1"
>    [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
>  		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
> -   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
> +   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16_notbf16,simd,*,*,*,*,*")]
>  )

Here too we should avoid changing "arch" if possible.  Why do you need
to exclude the FMOV alternative for bf16?

> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
> new file mode 100644
> index 00000000000..aedb0972735
> --- /dev/null
> +++ b/gcc/config/aarch64/arm_bf16.h
> @@ -0,0 +1,42 @@
> +/* Arm BF16 instrinsics include file.
> +
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   Contributed by Arm.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; either version 3, or (at your
> +   option) any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but WITHOUT
> +   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> +   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
> +   License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _AARCH64_BF16_H_
> +#define _AARCH64_BF16_H_
> +
> +#include <stdint.h>

Are we supposed to include stdint.h?  The ACLE spec doesn't seem
to require it.

> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> +
> +typedef __bf16 bfloat16_t;
> +
> +
> +#endif
> +#pragma GCC pop_options
> +
> +#endif

Are you sure we need the #ifdef?  The target pragma should guarantee
that the macro's defined.

But the validity of the typedef shouldn't depend on target options,
so AFAICT this should just be:

typedef __bf16 bfloat16_t;

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
> new file mode 100644
> index 00000000000..f2bef671deb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
> @@ -0,0 +1,51 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O3 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**stacktest1:
> +**	...
> +**	str	h0, \[sp, [0-9]+\]
> +**	ldr	h0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16_t stacktest1 (bfloat16_t __a)
> +{
> +  volatile bfloat16_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest2:
> +**	...
> +**	str	d0, \[sp, [0-9]+\]
> +**	ldr	d0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
> +{
> +  volatile bfloat16x4_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest3:
> +**	...
> +**	str	q0, \[sp\]
> +**	ldr	q0, \[sp\]
> +**	...
> +**	ret
> +*/
> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
> +{
> +  volatile bfloat16x8_t b = __a;
> +  return b;
> +}
> +
> +

It would be good to have more test coverage than this.  E.g.:

- a test that includes arm_bf16.h, with just scalar tests.

- a test that includes arm_bf16.h without bf16 enabled, switches bf16 on,
  and then uses bfloat16_t.

- a test that includes arm_bf16.h without bf16 enabled and tries to use
  bfloat16_t without turning bf16 on.

- a test for _Complex bfloat16_t.

- a test for moves involving:

    typedef bfloat16_t v16bf __attribute__((vector_size(32)));

- a test that involves moving constants, for both scalars and vectors.
  You can create zero scalar constants in C++ using bfloat16_t() etc.
  For vectors it's possible to do things like:

    typedef short v2hi __attribute__((vector_size(4)));
    v2hi foo (void) { return (v2hi) 0x12345678; }

  The same sort of things should work for bfloat16x4_t and bfloat16x8_t.

Thanks,
Richard



More information about the Gcc-patches mailing list