[gcc(refs/vendors/ARM/heads/arm-struct-reorg-wip)] arm: ACLE intrinsics for bfloat16 dot product
Tamar Christina
tnfchris@gcc.gnu.org
Fri Jul 17 13:10:50 GMT 2020
https://gcc.gnu.org/g:eb7ba6c36b8a17c79936abe26245e4bc66bb8859
commit eb7ba6c36b8a17c79936abe26245e4bc66bb8859
Author: Dennis Zhang <dennis.zhang@arm.com>
Date: Tue Feb 25 17:38:00 2020 +0000
arm: ACLE intrinsics for bfloat16 dot product
This patch is part of a series adding support for Armv8.6-A features.
It adds intrinsics for the Brain half-precision floating-point (BF16) dot
product instructions with AdvSIMD support.
gcc/ChangeLog:
2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
* config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New.
(vbfdot_lane_f32, vbfdotq_laneq_f32): New.
(vbfdot_laneq_f32, vbfdotq_lane_f32): New.
* config/arm/arm_neon_builtins.def (vbfdot): New entry.
(vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise.
* config/arm/iterators.md (VSF2BF): New attribute.
* config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New entry.
(neon_vbfdot_lanev4bf<VCVTF:mode>): Likewise.
(neon_vbfdot_lanev8bf<VCVTF:mode>): Likewise.
gcc/testsuite/ChangeLog:
2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
* gcc.target/arm/simd/bf16_dot_1.c: New test.
* gcc.target/arm/simd/bf16_dot_2.c: New test.
* gcc.target/arm/simd/bf16_dot_3.c: New test.
Diff:
---
gcc/ChangeLog | 12 +++
gcc/config/arm/arm_neon.h | 52 +++++++++++++
gcc/config/arm/arm_neon_builtins.def | 4 +
gcc/config/arm/iterators.md | 2 +
gcc/config/arm/neon.md | 48 ++++++++++++
gcc/testsuite/ChangeLog | 6 ++
gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c | 100 +++++++++++++++++++++++++
gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c | 33 ++++++++
gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c | 33 ++++++++
9 files changed, 290 insertions(+)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 14a4b1a06e8..bbb4a656553 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
+
+ * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New.
+ (vbfdot_lane_f32, vbfdotq_laneq_f32): New.
+ (vbfdot_laneq_f32, vbfdotq_lane_f32): New.
+ * config/arm/arm_neon_builtins.def (vbfdot): New entry.
+ (vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise.
+ * config/arm/iterators.md (VSF2BF): New attribute.
+ * config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New entry.
+ (neon_vbfdot_lanev4bf<VCVTF:mode>): Likewise.
+ (neon_vbfdot_lanev8bf<VCVTF:mode>): Likewise.
+
2020-02-25 Christophe Lyon <christophe.lyon@linaro.org>
* config/arm/arm.md (required_for_purecode): New attribute.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index e81681aa415..d2ebee40538 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18819,6 +18819,58 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
#pragma GCC pop_options
+/* AdvSIMD Brain half-precision floating-point (Bfloat16) intrinsics. */
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+ return __builtin_neon_vbfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_neon_vbfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+ const int __index) /* Lane into __b; bf16_dot_2.c expects range 0 - 1. */
+{
+ return __builtin_neon_vbfdot_lanev4bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index) /* Lane into __b; bf16_dot_3.c expects range 0 - 3. */
+{
+ return __builtin_neon_vbfdot_lanev8bfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+ const int __index) /* Lane into __b; bf16_dot_2.c expects range 0 - 3. */
+{
+ return __builtin_neon_vbfdot_lanev8bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index) /* Lane into __b; bf16_dot_3.c expects range 0 - 1. */
+{
+ return __builtin_neon_vbfdot_lanev4bfv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
#ifdef __cplusplus
}
#endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f4a97fd764c..4a6f4cfc44e 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -381,3 +381,7 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
VAR1 (TERNOP, smmla, v16qi)
VAR1 (UTERNOP, ummla, v16qi)
VAR1 (USTERNOP, usmmla, v16qi)
+
+VAR2 (TERNOP, vbfdot, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 136c45274ae..b435a05d219 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -835,6 +835,8 @@
(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+(define_mode_attr VSF2BF [(V2SF "V4BF") (V4SF "V8BF")]) ; BF16 vector mode paired with each SF vector mode.
+
;;----------------------------------------------------------------------------
;; Code attributes
;;----------------------------------------------------------------------------
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 039cd90c3da..80e94de4b84 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -6596,3 +6596,51 @@ if (BYTES_BIG_ENDIAN)
"v<sup>mmla.<mmla_sfx>\t%q0, %q2, %q3"
[(set_attr "type" "neon_mla_s_q")]
)
+
+(define_insn "neon_vbfdot<VCVTF:mode>"
+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
+ (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0") ; Accumulator input, tied to operand 0.
+ (unspec:VCVTF [
+ (match_operand:<VSF2BF> 2 "register_operand" "w") ; BF16 multiplicands use the VSF2BF mode of the result.
+ (match_operand:<VSF2BF> 3 "register_operand" "w")]
+ UNSPEC_DOT_S)))]
+ "TARGET_BF16_SIMD"
+ "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev4bf<VCVTF:mode>"
+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
+ (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0") ; Accumulator input, tied to operand 0.
+ (unspec:VCVTF [
+ (match_operand:<VSF2BF> 2 "register_operand" "w")
+ (match_operand:V4BF 3 "register_operand" "x") ; Always V4BF, whatever the result mode; indexed by operand 4.
+ (match_operand:SI 4 "immediate_operand" "i")] ; Lane index, printed unchanged inside [ ].
+ UNSPEC_DOT_S)))]
+ "TARGET_BF16_SIMD"
+ "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev8bf<VCVTF:mode>"
+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
+ (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+ (unspec:VCVTF [
+ (match_operand:<VSF2BF> 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "x")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_DOT_S)))]
+ "TARGET_BF16_SIMD"
+ {
+ int lane = INTVAL (operands[4]); /* Lane requested by the caller (tests expect 0 - 3). */
+ int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4; /* 8 / 4 = 2 for the V8BF operand. */
+ if (lane < half) /* NOTE(review): %e3 / %f3 presumably print the low / high D register of operand 3 -- confirm in arm_print_operand. */
+ return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; /* Lanes 0 and 1: index printed unchanged. */
+ else
+ {
+ operands[4] = GEN_INT (lane - half); /* Lanes 2 and 3 are rebased to 0 and 1. */
+ return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+ }
+ }
+ [(set_attr "type" "neon_dot<q>")]
+)
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 922ae5c4b7a..7b24b54fac7 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
+
+ * gcc.target/arm/simd/bf16_dot_1.c: New test.
+ * gcc.target/arm/simd/bf16_dot_2.c: New test.
+ * gcc.target/arm/simd/bf16_dot_3.c: New test.
+
2020-02-25 Jakub Jelinek <jakub@redhat.com>
PR rtl-optimization/93908
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
new file mode 100644
index 00000000000..4487152d6cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
@@ -0,0 +1,100 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-save-temps -O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* BF16 DOT without lane: both multiplicands are passed as whole vectors. */
+float32x2_t
+test_vbfdot_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+ /* Expect: vdot.bf16 d, d, d (three D registers, no lane index). */
+ return vbfdot_f32 (r, a, b);
+}
+
+float32x4_t
+test_vbfdotq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 q, q, q (three Q registers, no lane index). */
+ return vbfdotq_f32 (r, a, b);
+}
+
+/* 64-bit BF16 DOT with lane: float32x2_t accumulator, lane taken from b. */
+float32x2_t
+test_vbfdot_lane_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[0] (lane index 0 printed unchanged). */
+ return vbfdot_lane_f32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vbfdot_lane_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[1] (lane index 1 printed unchanged). */
+ return vbfdot_lane_f32 (r, a, b, 1);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[0] (laneq index 0 is below 2, so it is printed unchanged). */
+ return vbfdot_laneq_f32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[1] (laneq index 1 is below 2, so it is printed unchanged). */
+ return vbfdot_laneq_f32 (r, a, b, 1);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_2 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[0] (laneq index 2 is rebased to 0 by neon_vbfdot_lanev8bf). */
+ return vbfdot_laneq_f32 (r, a, b, 2);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_3 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 d, d, d[1] (laneq index 3 is rebased to 1 by neon_vbfdot_lanev8bf). */
+ return vbfdot_laneq_f32 (r, a, b, 3);
+}
+
+/* 128-bit BF16 DOT with lane: float32x4_t accumulator, lane taken from b. */
+float32x4_t
+test_vbfdotq_lane_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* Expect: vdot.bf16 q, q, d[0] (Q accumulator and multiplicand, lane 0 of a D register). */
+ return vbfdotq_lane_f32 (r, a, b, 0);
+}
+
+float32x4_t
+test_vbfdotq_lane_f32_1 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* Expect: vdot.bf16 q, q, d[1] (Q accumulator and multiplicand, lane 1 of a D register). */
+ return vbfdotq_lane_f32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 q, q, d[0] (laneq index 0 is below 2, so it is printed unchanged). */
+ return vbfdotq_laneq_f32 (r, a, b, 0);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_3 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* Expect: vdot.bf16 q, q, d[1] (laneq index 3 is rebased to 1 by neon_vbfdot_lanev8bf). */
+ return vbfdotq_laneq_f32 (r, a, b, 3);
+}
+
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
new file mode 100644
index 00000000000..d2ef344c68b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_lane_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
+ return vbfdot_lane_f32 (r, a, b, -1); /* One below the lowest valid lane (0). */
+}
+
+float32x2_t
+test_vbfdot_lane_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
+ return vbfdot_lane_f32 (r, a, b, 2); /* One past the highest valid lane (1). */
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vbfdot_laneq_f32 (r, a, b, -1); /* One below the lowest valid lane (0). */
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vbfdot_laneq_f32 (r, a, b, 4); /* One past the highest valid lane (3). */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c
new file mode 100644
index 00000000000..93f08f02bc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x4_t
+test_vbfdotq_lane_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
+ return vbfdotq_lane_f32 (r, a, b, -1); /* One below the lowest valid lane (0). */
+}
+
+float32x4_t
+test_vbfdotq_lane_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
+ return vbfdotq_lane_f32 (r, a, b, 2); /* One past the highest valid lane (1). */
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vbfdotq_laneq_f32 (r, a, b, -1); /* One below the lowest valid lane (0). */
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vbfdotq_laneq_f32 (r, a, b, 4); /* One past the highest valid lane (3). */
+}
More information about the Gcc-cvs
mailing list