[PATCH V2] aarch64: Add vstN_lane_bf16 + vstNq_lane_bf16 intrinsics

Mon Oct 26 09:08:45 GMT 2020

Hi all,

Second version of the patch here implementing the bfloat16_t neon
related store intrinsics: vst2_lane_bf16, vst2q_lane_bf16,
vst3_lane_bf16, vst3q_lane_bf16 vst4_lane_bf16, vst4q_lane_bf16.

Please see refer to:
ACLE <https://developer.arm.com/docs/101028/latest>
ISA  <https://developer.arm.com/docs/ddi0596/latest>

This better narrows testcases so they do not cause regressions for the
arm backend where these intrinsics are not yet present.

Please see refer to:
ACLE <https://developer.arm.com/docs/101028/latest>
ISA  <https://developer.arm.com/docs/ddi0596/latest>

Okay for trunk?

Thanks!

  Andrea

-------------- next part --------------
>From 16803710f96889ec89349c5bb6ff1fb96a9d32d8 Mon Sep 17 00:00:00 2001
From: Andrea Corallo <andrea.corallo@arm.com>
Date: Thu, 8 Oct 2020 11:02:09 +0200
Subject: [PATCH] aarch64: Add vstN_lane_bf16 + vstNq_lane_bf16 intrinsics

gcc/ChangeLog

2020-10-19  Andrea Corallo  <andrea.corallo@arm.com>

	* config/aarch64/arm_neon.h (__STX_LANE_FUNC): Move to the bottom
	of the file so we can use these also for defining the bf16 related
	intrinsics.
	(vst2_lane_bf16, vst2q_lane_bf16, vst3_lane_bf16, vst3q_lane_bf16)
	(vst4_lane_bf16, vst4q_lane_bf16): Add new intrinsics.

gcc/testsuite/ChangeLog

2020-10-19  Andrea Corallo  <andrea.corallo@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
	(hbfloat16_t): Define type.
	(CHECK_FP): Make it working for bfloat types.
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c: New file.
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c:
	Likewise.
---
 gcc/config/aarch64/arm_neon.h                 | 534 +++++++++---------
 .../aarch64/advsimd-intrinsics/arm-neon-ref.h |   4 +-
 .../advsimd-intrinsics/bf16_vstN_lane_1.c     | 227 ++++++++
 .../advsimd-intrinsics/bf16_vstN_lane_2.c     |  52 ++
 .../vst2_lane_bf16_indices_1.c                |  16 +
 .../vst2q_lane_bf16_indices_1.c               |  16 +
 .../vst3_lane_bf16_indices_1.c                |  16 +
 .../vst3q_lane_bf16_indices_1.c               |  16 +
 .../vst4_lane_bf16_indices_1.c                |  16 +
 .../vst4q_lane_bf16_indices_1.c               |  16 +
 10 files changed, 656 insertions(+), 257 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2bb20e15069..0088ea9896f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10873,262 +10873,6 @@ __STRUCTN (poly, 8, 4)
 __STRUCTN (float, 64, 4)
 #undef __STRUCTN
 
-
-#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
-			qmode, ptr_mode, funcsuffix, signedtype)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			  intype __b, const int __c)			     \
-{									     \
-  __builtin_aarch64_simd_oi __o;					     \
-  largetype __temp;							     \
-  __temp.val[0]								     \
-    = vcombine_##funcsuffix (__b.val[0],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1]								     \
-    = vcombine_##funcsuffix (__b.val[1],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[1], 1); \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
-				     __ptr, __o, __c);			     \
-}
-
-__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-		 float16x8_t)
-__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-		 float32x4_t)
-__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-		 float64x2_t)
-__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-		 int16x8_t)
-__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
-		 poly64x2_t)
-__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-		 int16x8_t)
-__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-		 int32x4_t)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-		 int64x2_t)
-
-#undef __ST2_LANE_FUNC
-#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
-__extension__ extern __inline void					    \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
-			   intype __b, const int __c)			    \
-{									    \
-  union { intype __i;							    \
-	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		    \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __temp.__o, __c);		    \
-}
-
-__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
-
-#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
-			qmode, ptr_mode, funcsuffix, signedtype)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst3_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			  intype __b, const int __c)			     \
-{									     \
-  __builtin_aarch64_simd_ci __o;					     \
-  largetype __temp;							     \
-  __temp.val[0]								     \
-    = vcombine_##funcsuffix (__b.val[0],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1]								     \
-    = vcombine_##funcsuffix (__b.val[1],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[2]								     \
-    = vcombine_##funcsuffix (__b.val[2],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
-					     (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
-					     (signedtype) __temp.val[1], 1); \
-  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
-					     (signedtype) __temp.val[2], 2); \
-  __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
-				     __ptr, __o, __c);			     \
-}
-
-__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
-		 float16x8_t)
-__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
-		 float32x4_t)
-__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
-		 float64x2_t)
-__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
-		 int16x8_t)
-__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64,
-		 poly64x2_t)
-__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16,
-		 int16x8_t)
-__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32,
-		 int32x4_t)
-__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
-		 int64x2_t)
-
-#undef __ST3_LANE_FUNC
-#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
-__extension__ extern __inline void					    \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst3q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
-			   intype __b, const int __c)			    \
-{									    \
-  union { intype __i;							    \
-	  __builtin_aarch64_simd_ci __o; } __temp = { __b };		    \
-  __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __temp.__o, __c);		    \
-}
-
-__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
-__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
-
-#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
-			qmode, ptr_mode, funcsuffix, signedtype)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst4_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			  intype __b, const int __c)			     \
-{									     \
-  __builtin_aarch64_simd_xi __o;					     \
-  largetype __temp;							     \
-  __temp.val[0]								     \
-    = vcombine_##funcsuffix (__b.val[0],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1]								     \
-    = vcombine_##funcsuffix (__b.val[1],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[2]								     \
-    = vcombine_##funcsuffix (__b.val[2],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[3]								     \
-    = vcombine_##funcsuffix (__b.val[3],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
-					     (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
-					     (signedtype) __temp.val[1], 1); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
-					     (signedtype) __temp.val[2], 2); \
-  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
-					     (signedtype) __temp.val[3], 3); \
-  __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
-				     __ptr, __o, __c);			     \
-}
-
-__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
-		 float16x8_t)
-__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
-		 float32x4_t)
-__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
-		 float64x2_t)
-__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
-		 int16x8_t)
-__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64,
-		 poly64x2_t)
-__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16,
-		 int16x8_t)
-__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32,
-		 int32x4_t)
-__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
-		 int64x2_t)
-
-#undef __ST4_LANE_FUNC
-#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
-__extension__ extern __inline void					    \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst4q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
-			   intype __b, const int __c)			    \
-{									    \
-  union { intype __i;							    \
-	  __builtin_aarch64_simd_xi __o; } __temp = { __b };		    \
-  __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __temp.__o, __c);		    \
-}
-
-__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
-__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
-
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s32 (int32x2_t __a)
@@ -35568,6 +35312,284 @@ vaddq_p128 (poly128_t __a, poly128_t __b)
   return __a ^ __b;
 }
 
+/* vst2_lane */
+
+#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
+			qmode, ptr_mode, funcsuffix, signedtype)	     \
+__extension__ extern __inline void					     \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
+			  intype __b, const int __c)			     \
+{									     \
+  __builtin_aarch64_simd_oi __o;					     \
+  largetype __temp;							     \
+  __temp.val[0]								     \
+    = vcombine_##funcsuffix (__b.val[0],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[1]								     \
+    = vcombine_##funcsuffix (__b.val[1],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
+					     (signedtype) __temp.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
+					     (signedtype) __temp.val[1], 1); \
+  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
+				     __ptr, __o, __c);			     \
+}
+
+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
+		 float16x8_t)
+__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
+		 float32x4_t)
+__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
+		 float64x2_t)
+__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
+		 int8x16_t)
+__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
+		 int16x8_t)
+__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
+__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
+		 int8x16_t)
+__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
+		 int16x8_t)
+__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
+		 int32x4_t)
+__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
+		 int64x2_t)
+__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
+		 int8x16_t)
+__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
+		 int16x8_t)
+__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
+		 int32x4_t)
+__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
+		 int64x2_t)
+__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, bf16,
+		 bfloat16x8_t)
+
+/* vst2q_lane */
+
+#undef __ST2_LANE_FUNC
+#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
+__extension__ extern __inline void					    \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
+			   intype __b, const int __c)			    \
+{									    \
+  union { intype __i;							    \
+	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		    \
+  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+				    __ptr, __temp.__o, __c);		    \
+}
+
+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
+__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
+__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__ST2_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+
+/* vst3_lane */
+
+#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
+			qmode, ptr_mode, funcsuffix, signedtype)	     \
+__extension__ extern __inline void					     \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst3_lane_ ## funcsuffix (ptrtype *__ptr,				     \
+			  intype __b, const int __c)			     \
+{									     \
+  __builtin_aarch64_simd_ci __o;					     \
+  largetype __temp;							     \
+  __temp.val[0]								     \
+    = vcombine_##funcsuffix (__b.val[0],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[1]								     \
+    = vcombine_##funcsuffix (__b.val[1],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[2]								     \
+    = vcombine_##funcsuffix (__b.val[2],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
+					     (signedtype) __temp.val[0], 0); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
+					     (signedtype) __temp.val[1], 1); \
+  __o = __builtin_aarch64_set_qregci##qmode (__o,			     \
+					     (signedtype) __temp.val[2], 2); \
+  __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
+				     __ptr, __o, __c);			     \
+}
+
+__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
+		 float16x8_t)
+__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
+		 float32x4_t)
+__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
+		 float64x2_t)
+__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
+		 int8x16_t)
+__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
+		 int16x8_t)
+__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
+__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
+		 int8x16_t)
+__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
+		 int16x8_t)
+__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
+		 int32x4_t)
+__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64,
+		 int64x2_t)
+__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
+		 int8x16_t)
+__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16,
+		 int16x8_t)
+__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32,
+		 int32x4_t)
+__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
+		 int64x2_t)
+__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, bf16,
+		 bfloat16x8_t)
+
+/* vst3q_lane */
+
+#undef __ST3_LANE_FUNC
+#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
+__extension__ extern __inline void					    \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst3q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
+			   intype __b, const int __c)			    \
+{									    \
+  union { intype __i;							    \
+	  __builtin_aarch64_simd_ci __o; } __temp = { __b };		    \
+  __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+				    __ptr, __temp.__o, __c);		    \
+}
+
+__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
+__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
+__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
+__ST3_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16)
+
+/* vst4_lane */
+
+#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
+			qmode, ptr_mode, funcsuffix, signedtype)	     \
+__extension__ extern __inline void					     \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst4_lane_ ## funcsuffix (ptrtype *__ptr,				     \
+			  intype __b, const int __c)			     \
+{									     \
+  __builtin_aarch64_simd_xi __o;					     \
+  largetype __temp;							     \
+  __temp.val[0]								     \
+    = vcombine_##funcsuffix (__b.val[0],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[1]								     \
+    = vcombine_##funcsuffix (__b.val[1],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[2]								     \
+    = vcombine_##funcsuffix (__b.val[2],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __temp.val[3]								     \
+    = vcombine_##funcsuffix (__b.val[3],				     \
+			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
+					     (signedtype) __temp.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
+					     (signedtype) __temp.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
+					     (signedtype) __temp.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o,			     \
+					     (signedtype) __temp.val[3], 3); \
+  __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
+				     __ptr, __o, __c);			     \
+}
+
+__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
+		 float16x8_t)
+__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
+		 float32x4_t)
+__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
+		 float64x2_t)
+__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
+		 int8x16_t)
+__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
+		 int16x8_t)
+__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
+__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
+		 int8x16_t)
+__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
+		 int16x8_t)
+__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
+		 int32x4_t)
+__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64,
+		 int64x2_t)
+__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
+		 int8x16_t)
+__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16,
+		 int16x8_t)
+__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32,
+		 int32x4_t)
+__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
+		 int64x2_t)
+__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, bf16,
+		 bfloat16x8_t)
+
+/* vst4q_lane */
+
+#undef __ST4_LANE_FUNC
+#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
+__extension__ extern __inline void					    \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+vst4q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
+			   intype __b, const int __c)			    \
+{									    \
+  union { intype __i;							    \
+	  __builtin_aarch64_simd_xi __o; } __temp = { __b };		    \
+  __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+				    __ptr, __temp.__o, __c);		    \
+}
+
+__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
+__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
+__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
+__ST4_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16)
+
+#undef __ST4_LANE_FUNC
+
 /* vld2_lane */
 
 #define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode,	   \
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 791972c737e..61fe7e759dc 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -11,6 +11,8 @@ typedef uint16_t hfloat16_t;
 typedef uint32_t hfloat32_t;
 typedef uint64_t hfloat64_t;
 
+typedef uint16_t hbfloat16_t;
+
 extern void abort(void);
 extern void *memset(void *, int, size_t);
 extern void *memcpy(void *, const void *, size_t);
@@ -107,7 +109,7 @@ extern size_t strlen(const char *);
       {									\
 	union fp_operand {						\
 	  uint##W##_t i;						\
-	  float##W##_t f;						\
+	  T##W##_t f;							\
 	} tmp_res, tmp_exp;						\
 	tmp_res.f = VECT_VAR(result, T, W, N)[i];			\
 	tmp_exp.i = VECT_VAR(EXPECTED, h##T, W, N)[i];			\
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c
new file mode 100644
index 00000000000..2c70bb9de9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c
@@ -0,0 +1,227 @@
+/* { dg-do run { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results for vst2, chunk 0.  */
+VECT_VAR_DECL(expected_st2_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st2_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0x0, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst2, chunk 1.  */
+VECT_VAR_DECL(expected_st2_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st2_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 0.  */
+VECT_VAR_DECL(expected_st3_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0 };
+VECT_VAR_DECL(expected_st3_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 1.  */
+VECT_VAR_DECL(expected_st3_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st3_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+						 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 2.  */
+VECT_VAR_DECL(expected_st3_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st3_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 0.  */
+VECT_VAR_DECL(expected_st4_0,hbfloat,16,4) [] =
+  { 0xABAB, 0x3210, 0xCAFE, 0x1234 };
+VECT_VAR_DECL(expected_st4_0,hbfloat,16,8) [] =
+  { 0xABAB, 0x3210, 0xCAFE, 0x1234, 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 1.  */
+VECT_VAR_DECL(expected_st4_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 2.  */
+VECT_VAR_DECL(expected_st4_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+					          0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 3.  */
+VECT_VAR_DECL(expected_st4_3,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_3,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+						  0x0, 0x0, 0x0, 0x0 };
+
+typedef union
+{
+  bfloat16_t bf16;
+  uint16_t u16;
+} bfloat16_u_t;
+
+static bfloat16_t result_bfloat16x4[4];
+static bfloat16_t result_bfloat16x8[8];
+
+void exec_vstX_lane (void)
+{
+  bfloat16_u_t bfloat16_data[4];
+  bfloat16_data[0].u16 = 0xABAB;
+  bfloat16_data[1].u16 = 0x3210;
+  bfloat16_data[2].u16 = 0xCAFE;
+  bfloat16_data[3].u16 = 0x1234;
+
+  bfloat16_t buffer_vld2_lane_bfloat16x2 [2] =
+    { bfloat16_data[0].bf16,
+      bfloat16_data[1].bf16 };
+  bfloat16_t buffer_vld3_lane_bfloat16x3 [3] =
+    { bfloat16_data[0].bf16,
+      bfloat16_data[1].bf16,
+      bfloat16_data[2].bf16 };
+  bfloat16_t buffer_vld4_lane_bfloat16x4 [4] =
+    { bfloat16_data[0].bf16,
+      bfloat16_data[1].bf16,
+      bfloat16_data[2].bf16,
+      bfloat16_data[3].bf16 };
+
+  /* In this case, input variables are arrays of vectors.  */
+#define DECL_VSTX_LANE(T1, W, N, X)					\
+  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
+  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X);	\
+  VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
+
+  /* We need to use a temporary result buffer (result_bis), because
+     the one used for other tests is not large enough. A subset of the
+     result data is moved from result_bis to result, and it is this
+     subset which is used to check the actual behavior. The next
+     macro enables to move another chunk of data from result_bis to
+     result.  */
+  /* We also use another extra input buffer (buffer_src), which we
+     fill with 0xAA, and which it used to load a vector from which we
+     read a given lane.  */
+#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L)				 \
+  memset (VECT_VAR(buffer_src, T1, W, N), 0xAA,				 \
+	  sizeof(VECT_VAR(buffer_src, T1, W, N)));			 \
+  memset (VECT_VAR(result_bis_##X, T1, W, N), 0,			 \
+	  sizeof(VECT_VAR(result_bis_##X, T1, W, N)));			 \
+									 \
+  VECT_ARRAY_VAR(vector_src, T1, W, N, X) =				 \
+    vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N));		 \
+									 \
+  VECT_ARRAY_VAR(vector, T1, W, N, X) =					 \
+    /* Use dedicated init buffer, of size X.  */			 \
+    vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X),	 \
+			     VECT_ARRAY_VAR(vector_src, T1, W, N, X),	 \
+			     L);					 \
+  vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		 \
+			   VECT_ARRAY_VAR(vector, T1, W, N, X),		 \
+			   L);						 \
+  memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
+	 sizeof(VECT_VAR(result, T1, W, N)));
+
+  /* Overwrite "result" with the contents of "result_bis"[Y].  */
+#define TEST_EXTRA_CHUNK(T1, W, N, X, Y)		\
+  memcpy(VECT_VAR(result, T1, W, N),			\
+	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
+	 sizeof(VECT_VAR(result, T1, W, N)));
+
+#define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
+
+  DECL_VSTX_LANE(bfloat, 16, 4, 2);
+  DECL_VSTX_LANE(bfloat, 16, 8, 2);
+  DECL_VSTX_LANE(bfloat, 16, 4, 3);
+  DECL_VSTX_LANE(bfloat, 16, 8, 3);
+  DECL_VSTX_LANE(bfloat, 16, 4, 4);
+  DECL_VSTX_LANE(bfloat, 16, 8, 4);
+
+  DUMMY_ARRAY(buffer_src, bfloat, 16, 4, 4);
+  DUMMY_ARRAY(buffer_src, bfloat, 16, 8, 4);
+
+  /* Check vst2_lane/vst2q_lane.  */
+  clean_results ();
+  TEST_VSTX_LANE(, bfloat, bf, 16, 4, 2, 2);
+  TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 2, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+#undef TEST_MSG
+#define TEST_MSG "VST2_LANE/VST2Q_LANE"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_0, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_0, CMT);
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 2, 1);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 2, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_1, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_1, CMT);
+
+  /* Check vst3_lane/vst3q_lane.  */
+  clean_results ();
+#undef TEST_MSG
+#define TEST_MSG "VST3_LANE/VST3Q_LANE"
+  TEST_VSTX_LANE(, bfloat, bf, 16, 4, 3, 2);
+  TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 3, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_0, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_0, CMT);
+
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 1);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 1);
+
+
+#undef CMT
+#define CMT " (chunk 1)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_1, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_1, CMT);
+
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 2);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_2, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_2, CMT);
+
+  /* Check vst4_lane/vst4q_lane.  */
+  clean_results ();
+#undef TEST_MSG
+#define TEST_MSG "VST4_LANE/VST4Q_LANE"
+  TEST_VSTX_LANE(, bfloat, bf, 16, 4, 4, 2);
+  TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 4, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_0, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_0, CMT);
+
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 1);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_1, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_1, CMT);
+
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 2);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_2, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_2, CMT);
+
+  TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 3);
+  TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 3);
+
+#undef CMT
+#define CMT " (chunk 3)"
+  CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_3, CMT);
+  CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_3, CMT);
+}
+
+int main (void)
+{
+  exec_vstX_lane ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c
new file mode 100644
index 00000000000..f70c34dbd83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c
@@ -0,0 +1,52 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O2  --save-temps" } */
+
+#include <arm_neon.h>
+
+void
+test_vst2_lane_bf16 (bfloat16_t *ptr, bfloat16x4x2_t b)
+{
+  vst2_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st2\\t{v2.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst2q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x2_t b)
+{
+  vst2q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st2\\t{v0.h - v1.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst3_lane_bf16 (bfloat16_t *ptr, bfloat16x4x3_t b)
+{
+  vst3_lane_bf16 (ptr, b, 2);
+}
+
+void
+test_vst3q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x3_t b)
+{
+  vst3q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st3\\t{v4.h - v6.h}\\\[2\\\], \\\[x0\\\]" 2 } } */
+
+void
+test_vst4_lane_bf16 (bfloat16_t *ptr, bfloat16x4x4_t b)
+{
+  vst4_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st4\\t{v4.h - v7.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst4q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x4_t b)
+{
+  vst4q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st4\\t{v0.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..00e3b9bfe5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst2_lane_bf16 (bfloat16_t * p, bfloat16x4x2_t v)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst2_lane_bf16 (p, v, 4);
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst2_lane_bf16 (p, v, -1);
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..e39968008f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst2q_lane_bf16 (bfloat16_t * p, bfloat16x8x2_t v)
+{
+  /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst2q_lane_bf16 (p, v, 8);
+  /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst2q_lane_bf16 (p, v, -1);
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..9dcba196791
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst3_lane_bf16 (bfloat16_t * p, bfloat16x4x3_t v)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst3_lane_bf16 (p, v, 4);
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst3_lane_bf16 (p, v, -1);
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..f9aa76a06dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst3q_lane_bf16 (bfloat16_t * p, bfloat16x8x3_t v)
+{
+  /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst3q_lane_bf16 (p, v, 8);
+  /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst3q_lane_bf16 (p, v, -1);
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..405b583aabd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst4_lane_bf16 (bfloat16_t * p, bfloat16x4x4_t v)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst4_lane_bf16 (p, v, 4);
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  vst4_lane_bf16 (p, v, -1);
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c
new file mode 100644
index 00000000000..51e372197b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+
+void
+f_vst4q_lane_bf16 (bfloat16_t * p, bfloat16x8x4_t v)
+{
+  /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst4q_lane_bf16 (p, v, 8);
+  /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+  vst4q_lane_bf16 (p, v, -1);
+  return;
+}
-- 
2.20.1