This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Re: [AARCH64] Neon vld1_*_x3, vst1_*_x2 and vst1_*_x3 intrinsics
On 11 April 2018 at 15:53, Sudakshina Das <sudi.das@arm.com> wrote:
> Hi Sameera
>
>
> On 11/04/18 09:04, Sameera Deshpande wrote:
>>
>> On 10 April 2018 at 20:07, Sudakshina Das <sudi.das@arm.com> wrote:
>>>
>>> Hi Sameera
>>>
>>>
>>> On 10/04/18 11:20, Sameera Deshpande wrote:
>>>>
>>>>
>>>> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org>
>>>> wrote:
>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande
>>>>> <sameera.deshpande@linaro.org>:
>>>>>>
>>>>>>
>>>>>> Hi Christophe,
>>>>>>
>>>>>> Please find attached the updated patch with testcases.
>>>>>>
>>>>>> Ok for trunk?
>>>>>
>>>>>
>>>>>
>>>>> Thanks for the update.
>>>>>
>>>>> Since the new intrinsics are only available on aarch64, you want to
>>>>> prevent the tests from running on arm.
>>>>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two
>>>>> targets.
>>>>> There are several examples on how to do that in that directory.
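>>>>>
>>>>> For instance (an untested sketch), a directive along these lines
>>>>> keeps an aarch64-only test from running on arm:
>>>>>
>>>>> /* { dg-skip-if "aarch64 only" { arm*-*-* } } */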
>>>>>
>>>>> I have also noticed that the tests fail at execution on aarch64_be.
>>>>>
>>>>> I didn't look at the patch in detail.
>>>>>
>>>>> Christophe
>>>>>
>>>>>
>>>>>>
>>>>>> - Thanks and regards,
>>>>>> Sameera D.
>>>>>>
>>>>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon
>>>>>> <christophe.lyon@linaro.org>:
>>>>>>>
>>>>>>>
>>>>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande
>>>>>>> <sameera.deshpande@linaro.org>:
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi!
>>>>>>>>
>>>>>>>> Please find attached the patch implementing the vld1_*_x3, vst1_*_x2
>>>>>>>> and vst1_*_x3 intrinsics as defined by the Neon intrinsics document.
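>>>>>>>>
>>>>>>>> For illustration, the new intrinsics are used like this (a usage
>>>>>>>> sketch based on the signatures in the patch, not part of the
>>>>>>>> patch itself):
>>>>>>>>
>>>>>>>>   #include <arm_neon.h>
>>>>>>>>
>>>>>>>>   void
>>>>>>>>   copy24 (uint8_t *dst, const uint8_t *src)
>>>>>>>>   {
>>>>>>>>     /* vld1_u8_x3 loads 3 x 8 consecutive bytes with a single
>>>>>>>>        LD1 {v0.8b - v2.8b} instruction...  */
>>>>>>>>     uint8x8x3_t v = vld1_u8_x3 (src);
>>>>>>>>     /* ...and vst1_u8_x3 stores them back with a single ST1.  */
>>>>>>>>     vst1_u8_x3 (dst, v);
>>>>>>>>   }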
>>>>>>>>
>>>>>>>> Ok for trunk?
>>>>>>>>
>>>>>>>> - Thanks and regards,
>>>>>>>> Sameera D.
>>>>>>>>
>>>>>>>> gcc/Changelog:
>>>>>>>>
>>>>>>>> 2017-11-14 Sameera Deshpande <sameera.deshpande@linaro.org>
>>>>>>>>
>>>>>>>>
>>>>>>>> * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.
>>>>>>>> (st1x2): Likewise.
>>>>>>>> (st1x3): Likewise.
>>>>>>>> * config/aarch64/aarch64-simd.md
>>>>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.
>>>>>>>> (aarch64_ld1_x3_<mode>): Likewise.
>>>>>>>> (aarch64_st1x2<VALLDIF:mode>): Likewise.
>>>>>>>> (aarch64_st1_x2_<mode>): Likewise.
>>>>>>>> (aarch64_st1x3<VALLDIF:mode>): Likewise.
>>>>>>>> (aarch64_st1_x3_<mode>): Likewise.
>>>>>>>> * config/aarch64/arm_neon.h (vld1_u8_x3): New function.
>>>>>>>> (vld1_s8_x3): Likewise.
>>>>>>>> (vld1_u16_x3): Likewise.
>>>>>>>> (vld1_s16_x3): Likewise.
>>>>>>>> (vld1_u32_x3): Likewise.
>>>>>>>> (vld1_s32_x3): Likewise.
>>>>>>>> (vld1_u64_x3): Likewise.
>>>>>>>> (vld1_s64_x3): Likewise.
>>>>>>>> (vld1_f16_x3): Likewise.
>>>>>>>> (vld1_f32_x3): Likewise.
>>>>>>>> (vld1_f64_x3): Likewise.
>>>>>>>> (vld1_p8_x3): Likewise.
>>>>>>>> (vld1_p16_x3): Likewise.
>>>>>>>> (vld1_p64_x3): Likewise.
>>>>>>>> (vld1q_u8_x3): Likewise.
>>>>>>>> (vld1q_s8_x3): Likewise.
>>>>>>>> (vld1q_u16_x3): Likewise.
>>>>>>>> (vld1q_s16_x3): Likewise.
>>>>>>>> (vld1q_u32_x3): Likewise.
>>>>>>>> (vld1q_s32_x3): Likewise.
>>>>>>>> (vld1q_u64_x3): Likewise.
>>>>>>>> (vld1q_s64_x3): Likewise.
>>>>>>>> (vld1q_f16_x3): Likewise.
>>>>>>>> (vld1q_f32_x3): Likewise.
>>>>>>>> (vld1q_f64_x3): Likewise.
>>>>>>>> (vld1q_p8_x3): Likewise.
>>>>>>>> (vld1q_p16_x3): Likewise.
>>>>>>>> (vld1q_p64_x3): Likewise.
>>>>>>>> (vst1_s64_x2): Likewise.
>>>>>>>> (vst1_u64_x2): Likewise.
>>>>>>>> (vst1_f64_x2): Likewise.
>>>>>>>> (vst1_s8_x2): Likewise.
>>>>>>>> (vst1_p8_x2): Likewise.
>>>>>>>> (vst1_s16_x2): Likewise.
>>>>>>>> (vst1_p16_x2): Likewise.
>>>>>>>> (vst1_s32_x2): Likewise.
>>>>>>>> (vst1_u8_x2): Likewise.
>>>>>>>> (vst1_u16_x2): Likewise.
>>>>>>>> (vst1_u32_x2): Likewise.
>>>>>>>> (vst1_f16_x2): Likewise.
>>>>>>>> (vst1_f32_x2): Likewise.
>>>>>>>> (vst1_p64_x2): Likewise.
>>>>>>>> (vst1q_s8_x2): Likewise.
>>>>>>>> (vst1q_p8_x2): Likewise.
>>>>>>>> (vst1q_s16_x2): Likewise.
>>>>>>>> (vst1q_p16_x2): Likewise.
>>>>>>>> (vst1q_s32_x2): Likewise.
>>>>>>>> (vst1q_s64_x2): Likewise.
>>>>>>>> (vst1q_u8_x2): Likewise.
>>>>>>>> (vst1q_u16_x2): Likewise.
>>>>>>>> (vst1q_u32_x2): Likewise.
>>>>>>>> (vst1q_u64_x2): Likewise.
>>>>>>>> (vst1q_f16_x2): Likewise.
>>>>>>>> (vst1q_f32_x2): Likewise.
>>>>>>>> (vst1q_f64_x2): Likewise.
>>>>>>>> (vst1q_p64_x2): Likewise.
>>>>>>>> (vst1_s64_x3): Likewise.
>>>>>>>> (vst1_u64_x3): Likewise.
>>>>>>>> (vst1_f64_x3): Likewise.
>>>>>>>> (vst1_s8_x3): Likewise.
>>>>>>>> (vst1_p8_x3): Likewise.
>>>>>>>> (vst1_s16_x3): Likewise.
>>>>>>>> (vst1_p16_x3): Likewise.
>>>>>>>> (vst1_s32_x3): Likewise.
>>>>>>>> (vst1_u8_x3): Likewise.
>>>>>>>> (vst1_u16_x3): Likewise.
>>>>>>>> (vst1_u32_x3): Likewise.
>>>>>>>> (vst1_f16_x3): Likewise.
>>>>>>>> (vst1_f32_x3): Likewise.
>>>>>>>> (vst1_p64_x3): Likewise.
>>>>>>>> (vst1q_s8_x3): Likewise.
>>>>>>>> (vst1q_p8_x3): Likewise.
>>>>>>>> (vst1q_s16_x3): Likewise.
>>>>>>>> (vst1q_p16_x3): Likewise.
>>>>>>>> (vst1q_s32_x3): Likewise.
>>>>>>>> (vst1q_s64_x3): Likewise.
>>>>>>>> (vst1q_u8_x3): Likewise.
>>>>>>>> (vst1q_u16_x3): Likewise.
>>>>>>>> (vst1q_u32_x3): Likewise.
>>>>>>>> (vst1q_u64_x3): Likewise.
>>>>>>>> (vst1q_f16_x3): Likewise.
>>>>>>>> (vst1q_f32_x3): Likewise.
>>>>>>>> (vst1q_f64_x3): Likewise.
>>>>>>>> (vst1q_p64_x3): Likewise.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Hi,
>>>>>>> I'm not a maintainer, but I suspect you should add some tests.
>>>>>>>
>>>>>>> Christophe
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> --
>>>>>> - Thanks and regards,
>>>>>> Sameera D.
>>>>
>>>>
>>>>
>>>> Hi Christophe,
>>>>
>>>> Please find attached the updated patch. As with the testcase
>>>> vld1x2.c, I have marked the new testcases XFAIL for ARM, as the
>>>> intrinsics are not implemented there yet. I have also made
>>>> little-endian a required effective target.
>>>
>>>
>>>
>>> I am not a maintainer either. Shouldn't these intrinsics be supported
>>> even for big endian?
>>>
>>
>> Yes, they should be implemented; however, that is out of the scope of
>> this patch.
>>
>>> From your patch:
>>>
>>>> diff --git
>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
>>>
>>>
>>> new file mode 100644
>>> index 0000000..c37c72c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
>>> @@ -0,0 +1,82 @@
>>> +/* We haven't implemented these intrinsics for arm yet. */
>>> +/* { dg-xfail-if "" { arm*-*-* } } */
>>> +/* { dg-require-effective-target aarch64_little_endian } */
>>>
>>> According to
>>>
>>> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets
>>> this must follow the dg-do directive.
>>>
>>> Also, I think the require-effective-target directive will only allow
>>> the test to run on aarch64*-*-*, so the xfail on arm*-*-* is not
>>> really helpful. Maybe something like:
>>>
>>> /* { dg-require-effective-target aarch64_little_endian { target {
>>> aarch64*-*-* } } } */
>>
>>
>> Ok, then I will restrict this test to aarch64_little_endian, as that
>> is the purpose of this test.
>>
>>>
>>> That way the check is not performed on arm*-*-*, the test runs and
>>> fails there, and the xfail makes more sense.
>>>
>>> In case the big endian version is also expected to be implemented
>>> in the future, something like:
>>>
>>> /* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */
>>>
>>> or
>>>
>>> /* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */
>>>
>>> would be simpler. (PS: I haven't tested any of these directives myself.)
>>>
>>
>> Please find attached updated patch.
>
>
> Thank you for making the edits. One last nit from my side: as I
> mentioned earlier, the dg-require-effective-target directive should go
> after the dg-do directive, as described here:
> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets
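> I.e. the directives would then read, in this order (sketch):
>
>   /* { dg-do run } */
>   /* { dg-require-effective-target aarch64_little_endian } */
>   /* { dg-options "-O3" } */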
>
Thanks for pointing that out.
Please find attached the updated patch.
> Thanks
> Sudi
>
>
>>
>>>
>>> Thanks
>>> Sudi
>>>
>>>
>>> +/* { dg-do run } */
>>> +/* { dg-options "-O3" } */
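>>>
>>> (For reference, a rough sketch of the kind of round-trip check such
>>> a test performs -- not the actual testcase body:)
>>>
>>> #include <arm_neon.h>
>>>
>>> extern void abort (void);
>>>
>>> int
>>> main (void)
>>> {
>>>   uint8_t in[24], out[24];
>>>   for (int i = 0; i < 24; i++)
>>>     in[i] = i;
>>>   /* Load 3 x 8 bytes with the new intrinsic and store them back.  */
>>>   uint8x8x3_t v = vld1_u8_x3 (in);
>>>   vst1_u8_x3 (out, v);
>>>   /* The 24 bytes must round-trip unchanged.  */
>>>   for (int i = 0; i < 24; i++)
>>>     if (in[i] != out[i])
>>>       abort ();
>>>   return 0;
>>> }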
>>>
>>>
>>>
>>>> Ok for trunk?
>>>>
>>>> - Thanks and regards,
>>>> Sameera D.
>>>>
>>>> gcc/Changelog:
>>>>
>>>> 2018-04-10 Sameera Deshpande <sameera.deshpande@linaro.org>
>>>>
>>>>
>>>> * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.
>>>> (st1x2): Likewise.
>>>> (st1x3): Likewise.
>>>> * config/aarch64/aarch64-simd.md
>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.
>>>> (aarch64_ld1_x3_<mode>): Likewise.
>>>> (aarch64_st1x2<VALLDIF:mode>): Likewise.
>>>> (aarch64_st1_x2_<mode>): Likewise.
>>>> (aarch64_st1x3<VALLDIF:mode>): Likewise.
>>>> (aarch64_st1_x3_<mode>): Likewise.
>>>> * config/aarch64/arm_neon.h (vld1_u8_x3): New function.
>>>> (vld1_s8_x3): Likewise.
>>>> (vld1_u16_x3): Likewise.
>>>> (vld1_s16_x3): Likewise.
>>>> (vld1_u32_x3): Likewise.
>>>> (vld1_s32_x3): Likewise.
>>>> (vld1_u64_x3): Likewise.
>>>> (vld1_s64_x3): Likewise.
>>>> (vld1_f16_x3): Likewise.
>>>> (vld1_f32_x3): Likewise.
>>>> (vld1_f64_x3): Likewise.
>>>> (vld1_p8_x3): Likewise.
>>>> (vld1_p16_x3): Likewise.
>>>> (vld1_p64_x3): Likewise.
>>>> (vld1q_u8_x3): Likewise.
>>>> (vld1q_s8_x3): Likewise.
>>>> (vld1q_u16_x3): Likewise.
>>>> (vld1q_s16_x3): Likewise.
>>>> (vld1q_u32_x3): Likewise.
>>>> (vld1q_s32_x3): Likewise.
>>>> (vld1q_u64_x3): Likewise.
>>>> (vld1q_s64_x3): Likewise.
>>>> (vld1q_f16_x3): Likewise.
>>>> (vld1q_f32_x3): Likewise.
>>>> (vld1q_f64_x3): Likewise.
>>>> (vld1q_p8_x3): Likewise.
>>>> (vld1q_p16_x3): Likewise.
>>>> (vld1q_p64_x3): Likewise.
>>>> (vst1_s64_x2): Likewise.
>>>> (vst1_u64_x2): Likewise.
>>>> (vst1_f64_x2): Likewise.
>>>> (vst1_s8_x2): Likewise.
>>>> (vst1_p8_x2): Likewise.
>>>> (vst1_s16_x2): Likewise.
>>>> (vst1_p16_x2): Likewise.
>>>> (vst1_s32_x2): Likewise.
>>>> (vst1_u8_x2): Likewise.
>>>> (vst1_u16_x2): Likewise.
>>>> (vst1_u32_x2): Likewise.
>>>> (vst1_f16_x2): Likewise.
>>>> (vst1_f32_x2): Likewise.
>>>> (vst1_p64_x2): Likewise.
>>>> (vst1q_s8_x2): Likewise.
>>>> (vst1q_p8_x2): Likewise.
>>>> (vst1q_s16_x2): Likewise.
>>>> (vst1q_p16_x2): Likewise.
>>>> (vst1q_s32_x2): Likewise.
>>>> (vst1q_s64_x2): Likewise.
>>>> (vst1q_u8_x2): Likewise.
>>>> (vst1q_u16_x2): Likewise.
>>>> (vst1q_u32_x2): Likewise.
>>>> (vst1q_u64_x2): Likewise.
>>>> (vst1q_f16_x2): Likewise.
>>>> (vst1q_f32_x2): Likewise.
>>>> (vst1q_f64_x2): Likewise.
>>>> (vst1q_p64_x2): Likewise.
>>>> (vst1_s64_x3): Likewise.
>>>> (vst1_u64_x3): Likewise.
>>>> (vst1_f64_x3): Likewise.
>>>> (vst1_s8_x3): Likewise.
>>>> (vst1_p8_x3): Likewise.
>>>> (vst1_s16_x3): Likewise.
>>>> (vst1_p16_x3): Likewise.
>>>> (vst1_s32_x3): Likewise.
>>>> (vst1_u8_x3): Likewise.
>>>> (vst1_u16_x3): Likewise.
>>>> (vst1_u32_x3): Likewise.
>>>> (vst1_f16_x3): Likewise.
>>>> (vst1_f32_x3): Likewise.
>>>> (vst1_p64_x3): Likewise.
>>>> (vst1q_s8_x3): Likewise.
>>>> (vst1q_p8_x3): Likewise.
>>>> (vst1q_s16_x3): Likewise.
>>>> (vst1q_p16_x3): Likewise.
>>>> (vst1q_s32_x3): Likewise.
>>>> (vst1q_s64_x3): Likewise.
>>>> (vst1q_u8_x3): Likewise.
>>>> (vst1q_u16_x3): Likewise.
>>>> (vst1q_u32_x3): Likewise.
>>>> (vst1q_u64_x3): Likewise.
>>>> (vst1q_f16_x3): Likewise.
>>>> (vst1q_f32_x3): Likewise.
>>>> (vst1q_f64_x3): Likewise.
>>>> (vst1q_p64_x3): Likewise.
>>>>
>>>> gcc/testsuite/Changelog:
>>>>
>>>> 2018-04-10 Sameera Deshpande <sameera.deshpande@linaro.org>
>>>>
>>>> * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for
>>>> vld1x3 intrinsics for aarch64_little_endian.
>>>> * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for
>>>> vst1x2 intrinsics for aarch64_little_endian.
>>>> * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for
>>>> vst1x3 intrinsics for aarch64_little_endian.
>>>>
>>>
>>
>>
>>
>
--
- Thanks and regards,
Sameera D.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f24..2fd072a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
BUILTIN_VALL_F16 (STORE1, st1, 0)
VAR1(STORE1P, st1, 0, v2di)
+ /* Implemented by aarch64_ld1x3<VALLDIF:mode>. */
+ BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+ /* Implemented by aarch64_st1x2<VALLDIF:mode>. */
+ BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+ /* Implemented by aarch64_st1x3<VALLDIF:mode>. */
+ BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
/* Implemented by fma<mode>4. */
BUILTIN_VHSDF (TERNOP, fma, 4)
VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..e197a67 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5047,6 +5047,70 @@
}
})
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+ [(match_operand:CI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "r")
+ (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ rtx mem = gen_rtx_MEM (CImode, operands[1]);
+ emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+ DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+ [(set (match_operand:CI 0 "register_operand" "=w")
+ (unspec:CI
+ [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+ (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+ "TARGET_SIMD"
+ "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+ [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+ [(match_operand:DI 0 "register_operand" "")
+ (match_operand:OI 1 "register_operand" "")
+ (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ rtx mem = gen_rtx_MEM (OImode, operands[0]);
+ emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+ DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+ [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+ (unspec:OI
+ [(match_operand:OI 1 "register_operand" "w")
+ (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+ "TARGET_SIMD"
+ "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+ [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+ [(match_operand:DI 0 "register_operand" "")
+ (match_operand:CI 1 "register_operand" "")
+ (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ rtx mem = gen_rtx_MEM (CImode, operands[0]);
+ emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+ DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+ [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
+ (unspec:CI
+ [(match_operand:CI 1 "register_operand" "w")
+ (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+ "TARGET_SIMD"
+ "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+ [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
(define_insn "*aarch64_mov<mode>"
[(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29a..6ac7099 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
return (uint64x1_t) {*a};
}
+/* vld1x3 */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+ uint8x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+ __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+ __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+ __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const int8_t *__a)
+{
+ int8x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+ __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+ __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+ __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+ uint16x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+ __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+ __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+ int16x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+ __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+ __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+ uint32x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+ __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
+ __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
+ __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const int32_t *__a)
+{
+ int32x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+ __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
+ __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
+ __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+ uint64x1x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+ __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+ __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+ int64x1x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+ __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+ __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+ return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+ float16x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+ __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
+ __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
+ __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+ float32x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+ __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
+ __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
+ __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+ float64x1x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+ __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+ __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+ __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+ poly8x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+ __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
+ __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
+ __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+ poly16x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
+ __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
+ __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+ poly64x1x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+ __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+ __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+ uint8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+ __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+ __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+ int8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+ __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+ __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+ uint16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+ __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+ __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+ int16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+ __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+ __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+ uint32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+ __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
+ __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
+ __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+ int32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+ __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
+ __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
+ __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+ uint64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+ __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+ __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+ int64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+ __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+ __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+ float16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+ __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
+ __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
+ __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+ float32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+ __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
+ __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
+ __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+ float64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+ __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
+ __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
+ __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+ poly8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+ poly16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
+ __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+ __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+ return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+ poly64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+ __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+ __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+ __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+ return __i;
+}
+
/* vld1q */
__extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
*__a = __aarch64_vget_lane_any (__b, __lane);
}
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ int64x2x2_t temp;
+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ uint64x2x2_t temp;
+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ float64x2x2_t temp;
+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ int8x16x2_t temp;
+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ poly8x16x2_t temp;
+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ int16x8x2_t temp;
+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ poly16x8x2_t temp;
+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ int32x4x2_t temp;
+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ uint8x16x2_t temp;
+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ uint16x8x2_t temp;
+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ uint32x4x2_t temp;
+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ float16x8x2_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+ __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ float32x4x2_t temp;
+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ poly64x2x2_t temp;
+ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+ (poly64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+ (poly64x2_t) temp.val[1], 1);
+ __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+ __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+ (poly64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+ (poly64x2_t) val.val[1], 1);
+ __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ int64x2x3_t temp;
+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+ temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ uint64x2x3_t temp;
+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ float64x2x3_t temp;
+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ int8x16x3_t temp;
+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+ temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ poly8x16x3_t temp;
+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ int16x8x3_t temp;
+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+ temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ poly16x8x3_t temp;
+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ int32x4x3_t temp;
+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+ temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+ temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ uint8x16x3_t temp;
+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ uint16x8x3_t temp;
+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ uint32x4x3_t temp;
+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ float16x8x3_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ float32x4x3_t temp;
+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ poly64x2x3_t temp;
+ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) temp.val[2], 2);
+ __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+ (poly64x2_t) val.val[2], 2);
+ __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
/* vstn */
__extension__ extern __inline void
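
To make the new API concrete, here is a minimal usage sketch (illustrative
only, not part of the patch); with the patch applied it should compile on
aarch64 and is expected to map to single three-register LD1/ST1
instructions:

#include <arm_neon.h>

/* Illustrative: copy 48 bytes with one x3 load and one x3 store.
   Assumes src and dst each point to at least 48 valid bytes.  */
void
copy48 (uint8_t *dst, uint8_t *src)
{
  uint8x16x3_t v = vld1q_u8_x3 (src);  /* expected: ld1 {v0.16b - v2.16b} */
  vst1q_u8_x3 (dst, v);                /* expected: st1 {v0.16b - v2.16b} */
}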
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 0000000..c37c72c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* These intrinsics are not yet implemented for arm, and the tests fail
+   at execution on big-endian, so restrict them to aarch64 little-endian.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX) \
+int __attribute__ ((noinline)) \
+test_vld##SUFFIX##_x3 () \
+{ \
+ BASE##_t data[ELTS * 3]; \
+ BASE##_t temp[ELTS * 3]; \
+ BASE##x##ELTS##x##3##_t vectors; \
+ int i, j; \
+ for (i = 0; i < ELTS * 3; i++) \
+ data[i] = (BASE##_t) 3 * i; \
+ asm volatile ("" : : : "memory"); \
+ vectors = vld1##SUFFIX##_x3 (data); \
+ vst1##SUFFIX (temp, vectors.val[0]); \
+ vst1##SUFFIX (&temp[ELTS], vectors.val[1]); \
+ vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]); \
+ asm volatile ("" : : : "memory"); \
+ for (j = 0; j < ELTS * 3; j++) \
+ if (temp[j] != data[j]) \
+ return 1; \
+ return 0; \
+}
+
+#define VARIANTS_1(VARIANT) \
+VARIANT (uint8, 8, _u8) \
+VARIANT (uint16, 4, _u16) \
+VARIANT (uint32, 2, _u32) \
+VARIANT (uint64, 1, _u64) \
+VARIANT (int8, 8, _s8) \
+VARIANT (int16, 4, _s16) \
+VARIANT (int32, 2, _s32) \
+VARIANT (int64, 1, _s64) \
+VARIANT (poly8, 8, _p8) \
+VARIANT (poly16, 4, _p16) \
+VARIANT (float16, 4, _f16) \
+VARIANT (float32, 2, _f32) \
+VARIANT (uint8, 16, q_u8) \
+VARIANT (uint16, 8, q_u16) \
+VARIANT (uint32, 4, q_u32) \
+VARIANT (uint64, 2, q_u64) \
+VARIANT (int8, 16, q_s8) \
+VARIANT (int16, 8, q_s16) \
+VARIANT (int32, 4, q_s32) \
+VARIANT (int64, 2, q_s64) \
+VARIANT (poly8, 16, q_p8) \
+VARIANT (poly16, 8, q_p16) \
+VARIANT (float16, 8, q_f16) \
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \
+VARIANT (float64, 1, _f64) \
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of the vld1_*_x3 and vld1q_*_x3 intrinsics.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX) \
+ if (test_vld##SUFFIX##_x3 () != 0) \
+ abort ();
+
+int
+main (int argc, char **argv)
+{
+ VARIANTS (CHECK)
+
+ return 0;
+}
+
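
For readers untangling the macros, TESTMETH (uint32, 2, _u32) above expands
to roughly the following (hand-expanded, so treat it as a sketch):

int __attribute__ ((noinline))
test_vld_u32_x3 ()
{
  uint32_t data[2 * 3];
  uint32_t temp[2 * 3];
  uint32x2x3_t vectors;
  int i, j;
  for (i = 0; i < 2 * 3; i++)
    data[i] = (uint32_t) 3 * i;
  /* Compiler barrier: stop the initialising stores and the checking
     loads from being folded away around the intrinsics under test.  */
  asm volatile ("" : : : "memory");
  vectors = vld1_u32_x3 (data);
  vst1_u32 (temp, vectors.val[0]);
  vst1_u32 (&temp[2], vectors.val[1]);
  vst1_u32 (&temp[2 * 2], vectors.val[2]);
  asm volatile ("" : : : "memory");
  for (j = 0; j < 2 * 3; j++)
    if (temp[j] != data[j])
      return 1;
  return 0;
}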
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 0000000..3b6797c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,80 @@
+/* These intrinsics are not yet implemented for arm, and the tests fail
+   at execution on big-endian, so restrict them to aarch64 little-endian.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX) \
+int __attribute__ ((noinline)) \
+test_vst1##SUFFIX##_x2 () \
+{ \
+ BASE##_t data[ELTS * 2]; \
+ BASE##_t temp[ELTS * 2]; \
+ BASE##x##ELTS##x##2##_t vectors; \
+ int i, j; \
+ for (i = 0; i < ELTS * 2; i++) \
+ data[i] = (BASE##_t) 2 * i; \
+ asm volatile ("" : : : "memory"); \
+ vectors.val[0] = vld1##SUFFIX (data); \
+ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \
+ vst1##SUFFIX##_x2 (temp, vectors); \
+ asm volatile ("" : : : "memory"); \
+ for (j = 0; j < ELTS * 2; j++) \
+ if (temp[j] != data[j]) \
+ return 1; \
+ return 0; \
+}
+
+#define VARIANTS_1(VARIANT) \
+VARIANT (uint8, 8, _u8) \
+VARIANT (uint16, 4, _u16) \
+VARIANT (uint32, 2, _u32) \
+VARIANT (uint64, 1, _u64) \
+VARIANT (int8, 8, _s8) \
+VARIANT (int16, 4, _s16) \
+VARIANT (int32, 2, _s32) \
+VARIANT (int64, 1, _s64) \
+VARIANT (poly8, 8, _p8) \
+VARIANT (poly16, 4, _p16) \
+VARIANT (float16, 4, _f16) \
+VARIANT (float32, 2, _f32) \
+VARIANT (uint8, 16, q_u8) \
+VARIANT (uint16, 8, q_u16) \
+VARIANT (uint32, 4, q_u32) \
+VARIANT (uint64, 2, q_u64) \
+VARIANT (int8, 16, q_s8) \
+VARIANT (int16, 8, q_s16) \
+VARIANT (int32, 4, q_s32) \
+VARIANT (int64, 2, q_s64) \
+VARIANT (poly8, 16, q_p8) \
+VARIANT (poly16, 8, q_p16) \
+VARIANT (float16, 8, q_f16) \
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \
+VARIANT (float64, 1, _f64) \
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of the vst1_*_x2 and vst1q_*_x2 intrinsics.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX) \
+ if (test_vst1##SUFFIX##_x2 () != 0) \
+ abort ();
+
+int
+main (int argc, char **argv)
+{
+ VARIANTS (CHECK)
+
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 0000000..1709115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,81 @@
+/* These intrinsics are not yet implemented for arm, and the tests fail
+   at execution on big-endian, so restrict them to aarch64 little-endian.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX) \
+int __attribute__ ((noinline)) \
+test_vst1##SUFFIX##_x3 () \
+{ \
+ BASE##_t data[ELTS * 3]; \
+ BASE##_t temp[ELTS * 3]; \
+ BASE##x##ELTS##x##3##_t vectors; \
+ int i, j; \
+ for (i = 0; i < ELTS * 3; i++) \
+ data[i] = (BASE##_t) 3 * i; \
+ asm volatile ("" : : : "memory"); \
+ vectors.val[0] = vld1##SUFFIX (data); \
+ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \
+ vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]); \
+ vst1##SUFFIX##_x3 (temp, vectors); \
+ asm volatile ("" : : : "memory"); \
+ for (j = 0; j < ELTS * 3; j++) \
+ if (temp[j] != data[j]) \
+ return 1; \
+ return 0; \
+}
+
+#define VARIANTS_1(VARIANT) \
+VARIANT (uint8, 8, _u8) \
+VARIANT (uint16, 4, _u16) \
+VARIANT (uint32, 2, _u32) \
+VARIANT (uint64, 1, _u64) \
+VARIANT (int8, 8, _s8) \
+VARIANT (int16, 4, _s16) \
+VARIANT (int32, 2, _s32) \
+VARIANT (int64, 1, _s64) \
+VARIANT (poly8, 8, _p8) \
+VARIANT (poly16, 4, _p16) \
+VARIANT (float16, 4, _f16) \
+VARIANT (float32, 2, _f32) \
+VARIANT (uint8, 16, q_u8) \
+VARIANT (uint16, 8, q_u16) \
+VARIANT (uint32, 4, q_u32) \
+VARIANT (uint64, 2, q_u64) \
+VARIANT (int8, 16, q_s8) \
+VARIANT (int16, 8, q_s16) \
+VARIANT (int32, 4, q_s32) \
+VARIANT (int64, 2, q_s64) \
+VARIANT (poly8, 16, q_p8) \
+VARIANT (poly16, 8, q_p16) \
+VARIANT (float16, 8, q_f16) \
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \
+VARIANT (float64, 1, _f64) \
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of the vst1_*_x3 and vst1q_*_x3 intrinsics.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX) \
+ if (test_vst1##SUFFIX##_x3 () != 0) \
+ abort ();
+
+int
+main (int argc, char **argv)
+{
+ VARIANTS (CHECK)
+
+ return 0;
+}
+
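
The D-register forms work the same way from the caller's side; a minimal
sketch (illustrative only, not part of the patch):

/* Illustrative: store three 2-lane float vectors to six consecutive
   floats at out.  */
void
store3 (float32_t *out, float32x2_t a, float32x2_t b, float32x2_t c)
{
  float32x2x3_t v;
  v.val[0] = a;
  v.val[1] = b;
  v.val[2] = c;
  vst1_f32_x3 (out, v);  /* expected: st1 {v0.2s - v2.2s}, [x0] */
}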