This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[AARCH64][PATCH 2/3] Implementing vmulx_lane NEON intrinsic variants


Implementing vmulx_* and vmulx_lane* NEON intrinsics

Hi all,

This series of patches focuses on the different vmulx_ and vmulx_lane NEON
intrinsic variants. All of the existing inline assembly block implementations
are replaced with newly defined __builtin functions, and the missing intrinsics
are implemented with __builtins as well.

The rationale for the change from assembly to __builtin is that the compiler
would be able to perform more optimisations, such as instruction scheduling. A
new named md pattern was added for the new fmulx __builtin.

Most vmulx_lane variants have been implemented as a combination of a vdup
followed by a vmulx_, rather than as separate __builtins.  The remaining
vmulx_lane intrinsics (vmulx(s|d)_lane*) were implemented using
__aarch64_vget_lane_any () and an appropriate vmulx. Four new nameless md
patterns were added to replace all the different types of RTL generated from the
combination of these intrinsics during the combine pass.

The rationale for this change is that in this way we would be able to optimise
away all uses of a dup followed by a fmulx to the appropriate fmulx lane variant
instruction.

New test cases were added for all the implemented intrinsics. Also new tests
were added for the proper error reporting of out-of-bounds accesses to _lane
intrinsics.

Tested on targets aarch64-none-elf and aarch64_be-none-elf.

Dependencies: patch 2/3 depends on patch 1/3, and patch 3/3 depends on patch
2/3.

---

In this patch from the series, all vmulx_lane variants have been implemented as
a vdup followed by a vmulx. Existing implementations of intrinsics were
refactored to use this new approach.

Several new nameless md patterns are added that will enable the combine pass to
pick up the dup/fmulx combination and replace it with a proper fmulx[lane]
instruction.

In addition, test cases for all new intrinsics were added. Tested on targets
aarch64-none-elf and aarch64_be-none-elf.

gcc/

2015-XX-XX  Bilyan Borisov  <bilyan.borisov@arm.com>

	* config/aarch64/arm_neon.h (vmulx_lane_f32): New.
	(vmulx_lane_f64): New.
	(vmulxq_lane_f32): Refactored & moved.
	(vmulxq_lane_f64): Refactored & moved.
	(vmulx_laneq_f32): New.
	(vmulx_laneq_f64): New.
	(vmulxq_laneq_f32): New.
	(vmulxq_laneq_f64): New.
	(vmulxs_lane_f32): New.
	(vmulxs_laneq_f32): New.
	(vmulxd_lane_f64): New.
	(vmulxd_laneq_f64): New.
	* config/aarch64/aarch64-simd.md (*aarch64_combine_dupfmulx1<mode>,
	VDQSF): New pattern.
	(*aarch64_combine_dupfmulx2<mode>, VDQF): New pattern.
	(*aarch64_combine_dupfmulx3): New pattern.
	(*aarch64_combine_vgetfmulx1<mode>, VDQF_DF): New pattern.

gcc/testsuite/

2015-XX-XX  Bilyan Borisov  <bilyan.borisov@arm.com>

	* gcc.target/aarch64/simd/vmulx_lane_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulx_lane_f64_1.c: New.
	* gcc.target/aarch64/simd/vmulx_laneq_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: New.
	* gcc.target/aarch64/simd/vmulxq_lane_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulxq_lane_f64_1.c: New.
	* gcc.target/aarch64/simd/vmulxq_laneq_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulxq_laneq_f64_1.c: New.
	* gcc.target/aarch64/simd/vmulxs_lane_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c: New.
	* gcc.target/aarch64/simd/vmulxd_lane_f64_1.c: New.
	* gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c: New.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e7e8888bbd158d21691791a8d7db8a2616062e50..8d6873a45ad0cdef42f7c632bca38096b9de1787 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2822,6 +2822,79 @@
  [(set_attr "type" "neon_fp_mul_<Vetype>")]
 )
 
+;; fmulxq_lane_f32, and fmulx_laneq_f32
+
+(define_insn "*aarch64_combine_dupfmulx1<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "w")
+	  (vec_duplicate:VDQSF
+	   (vec_select:<VEL>
+	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
+	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
+	 UNSPEC_FMULX))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
+					  INTVAL (operands[3])));
+    return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_fp_mul_<Vetype>_scalar<q>")]
+)
+
+;; fmulxq_laneq_f32, fmulxq_laneq_f64, fmulx_lane_f32
+
+(define_insn "*aarch64_combine_dupfmulx2<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+	(unspec:VDQF
+	 [(match_operand:VDQF 1 "register_operand" "w")
+	  (vec_duplicate:VDQF
+	   (vec_select:<VEL>
+	    (match_operand:VDQF 2 "register_operand" "w")
+	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
+	 UNSPEC_FMULX))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
+)
+
+;; fmulxq_lane_f64
+
+(define_insn "*aarch64_combine_dupfmulx3"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+	(unspec:V2DF
+	 [(match_operand:V2DF 1 "register_operand" "w")
+	  (vec_duplicate:V2DF
+	    (match_operand:DF 2 "register_operand" "w"))]
+	 UNSPEC_FMULX))]
+  "TARGET_SIMD"
+  {
+    return "fmulx\t%0.2d, %1.2d, %2.d[0]";
+  }
+  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+)
+
+;; fmulxs_lane_f32, fmulxs_laneq_f32, fmulxd_lane_f64 ==  fmulx_lane_f64,
+;; fmulxd_laneq_f64 == fmulx_laneq_f64
+
+(define_insn "*aarch64_combine_vgetfmulx1<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(unspec:<VEL>
+	 [(match_operand:<VEL> 1 "register_operand" "w")
+	  (vec_select:<VEL>
+	   (match_operand:VDQF_DF 2 "register_operand" "w")
+	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
+	 UNSPEC_FMULX))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    return "fmulx\t%<Vetype>0, %<Vetype>1, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "fmul<Vetype>")]
+)
 ;; <su>q<addsub>
 
 (define_insn "aarch64_<su_optab><optab><mode>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 4a3ef455b0945ed7e77fb3e78621d5010cd4c094..0425630faeca0a9196d6232b53a8fea7377b1ac6 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8509,32 +8509,6 @@ vmulq_n_u32 (uint32x4_t a, uint32_t b)
   return result;
 }
 
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -17748,6 +17722,78 @@ vmulxd_f64 (float64_t __a, float64_t __b)
   return __builtin_aarch64_fmulxdf (__a, __b);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
+{
+  return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
+{
+  return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
+{
+  return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
+{
+  return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
+{
+  return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
+{
+  return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
+{
+  return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
+{
+  return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
+{
+  return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
+{
+  return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
+{
+  return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
+{
+  return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
+}
+
 /* vpmax  */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f80678b2df32cff3237fb98354bee5754bf88f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f32_1.c
@@ -0,0 +1,70 @@
+/* Test the vmulx_lane_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_lane0_f32 (float32x2_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulx_lane_f32 (vec1_1, vec1_2, 0);
+}
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_lane1_f32 (float32x2_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulx_lane_f32 (vec1_1, vec1_2, 1);
+}
+
+void
+test_case (float32_t v1[2], float32_t v2[2], float32_t e1[2], float32_t e2[2])
+{
+  int i;
+  float32x2_t vec1_1 = vld1_f32 (v1);
+  float32x2_t vec1_2 = vld1_f32 (v2);
+
+
+  float32x2_t actual1 = test_vmulx_lane0_f32 (vec1_1, vec1_2);
+  float32_t actual1_1[2];
+  vst1_f32 (actual1_1, actual1);
+
+  for (i = 0; i < 2; ++i)
+    if (actual1_1[i] != e1[i])
+      abort ();
+
+  float32x2_t actual2 = test_vmulx_lane1_f32 (vec1_1, vec1_2);
+  float32_t actual2_1[2];
+  vst1_f32 (actual2_1, actual2);
+
+  for (i = 0; i < 2; ++i)
+    if (actual2_1[i] != e2[i])
+      abort ();
+}
+
+int
+main (void)
+{
+  float32_t v1 = 3.14159265359;
+  float32_t v2 = 1.383894;
+  float32_t v3 = -2.71828;
+  float32_t v4 = -3.4891931;
+
+  float32_t v1_1[] = {v1, v2};
+  float32_t v1_2[] = {v3, v4};
+  float32_t e1_1[] = {v1 * v3, v2 * v3};
+  float32_t e1_2[] = {v1 * v4, v2 * v4};
+  test_case (v1_1, v1_2, e1_1, e1_2);
+
+  float32_t v2_1[] = {0, -0.0};
+  float32_t v2_2[] = {__builtin_huge_valf (), -__builtin_huge_valf ()};
+  float32_t e2_1[] = {2.0, -2.0};
+  float32_t e2_2[] = {-2.0, 2.0};
+  test_case (v2_1, v2_2, e2_1, e2_2);
+
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ebdb963f60659843e505f57a2916a5a88f23ec3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_lane_f64_1.c
@@ -0,0 +1,62 @@
+/* Test the vmulx_lane_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float64x1_t __attribute__ ((noinline))
+test_vmulx_lane_f64 (float64x1_t vec1_1, float64x1_t vec1_2)
+{
+  return vmulx_lane_f64 (vec1_1, vec1_2, 0);
+}
+
+void
+test_case (float64_t v1[], float64_t v2[], float64_t e[])
+{
+  float64x1_t vec1_1 = vld1_f64 (v1);
+  float64x1_t vec1_2 = vld1_f64 (v2);
+  float64x1_t expected1 = vld1_f64 (e);
+
+  float64x1_t actual1 = test_vmulx_lane_f64 (vec1_1, vec1_2);
+  float64_t actual[1];
+  vst1_f64 (actual, actual1);
+  if (actual[0] != e[0])
+    abort ();
+}
+int
+main (void)
+{
+  float64_t v1 = 3.14159265359;
+  float64_t v2 = -2.71828;
+
+  float64_t v1_1[] = {v1};
+  float64_t v1_2[] =  {v2};
+  float64_t e1[] = {v1 * v2};
+  test_case (v1_1, v1_2, e1);
+
+  float64_t v2_1[] = {0};
+  float64_t v2_2[] = {__builtin_huge_val ()};
+  float64_t e2[] = {2.0};
+  test_case (v2_1, v2_2, e2);
+
+  float64_t v4_1[] = {0};
+  float64_t v4_2[] = {-__builtin_huge_val ()};
+  float64_t e4[] = {-2.0};
+  test_case (v4_1, v4_2, e4);
+
+  float64_t v5_1[] = {-0.0};
+  float64_t v5_2[] = {__builtin_huge_val ()};
+  float64_t e5[] = {-2.0};
+  test_case (v5_1, v5_2, e5);
+
+  float64_t v6_1[] = {-0.0};
+  float64_t v6_2[] = {-__builtin_huge_val ()};
+  float64_t e6[] = {2.0};
+  test_case (v6_1, v6_2, e6);
+
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e968b7c277155c20721c45f074b4bfe02431d23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f32_1.c
@@ -0,0 +1,111 @@
+/* Test the vmulx_laneq_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_laneq_f32_lane0 (float32x2_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulx_laneq_f32 (vec1_1, vec1_2, 0);
+}
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_laneq_f32_lane1 (float32x2_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulx_laneq_f32 (vec1_1, vec1_2, 1);
+}
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_laneq_f32_lane2 (float32x2_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulx_laneq_f32 (vec1_1, vec1_2, 2);
+}
+
+float32x2_t __attribute__ ((noinline))
+test_vmulx_laneq_f32_lane3 (float32x2_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulx_laneq_f32 (vec1_1, vec1_2, 3);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP0, EXP1, EXP2, EXP3, I)		\
+  void set_and_test_case##I ()						\
+  {									\
+    float32_t vec1_data[] = V1_D;					\
+    float32x2_t vec1 = vld1_f32 (vec1_data);				\
+    float32_t vec2_data[] =  V2_D;					\
+    float32x4_t vec2 = vld1q_f32 (vec2_data);				\
+									\
+    float32_t expected_lane0[] = EXP0;					\
+    float32_t expected_lane1[] = EXP1;					\
+    float32_t expected_lane2[] = EXP2;					\
+    float32_t expected_lane3[] = EXP3;					\
+									\
+    float32x2_t actual_lane0_v =					\
+      test_vmulx_laneq_f32_lane0 (vec1, vec2);				\
+    float32_t actual_lane0[2];						\
+    vst1_f32 (actual_lane0, actual_lane0_v);				\
+    if (actual_lane0[0] != expected_lane0[0]				\
+	|| actual_lane0[1] != expected_lane0[1])			\
+      abort ();								\
+									\
+    float32x2_t actual_lane1_v =					\
+      test_vmulx_laneq_f32_lane1 (vec1, vec2);				\
+    float32_t actual_lane1[2];						\
+    vst1_f32 (actual_lane1, actual_lane1_v);				\
+    if (actual_lane1[0] != expected_lane1[0]				\
+	|| actual_lane1[1] != expected_lane1[1])			\
+      abort ();								\
+									\
+    float32x2_t actual_lane2_v =					\
+      test_vmulx_laneq_f32_lane2 (vec1, vec2);				\
+    float32_t actual_lane2[2];						\
+    vst1_f32 (actual_lane2, actual_lane2_v);				\
+    if (actual_lane2[0] != expected_lane2[0]				\
+	|| actual_lane2[1] != expected_lane2[1])			\
+      abort ();								\
+									\
+    float32x2_t actual_lane3_v =					\
+      test_vmulx_laneq_f32_lane3 (vec1, vec2);				\
+    float32_t actual_lane3[2];						\
+    vst1_f32 (actual_lane3, actual_lane3_v);				\
+    if (actual_lane3[0] != expected_lane3[0]				\
+	|| actual_lane3[1] != expected_lane3[1])			\
+      abort ();								\
+									\
+  }									\
+
+float32_t v1 = 3.14159265359;
+float32_t v2 = 1.383894;
+float32_t v3 = -2.71828;
+float32_t v4 = -3.4891931;
+
+float32_t v5 = 0.0;
+float32_t v6 = -0.0;
+float32_t v7 = __builtin_huge_valf ();
+float32_t v8 = -__builtin_huge_valf ();
+
+SETUP_VEC (PASS_ARRAY (v1, v2), PASS_ARRAY (v1, v2, v3, v4),
+	   PASS_ARRAY (v1*v1, v1*v2), PASS_ARRAY (v1*v2, v2*v2),
+	   PASS_ARRAY (v1*v3, v2*v3), PASS_ARRAY (v1*v4, v2*v4), 1)
+
+SETUP_VEC (PASS_ARRAY (v5, v6), PASS_ARRAY (v5, v6, v7, v8),
+	   PASS_ARRAY (0.0, -0.0), PASS_ARRAY (-0.0, 0.0),
+	   PASS_ARRAY (2.0, -2.0), PASS_ARRAY (-2.0, 2.0), 2)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[2\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.2\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[3\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..db79d5355bc925098555788c0dd09c99029576c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_laneq_f64_1.c
@@ -0,0 +1,76 @@
+/* Test the vmulx_laneq_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float64x1_t __attribute__ ((noinline))
+test_vmulx_laneq_f64_lane0 (float64x1_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulx_laneq_f64 (vec1_1, vec1_2, 0);
+}
+
+float64x1_t __attribute__ ((noinline))
+test_vmulx_laneq_f64_lane1 (float64x1_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulx_laneq_f64 (vec1_1, vec1_2, 1);
+}
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP1, EXP2, I)				\
+  void set_and_test_case##I ()						\
+  {									\
+    float64_t vec1_data[] = V1_D;					\
+    float64x1_t vec1 = vld1_f64 (vec1_data);				\
+    float64_t vec2_data[] =  V2_D;					\
+    float64x2_t vec2 = vld1q_f64 (vec2_data);				\
+    float64_t expected_lane0[] = EXP1;					\
+    float64_t expected_lane1[] = EXP2;					\
+									\
+    float64x1_t actual_lane0_v =					\
+      test_vmulx_laneq_f64_lane0 (vec1, vec2);				\
+    float64_t actual_lane0[1];						\
+    vst1_f64 (actual_lane0, actual_lane0_v);				\
+    if (actual_lane0[0] != expected_lane0[0])				\
+      abort ();								\
+									\
+    float64x1_t actual_lane1_v =					\
+      test_vmulx_laneq_f64_lane1 (vec1, vec2);				\
+    float64_t actual_lane1[1];						\
+    vst1_f64 (actual_lane1, actual_lane1_v);				\
+    if (actual_lane1[0] != expected_lane1[0])				\
+      abort ();								\
+  }									\
+
+float64_t v1 = 3.14159265359;
+float64_t v2 = 1.383894;
+float64_t v3 = -2.71828;
+
+float64_t v4 = 0.0;
+float64_t v5 = __builtin_huge_val ();
+float64_t v6 = -__builtin_huge_val ();
+
+float64_t v7 = -0.0;
+float64_t v8 = __builtin_huge_val ();
+float64_t v9 = -__builtin_huge_val ();
+
+SETUP_VEC (PASS_ARRAY (v1), PASS_ARRAY (v2, v3), PASS_ARRAY (v1*v2),
+	   PASS_ARRAY (v1*v3), 1)
+SETUP_VEC (PASS_ARRAY (v4), PASS_ARRAY (v5, v6), PASS_ARRAY (2.0),
+	   PASS_ARRAY (-2.0), 2)
+SETUP_VEC (PASS_ARRAY (v7), PASS_ARRAY (v8, v9), PASS_ARRAY (-2.0),
+	   PASS_ARRAY (2.0), 3)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[dD\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[dD\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_lane_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_lane_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b0bf180ef1ac1416f50baa355a095b59505cd5b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_lane_f64_1.c
@@ -0,0 +1,54 @@
+/* Test the vmulxd_lane_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float64_t __attribute__ ((noinline))
+test_vmulxd_lane_f64_lane0 (float64_t vec1_1, float64x1_t vec1_2)
+{
+  return vmulxd_lane_f64 (vec1_1, vec1_2, 0);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP1, I)					\
+  void set_and_test_case##I ()						\
+  {									\
+    float64_t vec1 = V1_D;						\
+    float64_t vec2_data[] =  V2_D;					\
+    float64x1_t vec2 = vld1_f64 (vec2_data);				\
+    float64_t expected_lane0 = EXP1;					\
+    float64_t actual_lane0 = test_vmulxd_lane_f64_lane0 (vec1, vec2);	\
+    if (actual_lane0 != expected_lane0)					\
+      abort ();								\
+  }									\
+
+float64_t v1 = 3.14159265359;
+float64_t v2 = 1.383894;
+
+float64_t v4 = 0.0;
+float64_t v5 = -0.0;
+float64_t v6 = __builtin_huge_val ();
+float64_t v7 = -__builtin_huge_val ();
+
+SETUP_VEC (v1, PASS_ARRAY (v2), v1*v2, 1)
+SETUP_VEC (v4, PASS_ARRAY (v6), 2.0, 2)
+SETUP_VEC (v4, PASS_ARRAY (v7), -2.0, 3)
+SETUP_VEC (v5, PASS_ARRAY (v6), -2.0, 4)
+SETUP_VEC (v5, PASS_ARRAY (v7), 2.0, 5)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  set_and_test_case4 ();
+  set_and_test_case5 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?(?:\[vV\]\[0-9\]+\.\[dD\]\\\[0\\\]|\[dD\]\[0-9\])\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3f8303c574ff40967c5b9ce5a152d70c4a11a9dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c
@@ -0,0 +1,62 @@
+/* Test the vmulxd_laneq_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float64_t __attribute__ ((noinline))
+test_vmulxd_laneq_f64_lane0 (float64_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulxd_laneq_f64 (vec1_1, vec1_2, 0);
+}
+
+float64_t __attribute__ ((noinline))
+test_vmulxd_laneq_f64_lane1 (float64_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulxd_laneq_f64 (vec1_1, vec1_2, 1);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP1, EXP2, I)				\
+  void set_and_test_case##I ()						\
+  {									\
+    float64_t vec1 = V1_D;						\
+    float64_t vec2_data[] = V2_D;					\
+    float64x2_t vec2 = vld1q_f64 (vec2_data);				\
+    float64_t expected_lane0 = EXP1;					\
+    float64_t expected_lane1 = EXP2;					\
+    float64_t actual_lane0 = test_vmulxd_laneq_f64_lane0 (vec1, vec2);	\
+    if (actual_lane0 != expected_lane0)					\
+      abort ();								\
+    float64_t actual_lane1 = test_vmulxd_laneq_f64_lane1 (vec1, vec2);	\
+    if (actual_lane1 != expected_lane1)					\
+      abort ();								\
+  }									\
+
+float64_t v1 = 3.14159265359;
+float64_t v2 = 1.383894;
+float64_t v3 = -2.71828;
+
+float64_t v4 = 0.0;
+float64_t v5 = -0.0;
+float64_t v6 = __builtin_huge_val ();
+float64_t v7 = -__builtin_huge_val ();
+
+SETUP_VEC (v1, PASS_ARRAY (v2, v3), v1*v2, v1*v3, 1)
+SETUP_VEC (v4, PASS_ARRAY (v6, v7), 2.0, -2.0, 2)
+SETUP_VEC (v5, PASS_ARRAY (v6, v7), -2.0, 2.0, 3)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[dD\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[dD\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b5f586019293f6be0b2e6501370883b919bc8ba4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f32_1.c
@@ -0,0 +1,79 @@
+/* Test the vmulxq_lane_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_lane_f32_lane0 (float32x4_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulxq_lane_f32 (vec1_1, vec1_2, 0);
+}
+
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_lane_f32_lane1 (float32x4_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulxq_lane_f32 (vec1_1, vec1_2, 1);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP0, EXP1, I)				\
+  void set_and_test_case##I ()						\
+  {									\
+    int i;								\
+    float32_t vec1_data[] = V1_D;					\
+    float32x4_t vec1 = vld1q_f32 (vec1_data);				\
+    float32_t vec2_data[] =  V2_D;					\
+    float32x2_t vec2 = vld1_f32 (vec2_data);				\
+									\
+    float32_t expected_lane0[] = EXP0;					\
+    float32_t expected_lane1[] = EXP1;					\
+									\
+    float32x4_t actual_lane0_v =					\
+      test_vmulxq_lane_f32_lane0 (vec1, vec2);				\
+    float32_t actual_lane0[4];						\
+    vst1q_f32 (actual_lane0, actual_lane0_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane0[i] != expected_lane0[i])				\
+	abort ();							\
+									\
+    float32x4_t actual_lane1_v =					\
+      test_vmulxq_lane_f32_lane1 (vec1, vec2);				\
+    float32_t actual_lane1[4];						\
+    vst1q_f32 (actual_lane1, actual_lane1_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane1[i] != expected_lane1[i])				\
+	abort ();							\
+  }									\
+
+float32_t v1 = 3.14159265359;
+float32_t v2 = 1.383894;
+float32_t v3 = -2.71828;
+float32_t v4 = -3.4891931;
+
+float32_t v5 = 0.0;
+float32_t v6 = -0.0;
+float32_t v7 = __builtin_huge_valf ();
+float32_t v8 = -__builtin_huge_valf ();
+
+SETUP_VEC (PASS_ARRAY (v1, v2, v3, v4), PASS_ARRAY (v1, v2),
+	   PASS_ARRAY (v1*v1, v2*v1, v3*v1, v4*v1),
+	   PASS_ARRAY (v1*v2, v2*v2, v3*v2, v4*v2), 1)
+
+SETUP_VEC (PASS_ARRAY (v5, v6, v7, v8), PASS_ARRAY (v5, v6),
+	   PASS_ARRAY (0.0, -0.0, 2.0, -2.0),
+	   PASS_ARRAY (-0.0, 0.0, -2.0, 2.0), 2)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e535dce7b75aa7998c937d8568b7674412855afc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_lane_f64_1.c
@@ -0,0 +1,61 @@
+/* Test the vmulxq_lane_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+float64x2_t __attribute__ ((noinline))
+test_vmulxq_lane_f64_lane0 (float64x2_t vec1_1, float64x1_t vec1_2)
+{
+  return vmulxq_lane_f64 (vec1_1, vec1_2, 0);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+#define SETUP_VEC(V1_D, V2_D, EXP0, I)					\
+  void set_and_test_case##I ()						\
+  {									\
+    int i;								\
+    float64_t vec1_data[] = V1_D;					\
+    float64x2_t vec1 = vld1q_f64 (vec1_data);				\
+    float64_t vec2_data[] =  V2_D;					\
+    float64x1_t vec2 = vld1_f64 (vec2_data);				\
+									\
+    float64_t expected_lane0[] = EXP0;					\
+    float64x2_t actual_lane0_v						\
+      = test_vmulxq_lane_f64_lane0 (vec1, vec2);			\
+    float64_t actual_lane0[2];						\
+    vst1q_f64 (actual_lane0, actual_lane0_v);					\
+    for (i = 0; i < 1; ++i)						\
+      if (actual_lane0[i] != expected_lane0[i])				\
+	abort ();							\
+  }									\
+
+float64_t v1 = 3.14159265359;
+float64_t v2 = 1.383894;
+
+float64_t v3 = __builtin_huge_val ();
+float64_t v4 = -__builtin_huge_val ();
+
+float64_t v5 = 0.0;
+float64_t v6 = -0.0;
+
+
+SETUP_VEC (PASS_ARRAY (v1, v2), PASS_ARRAY (v1), PASS_ARRAY (v1*v1, v2*v1), 1)
+
+SETUP_VEC (PASS_ARRAY (v3, v4), PASS_ARRAY (v5), PASS_ARRAY (2.0, -2.0), 2)
+
+SETUP_VEC (PASS_ARRAY (v3, v4), PASS_ARRAY (v6), PASS_ARRAY (-2.0, 2.0), 3)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.\[dD\]\\\[0\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..264c0c2e6167a1e5d26d8516de20cab411b78d8d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f32_1.c
@@ -0,0 +1,118 @@
+/* Test the vmulxq_laneq_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+/* One noinline wrapper per lane index so that each call survives -O3
+   and the scan-assembler-times checks below see exactly one FMULX per
+   lane variant.  */
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_laneq_f32_lane0 (float32x4_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxq_laneq_f32 (vec1_1, vec1_2, 0);
+}
+
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_laneq_f32_lane1 (float32x4_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxq_laneq_f32 (vec1_1, vec1_2, 1);
+}
+
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_laneq_f32_lane2 (float32x4_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxq_laneq_f32 (vec1_1, vec1_2, 2);
+}
+
+float32x4_t __attribute__ ((noinline))
+test_vmulxq_laneq_f32_lane3 (float32x4_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxq_laneq_f32 (vec1_1, vec1_2, 3);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+/* Build one test case: load V1_D/V2_D, run all four lane variants and
+   compare every lane of each result against EXP0..EXP3.  */
+#define SETUP_VEC(V1_D, V2_D, EXP0, EXP1, EXP2, EXP3, I)		\
+  void set_and_test_case##I ()						\
+  {									\
+    int i;								\
+    float32_t vec1_data[] = V1_D;					\
+    float32x4_t vec1 = vld1q_f32 (vec1_data);				\
+    float32_t vec2_data[] =  V2_D;					\
+    float32x4_t vec2 = vld1q_f32 (vec2_data);				\
+									\
+    float32_t expected_lane0[] = EXP0;					\
+    float32_t expected_lane1[] = EXP1;					\
+    float32_t expected_lane2[] = EXP2;					\
+    float32_t expected_lane3[] = EXP3;					\
+									\
+    float32x4_t actual_lane0_v =					\
+      test_vmulxq_laneq_f32_lane0 (vec1, vec2);				\
+    float32_t actual_lane0[4];						\
+    vst1q_f32 (actual_lane0, actual_lane0_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane0[i] != expected_lane0[i])				\
+	abort ();							\
+									\
+    float32x4_t actual_lane1_v =					\
+      test_vmulxq_laneq_f32_lane1 (vec1, vec2);				\
+    float32_t actual_lane1[4];						\
+    vst1q_f32 (actual_lane1, actual_lane1_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane1[i] != expected_lane1[i])				\
+	abort ();							\
+									\
+    float32x4_t actual_lane2_v =					\
+      test_vmulxq_laneq_f32_lane2 (vec1, vec2);				\
+    float32_t actual_lane2[4];						\
+    vst1q_f32 (actual_lane2, actual_lane2_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane2[i] != expected_lane2[i])				\
+	abort ();							\
+									\
+    float32x4_t actual_lane3_v =					\
+      test_vmulxq_laneq_f32_lane3 (vec1, vec2);				\
+    float32_t actual_lane3[4];						\
+    vst1q_f32 (actual_lane3, actual_lane3_v);				\
+    for (i = 0; i < 4; ++i)						\
+      if (actual_lane3[i] != expected_lane3[i])				\
+	abort ();							\
+  }									\
+
+float32_t v1 = 3.14159265359;
+float32_t v2 = 1.383894;
+float32_t v3 = -2.71828;
+float32_t v4 = -3.4891931;
+
+/* Special inputs for FMULX: per case 2 below, +/-inf * +/-0.0 is
+   expected to give +/-2.0 rather than NaN.  */
+float32_t v5 = 0.0;
+float32_t v6 = -0.0;
+float32_t v7 = __builtin_huge_valf ();
+float32_t v8 = -__builtin_huge_valf ();
+
+/* inf * inf and -inf * inf, i.e. +inf and -inf at run time.  */
+float32_t spec = __builtin_huge_valf () * __builtin_huge_valf ();
+float32_t spec_n = -__builtin_huge_valf () * __builtin_huge_valf ();
+
+SETUP_VEC (PASS_ARRAY (v1, v2, v3, v4), PASS_ARRAY (v1, v2, v3, v4),
+	   PASS_ARRAY (v1*v1, v1*v2, v1*v3, v1*v4),
+	   PASS_ARRAY (v1*v2, v2*v2, v2*v3, v2*v4),
+	   PASS_ARRAY (v1*v3, v2*v3, v3*v3, v4*v3),
+	   PASS_ARRAY (v1*v4, v2*v4, v3*v4, v4*v4), 1)
+
+SETUP_VEC (PASS_ARRAY (v5, v6, v7, v8), PASS_ARRAY (v5, v6, v7, v8),
+	   PASS_ARRAY (0.0, -0.0, 2.0, -2.0),
+	   PASS_ARRAY (-0.0, 0.0, -2.0, 2.0),
+	   PASS_ARRAY (2.0, -2.0, spec, spec_n),
+	   PASS_ARRAY (-2.0, 2.0, spec_n, spec), 2)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[2\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.4\[sS\], ?\[vV\]\[0-9\]+\.\[sS\]\\\[3\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..14e9852b32da6a4609117c35bbc85f564f82c350
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_laneq_f64_1.c
@@ -0,0 +1,78 @@
+/* Test the vmulxq_laneq_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+/* One noinline wrapper per lane index so that each call survives -O3
+   and the scan-assembler-times checks below see exactly one FMULX per
+   lane variant.  */
+float64x2_t __attribute__ ((noinline))
+test_vmulxq_laneq_f64_lane0 (float64x2_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulxq_laneq_f64 (vec1_1, vec1_2, 0);
+}
+
+float64x2_t __attribute__ ((noinline))
+test_vmulxq_laneq_f64_lane1 (float64x2_t vec1_1, float64x2_t vec1_2)
+{
+  return vmulxq_laneq_f64 (vec1_1, vec1_2, 1);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+/* Build one test case: load V1_D/V2_D, run both lane variants and
+   compare every lane of each result against EXP0/EXP1.  */
+#define SETUP_VEC(V1_D, V2_D, EXP0, EXP1, I)				\
+  void set_and_test_case##I ()						\
+  {									\
+    int i;								\
+    float64_t vec1_data[] = V1_D;					\
+    float64x2_t vec1 = vld1q_f64 (vec1_data);				\
+    float64_t vec2_data[] =  V2_D;					\
+    float64x2_t vec2 = vld1q_f64 (vec2_data);				\
+									\
+    float64_t expected_lane0[] = EXP0;					\
+    float64_t expected_lane1[] = EXP1;					\
+									\
+    float64x2_t actual_lane0_v =					\
+      test_vmulxq_laneq_f64_lane0 (vec1, vec2);				\
+    float64_t actual_lane0[2];						\
+    vst1q_f64 (actual_lane0, actual_lane0_v);				\
+    for (i = 0; i < 2; ++i)						\
+      if (actual_lane0[i] != expected_lane0[i])				\
+	abort ();							\
+									\
+    float64x2_t actual_lane1_v =					\
+      test_vmulxq_laneq_f64_lane1 (vec1, vec2);				\
+    float64_t actual_lane1[2];						\
+    vst1q_f64 (actual_lane1, actual_lane1_v);				\
+    for (i = 0; i < 2; ++i)						\
+      if (actual_lane1[i] != expected_lane1[i])				\
+	abort ();							\
+  }									\
+
+float64_t v1 = 3.14159265359;
+float64_t v2 = 1.383894;
+
+/* Special inputs for FMULX: per case 2 below, +/-inf * +/-0.0 is
+   expected to give +/-2.0 rather than NaN.  */
+float64_t v3 = 0.0;
+float64_t v4 = -0.0;
+float64_t v5 = __builtin_huge_val ();
+float64_t v6 = -__builtin_huge_val ();
+
+SETUP_VEC (PASS_ARRAY (v1, v2), PASS_ARRAY (v1, v2), PASS_ARRAY (v1*v1, v2*v1),
+	   PASS_ARRAY (v1*v2, v2*v2), 1)
+
+SETUP_VEC (PASS_ARRAY (v3, v4), PASS_ARRAY (v5, v6), PASS_ARRAY (2.0, -2.0),
+	   PASS_ARRAY (-2.0, 2.0), 2)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.\[dD\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.\[dD\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_lane_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_lane_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..124dcd8c4ec187b38ffb03606fad4121d9280451
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_lane_f32_1.c
@@ -0,0 +1,61 @@
+/* Test the vmulxs_lane_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+/* One noinline wrapper per lane index so that each call survives -O3
+   and the scan-assembler-times checks below see exactly one scalar
+   FMULX per lane variant.  */
+float32_t __attribute__ ((noinline))
+test_vmulxs_lane_f32_lane0 (float32_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulxs_lane_f32 (vec1_1, vec1_2, 0);
+}
+
+float32_t __attribute__ ((noinline))
+test_vmulxs_lane_f32_lane1 (float32_t vec1_1, float32x2_t vec1_2)
+{
+  return vmulxs_lane_f32 (vec1_1, vec1_2, 1);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+/* Build one test case: multiply scalar V1_D by each lane of V2_D and
+   compare the scalar results against EXP1/EXP2.  */
+#define SETUP_VEC(V1_D, V2_D, EXP1, EXP2, I)				\
+  void set_and_test_case##I ()						\
+  {									\
+    float32_t vec1 = V1_D;						\
+    float32_t vec2_data[] =  V2_D;					\
+    float32x2_t vec2 = vld1_f32 (vec2_data);				\
+    float32_t expected_lane0 = EXP1;					\
+    float32_t expected_lane1 = EXP2;					\
+    float32_t actual_lane0 = test_vmulxs_lane_f32_lane0 (vec1, vec2);	\
+    if (actual_lane0 != expected_lane0)					\
+      abort ();								\
+    float32_t actual_lane1 = test_vmulxs_lane_f32_lane1 (vec1, vec2);	\
+    if (actual_lane1 != expected_lane1)					\
+      abort ();								\
+  }									\
+
+float32_t v1 = 3.14159265359;
+float32_t v2 = 1.383894;
+
+/* Special inputs for FMULX: per cases 2 and 3 below, +/-0.0 * +/-inf
+   is expected to give +/-2.0 rather than NaN.  */
+float32_t v4 = 0.0;
+float32_t v5 = -0.0;
+float32_t v6 = __builtin_huge_valf ();
+float32_t v7 = -__builtin_huge_valf ();
+
+SETUP_VEC (v1, PASS_ARRAY (v1, v2), v1*v1, v1*v2, 1)
+SETUP_VEC (v4, PASS_ARRAY (v6, v7), 2.0, -2.0, 2)
+SETUP_VEC (v5, PASS_ARRAY (v6, v7), -2.0, 2.0, 3)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..255f0968822ffee7f3429c5997b02e3fcfca68f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c
@@ -0,0 +1,85 @@
+/* Test the vmulxs_laneq_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+/* One noinline wrapper per lane index so that each call survives -O3
+   and the scan-assembler-times checks below see exactly one scalar
+   FMULX per lane variant.  */
+float32_t __attribute__ ((noinline))
+test_vmulxs_laneq_f32_lane0 (float32_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxs_laneq_f32 (vec1_1, vec1_2, 0);
+}
+
+float32_t __attribute__ ((noinline))
+test_vmulxs_laneq_f32_lane1 (float32_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxs_laneq_f32 (vec1_1, vec1_2, 1);
+}
+
+float32_t __attribute__ ((noinline))
+test_vmulxs_laneq_f32_lane2 (float32_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxs_laneq_f32 (vec1_1, vec1_2, 2);
+}
+
+float32_t __attribute__ ((noinline))
+test_vmulxs_laneq_f32_lane3 (float32_t vec1_1, float32x4_t vec1_2)
+{
+  return vmulxs_laneq_f32 (vec1_1, vec1_2, 3);
+}
+
+#define PASS_ARRAY(...) {__VA_ARGS__}
+
+/* Build one test case: multiply scalar V1_D by each lane of V2_D and
+   compare the scalar results against EXP1..EXP4.  */
+#define SETUP_VEC(V1_D, V2_D, EXP1, EXP2, EXP3, EXP4, I)		\
+  void set_and_test_case##I ()						\
+  {									\
+    float32_t vec1 = V1_D;						\
+    float32_t vec2_data[] =  V2_D;					\
+    float32x4_t vec2 = vld1q_f32 (vec2_data);				\
+    float32_t expected_lane0 = EXP1;					\
+    float32_t expected_lane1 = EXP2;					\
+    float32_t expected_lane2 = EXP3;					\
+    float32_t expected_lane3 = EXP4;					\
+    float32_t actual_lane0 = test_vmulxs_laneq_f32_lane0 (vec1, vec2);	\
+    if (actual_lane0 != expected_lane0)					\
+      abort ();								\
+    float32_t actual_lane1 = test_vmulxs_laneq_f32_lane1 (vec1, vec2);	\
+    if (actual_lane1 != expected_lane1)					\
+      abort ();								\
+    float32_t actual_lane2 = test_vmulxs_laneq_f32_lane2 (vec1, vec2);	\
+    if (actual_lane2 != expected_lane2)					\
+      abort ();								\
+    float32_t actual_lane3 = test_vmulxs_laneq_f32_lane3 (vec1, vec2);	\
+    if (actual_lane3 != expected_lane3)					\
+      abort ();								\
+  }									\
+
+float32_t v1 = 3.14159265359;
+float32_t v2 = 1.383894;
+float32_t v3 = -2.71828;
+float32_t v4 = -3.4891931;
+
+/* Special inputs for FMULX: per cases 2 and 3 below, +/-0.0 * +/-inf
+   is expected to give +/-2.0 rather than NaN.  */
+float32_t v5 = 0.0;
+float32_t v6 = -0.0;
+float32_t v7 = __builtin_huge_valf ();
+float32_t v8 = -__builtin_huge_valf ();
+
+SETUP_VEC (v1, PASS_ARRAY (v1, v2, v3, v4), v1*v1, v1*v2, v3*v1, v1*v4, 1)
+SETUP_VEC (v5, PASS_ARRAY (v5, v6, v7, v8), 0.0, -0.0, 2.0, -2.0, 2)
+SETUP_VEC (v6, PASS_ARRAY (v5, v6, v7, v8), -0.0, 0.0, -2.0, 2.0, 3)
+
+int
+main (void)
+{
+  set_and_test_case1 ();
+  set_and_test_case2 ();
+  set_and_test_case3 ();
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[0\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[1\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[2\\\]\n" 1 } } */
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[vV\]\[0-9\]+\.\[sS\]\\\[3\\\]\n" 1 } } */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]