This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Re: [AArch64, NEON] Improve vmulX intrinsics


As pointed out by Christophe in https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00778.html,

we need to rework the testcases so that they also work for the AArch32 target.

This patch fixes that issue.  One change:

1. The vmull_high_* intrinsics are only available on the AArch64 target; we take care of this through the __aarch64__ macro.

Tested on armeb-linux-gnueabi, arm-linux-gnueabi, aarch64-linux-gnu and aarch64_be-linux-gnu.  OK for the trunk?
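
For reference, here is a minimal standalone sketch (not part of the patch; the helper name widen_mul_high is purely illustrative) of the guard the new testcases rely on: the AArch64-only vmull_high_* intrinsics are only compiled when __aarch64__ is defined, so the same file still builds, as a no-op, for AArch32.

#include <arm_neon.h>

#ifdef __aarch64__
static int32x4_t
widen_mul_high (int16x8_t a, int16x8_t b)
{
  /* Widening multiply of the high halves: 16-bit lanes -> 32-bit lanes.  */
  return vmull_high_s16 (a, b);
}
#endif

int
main (void)
{
#ifdef __aarch64__
  int16x8_t a = vdupq_n_s16 (4);
  int16x8_t b = vdupq_n_s16 (0x88);
  int32x4_t res = widen_mul_high (a, b);
  /* Every lane of the result should be 4 * 0x88.  */
  return vgetq_lane_s32 (res, 0) == 4 * 0x88 ? 0 : 1;
#else
  /* On AArch32 the guarded code is compiled out; the program is a no-op.  */
  return 0;
#endif
}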

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 218582)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2014-12-11  Felix Yang  <felix.yang@huawei.com>
+            Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+
 2014-12-10  Felix Yang  <felix.yang@huawei.com>
 
 	* config/aarch64/aarch64-protos.h (aarch64_function_profiler): Remove
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
@@ -0,0 +1,115 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#ifdef __aarch64__
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad,
+                                        0xfe24, 0xfe9b, 0xff12, 0xff89 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28,
+                                        0xfffffab0, 0xfffffb38 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2,
+                                        0xfffffffffffff83b };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae,
+                                         0xa758, 0xa802, 0xa8ac, 0xa956 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7,
+                                         0xbaf8b2, 0xbaf96d };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8,
+                                         0xcbfffff5a4};
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce,
+                                         0x6798, 0x6732, 0x66cc, 0x6666 };
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high
+#define TEST_MSG "VMULL_HIGH"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)                      \
+  DECL_VARIABLE(vector1, T, W, N);              \
+  DECL_VARIABLE(vector2, T, W, N);
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)                 \
+  VECT_VAR(vector_res, T1, W1, N1) =                                    \
+    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
+                               VECT_VAR(vector2, T1, W, N));            \
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
+		    VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1)	                \
+  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)
+
+#define CHECK_VMULL_HIGH_RESULTS(test_name,comment)                     \
+  {                                                                     \
+    CHECK(test_name, int, 16, 8, PRIx16, expected, comment);            \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 16, 8, PRIx16,  expected, comment);          \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+    CHECK(test_name, poly, 16, 8, PRIx16, expected, comment);           \
+  }
+
+  DECL_VMUL(int, 8, 16);
+  DECL_VMUL(int, 16, 8);
+  DECL_VMUL(int, 32, 4);
+  DECL_VMUL(uint, 8, 16);
+  DECL_VMUL(uint, 16, 8);
+  DECL_VMUL(uint, 32, 4);
+  DECL_VMUL(poly, 8, 16);
+
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+  DECL_VARIABLE(vector_res, poly, 16, 8);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, q, int, s, 8, 16);
+  VLOAD(vector1, buffer, q, int, s, 16, 8);
+  VLOAD(vector1, buffer, q, int, s, 32, 4);
+  VLOAD(vector1, buffer, q, uint, u, 8, 16);
+  VLOAD(vector1, buffer, q, uint, u, 16, 8);
+  VLOAD(vector1, buffer, q, uint, u, 32, 4);
+  VLOAD(vector1, buffer, q, poly, p, 8, 16);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, q, int, s, 8, 16, 0x77);
+  VDUP(vector2, q, int, s, 16, 8, 0x88);
+  VDUP(vector2, q, int, s, 32, 4, 0x99);
+  VDUP(vector2, q, uint, u, 8, 16, 0xAA);
+  VDUP(vector2, q, uint, u, 16, 8, 0xBB);
+  VDUP(vector2, q, uint, u, 32, 4, 0xCC);
+  VDUP(vector2, q, poly, p, 8, 16, 0xAA);
+
+  /* Execute the tests.  */
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8);
+
+  CHECK_VMULL_HIGH_RESULTS (TEST_MSG, "");
+}
+#endif
+
+int main (void)
+{
+#ifdef __aarch64__
+  FNNAME (INSN_NAME) ();
+#endif
+
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
@@ -0,0 +1,137 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#ifdef __aarch64__
+
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000};
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 };
+
+#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ"
+void exec_vmull_high_lane (void)
+{
+  /* vector_res = vmull_high_lane(vector,vector2,lane), then store the result.  */
+#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L)                  \
+  VECT_VAR(vector_res, T1, W2, N2) =                                    \
+    vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1),              \
+                         VECT_VAR(vector2, T1, W, N2),                  \
+                         L);                                            \
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2))
+
+#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment)                \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32,  expected, comment);          \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+
+  /* With ARM RVCT, we need to declare variables before any executable
+     statement */
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector2, int, 16, 4);
+  DECL_VARIABLE(vector2, int, 32, 2);
+  DECL_VARIABLE(vector2, uint, 16, 4);
+  DECL_VARIABLE(vector2, uint, 32, 2);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize vector2.  */
+  VDUP(vector2, , int, s, 16, 4, 0x1000);
+  VDUP(vector2, , int, s, 32, 2, 0x1000);
+  VDUP(vector2, , uint, u, 16, 4, 0x1000);
+  VDUP(vector2, , uint, u, 32, 2, 0x1000);
+
+  /* Initialize vector.  */
+  VDUP(vector, q, int, s, 16, 8, 0x4);
+  VDUP(vector, q, int, s, 32, 4, 0x2);
+  VDUP(vector, q, uint, u, 16, 8, 0x4);
+  VDUP(vector, q, uint, u, 32, 4, 0x2);
+
+  /* Choose lane arbitrarily */
+  TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1);
+  TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1);
+
+  CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, "");
+}
+
+
+void exec_vmull_high_laneq (void)
+{
+  /* vector_res = vmull_high_laneq(vector,vector2,lane), then store the result.  */
+#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L)                 \
+  VECT_VAR(vector_res, T1, W2, N1) =                                    \
+    vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2),             \
+                         VECT_VAR(vector2, T1, W, N2),                  \
+                         L);                                            \
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, N1))
+
+#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment)               \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32,  expected, comment);          \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+
+  /* With ARM RVCT, we need to declare variables before any executable
+     statement */
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector2, int, 16, 8);
+  DECL_VARIABLE(vector2, int, 32, 4);
+  DECL_VARIABLE(vector2, uint, 16, 8);
+  DECL_VARIABLE(vector2, uint, 32, 4);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize vector2.  */
+  VDUP(vector2, q, int, s, 16, 8, 0x1000);
+  VDUP(vector2, q, int, s, 32, 4, 0x1000);
+  VDUP(vector2, q, uint, u, 16, 8, 0x1000);
+  VDUP(vector2, q, uint, u, 32, 4, 0x1000);
+
+  /* Initialize vector.  */
+  VDUP(vector, q, int, s, 16, 8, 0x4);
+  VDUP(vector, q, int, s, 32, 4, 0x2);
+  VDUP(vector, q, uint, u, 16, 8, 0x4);
+  VDUP(vector, q, uint, u, 32, 4, 0x2);
+
+  /* Choose lane arbitrarily */
+  TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1);
+  TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1);
+
+  CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, "");
+}
+#endif
+
+int main (void)
+{
+#ifdef __aarch64__
+  exec_vmull_high_lane();
+  exec_vmull_high_laneq();
+#endif
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
@@ -0,0 +1,85 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#ifdef __aarch64__
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7,
+                                        0xfffff8b2, 0xfffff96d };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8,
+                                        0xfffffffffffff5a4 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6,
+                                         0xedf6b4, 0xedf7a2 };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e,
+                                         0xfefffff30d};
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high_n
+#define TEST_MSG "VMULL_HIGH_N"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)                      \
+  DECL_VARIABLE(vector1, T, W, N);              \
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)              \
+  VECT_VAR(vector_res, T1, W1, N1) =                                    \
+    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
+                      C);                                               \
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
+                      VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C)               \
+  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)
+
+#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment)                   \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+  DECL_VMUL(int, 16, 8);
+  DECL_VMUL(int, 32, 4);
+  DECL_VMUL(uint, 16, 8);
+  DECL_VMUL(uint, 32, 4);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, q, int, s, 16, 8);
+  VLOAD(vector1, buffer, q, int, s, 32, 4);
+  VLOAD(vector1, buffer, q, uint, u, 16, 8);
+  VLOAD(vector1, buffer, q, uint, u, 32, 4);
+
+
+  /* Execute the tests.  */
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF);
+
+  CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, "");
+}
+#endif
+
+int main (void)
+{
+#ifdef __aarch64__
+  FNNAME (INSN_NAME) ();
+#endif
+
+  return 0;
+}
Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 218582)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,13 @@
+2014-12-11  Felix Yang  <felix.yang@huawei.com>
+            Jiji Jiang  <jiangjiji@huawei.com>
+
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: New test.
+
 2014-12-10  Martin Liska  <mliska@suse.cz>
 
 	* gcc.dg/ipa/pr63909.c: New test.
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 218582)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7576,671 +7576,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18891,6 +18226,78 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
   return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
+                                                   (int32_t)__b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
+                                                   (int16_t)__b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
+                                                   (int32_t)__b);
+}
+
 /* vmulq_lane  */
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -18968,6 +18375,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
+                                                          (int16x8_t)__b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                           __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
 /* vneg  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
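
(Not part of the patch -- a minimal usage sketch for reviewers, assuming an
AArch64 target with <arm_neon.h>; the values and the printf calls are only
illustrative.  vmull_n_s16 widens each 16-bit lane before multiplying by the
scalar, and vmulx_f32 follows the FMULX rule that (+/-0) * (+/-Inf) yields
+/-2.0 instead of NaN.)

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* Widening multiply by scalar: 16-bit lanes * 16-bit scalar -> 32-bit lanes.  */
  int16x4_t a = { 1, -2, 3, -4 };
  int32x4_t wide = vmull_n_s16 (a, 1000);     /* { 1000, -2000, 3000, -4000 }  */

  /* FMULX: like FMUL, except (+/-0) * (+/-Inf) gives +/-2.0 rather than NaN.  */
  float32x2_t x = { 0.0f, 3.0f };
  float32x2_t y = { __builtin_inff (), 2.0f };
  float32x2_t r = vmulx_f32 (x, y);           /* { 2.0f, 6.0f }  */

  printf ("%d %d %d %d\n", vgetq_lane_s32 (wide, 0), vgetq_lane_s32 (wide, 1),
          vgetq_lane_s32 (wide, 2), vgetq_lane_s32 (wide, 3));
  printf ("%f %f\n", vget_lane_f32 (r, 0), vget_lane_f32 (r, 1));
  return 0;
}
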
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 218582)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -276,6 +276,8 @@
     UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
 ])
 
 ;; -------------------------------------------------------------------
@@ -465,6 +467,9 @@
 
 )
 
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
 ;; Widened mode register suffixes for VD_BHSI/VQW.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 			  (V2SI "2d") (V16QI "8h") 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 218582)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1394,6 +1394,253 @@
  }
 )
 
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vw>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
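
(Again not part of the patch: the mull2 patterns above back the vmull_high_*
intrinsics, which operate on the upper half of a 128-bit input.  The sketch
below only states the intended equivalence; mull_high_n_ref is a made-up
name used for illustration.)

#include <arm_neon.h>

int32x4_t
mull_high_n_ref (int16x8_t a, int16_t b)
{
  /* vmull_high_n_s16 (a, b) is expected to compute the same result as
     widening the upper four lanes of a and multiplying each by b.  */
  return vmull_n_s16 (vget_high_s16 (a), b);
}
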
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 218582)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -187,6 +187,39 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
 
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.  */
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
   BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
   /* Implemented by aarch64_<sur>shl<mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
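
(One more reviewer note, not part of the patch: the _lane/_laneq patterns use
ENDIAN_LANE_N so that the architectural lane number passed by the user is
remapped to the element number GCC's vec_select expects on big-endian.  The
standalone function below only illustrates that arithmetic; it is not the
macro from aarch64.h.)

#include <stdio.h>

/* On big-endian, GCC numbers vector elements in the opposite order from the
   Advanced SIMD lane numbers, so lane n of an N-element vector becomes
   element N - 1 - n.  */
static int
endian_lane_n (int nunits, int n, int big_endian)
{
  return big_endian ? nunits - 1 - n : n;
}

int
main (void)
{
  /* vmull_lane_s16 with lane 1 on a 4 x 16-bit vector:
     little-endian keeps lane 1, big-endian selects element 2.  */
  printf ("LE: %d  BE: %d\n", endian_lane_n (4, 1, 0), endian_lane_n (4, 1, 1));
  return 0;
}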

Attachment: aarch64-instrinsic-vmul-v3.diff

