
[AArch64] [4/4 Fix vtbx1] Handle vtbx{1,3} emulation sequence using other intrinsics


Hi,

The vtbx1_<psu>8 and vtbx3_<psu>8 intrinsics were buggy and could
generate junk. We fix that by reimplementing their emulation in
terms of other Neon intrinsics.

The new intrinsic sequences closely follow those suggested in the
latest version of the Neon Intrinsics specification.
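
For reference, each rewritten intrinsic follows the same three-step
pattern: build an all-ones mask for lanes whose index is in range for
the table, do the plain table lookup, then select between the lookup
result and r. A minimal standalone sketch of the unsigned vtbx1 case
(the _sketch name is mine, not part of the patch):

#include <arm_neon.h>

/* Sketch of the vtbx1 emulation: lanes with an out-of-range index
   (>= 8 for a one-register table) must come from r, not junk.  */
static inline uint8x8_t
vtbx1_u8_sketch (uint8x8_t r, uint8x8_t tab, uint8x8_t idx)
{
  /* All-ones in each lane whose index addresses the 8-entry table.  */
  uint8x8_t mask = vclt_u8 (idx, vmov_n_u8 (8));
  /* vtbl1 already yields 0 for out-of-range lanes; only the in-range
     lanes of this result are kept.  */
  uint8x8_t tbl = vtbl1_u8 (tab, idx);
  /* Lane-wise select: table result where in range, r elsewhere.  */
  return vbsl_u8 (mask, tbl, r);
}

The vtbx3 variants are identical except that the bound is 24 (three
8-entry table registers) and vtbl3 does the lookup.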

Tested on aarch64-none-elf with no regressions.

OK?

Thanks,
James

---
gcc/

2013-11-22  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/arm_neon.h (vtbx1_<psu>8): Emulate behaviour
	using other intrinsics.
	(vtbx3_<psu>8): Likewise.
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 11f8037..7e374bc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -15135,54 +15135,6 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx)
-{
-  int8x8_t result;
-  int8x8_t tmp1;
-  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx)
-{
-  uint8x8_t result;
-  uint8x8_t tmp1;
-  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx)
-{
-  poly8x8_t result;
-  poly8x8_t tmp1;
-  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
 {
   int8x8_t result = r;
@@ -15219,63 +15171,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx)
-{
-  int8x8_t result;
-  int8x8_t tmp1;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-	   "movi %0.8b, 24\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx)
-{
-  uint8x8_t result;
-  uint8x8_t tmp1;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-	   "movi %0.8b, 24\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx)
-{
-  poly8x8_t result;
-  poly8x8_t tmp1;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-	   "movi %0.8b, 24\n\t"
-	   "cmhs %0.8b, %3.8b, %0.8b\n\t"
-	   "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-	   "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=&w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result = r;
@@ -24886,6 +24781,66 @@ vsubd_u64 (uint64x1_t __a, uint64x1_t __b)
   return __a - __b;
 }
 
+/* vtbx1  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+			      vmov_n_u8 (8));
+  int8x8_t __tbl = vtbl1_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
+  uint8x8_t __tbl = vtbl1_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
+  poly8x8_t __tbl = vtbl1_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
+}
+
+/* vtbx3  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+			      vmov_n_u8 (24));
+  int8x8_t __tbl = vtbl3_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
+  uint8x8_t __tbl = vtbl3_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
+  poly8x8_t __tbl = vtbl3_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
+}
+
 /* vtrn */
 
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))

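For reviewers who want to sanity-check the fix by hand, a hypothetical
smoke test (not part of the patch; assumes an AArch64 target with the
patched arm_neon.h):

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  uint8_t tab_data[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  uint8_t idx_data[8] = { 0, 7, 8, 255, 3, 9, 1, 2 };
  uint8_t r_data[8]   = { 90, 91, 92, 93, 94, 95, 96, 97 };

  uint8x8_t res = vtbx1_u8 (vld1_u8 (r_data), vld1_u8 (tab_data),
			    vld1_u8 (idx_data));

  uint8_t out[8];
  vst1_u8 (out, res);

  /* Expected: 10 17 92 93 13 95 11 12 -- out-of-range lanes (indices
     8, 255 and 9) keep the corresponding lane of r.  */
  for (int i = 0; i < 8; i++)
    printf ("%u ", out[i]);
  printf ("\n");
  return 0;
}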