This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[AVX]: Add AVX vpermilXXX builtins


Hi,

I am checking in this patch to add AVX vpermilXXX builtins.


H.J.
----
gcc/

2008-04-19  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/gmmintrin.h (_mm_permutevar_pd): Call
	__builtin_ia32_vpermilvarpd instead of __builtin_ia32_vpermilpd128.
	(_mm256_permutevar_pd): Call __builtin_ia32_vpermilvarpd256
	instead of __builtin_ia32_vpermilpd256.
	(_mm_permutevar_ps): Call __builtin_ia32_vpermilvarps instead
	of __builtin_ia32_vpermilps128.
	(_mm256_permutevar_ps): Call __builtin_ia32_vpermilvarps256
	instead of __builtin_ia32_vpermilps256.
	(_mm_permute_pd): Call __builtin_ia32_vpermilpd instead of
	__builtin_ia32_vpermilpd128.
	(_mm256_permute_pd): Fix a typo.
	(_mm_permute_ps): Call __builtin_ia32_vpermilps instead of
	__builtin_ia32_vpermilps128.
	(_mm_permute2_pd): Call __builtin_ia32_vpermil2pd instead of
	__builtin_ia32_vpermil2pd128.
	(_mm_permute2_ps): Call __builtin_ia32_vpermil2ps instead of
	__builtin_ia32_vpermil2ps128.
	(_mm_maskload_pd): Call __builtin_ia32_maskloadpd instead of
	__builtin_ia32_maskloadpd128.
	(_mm_maskstore_pd): Call __builtin_ia32_maskstorepd instead of
	__builtin_ia32_maskstorepd128.
	(_mm_maskload_ps): Call __builtin_ia32_maskloadps instead of
	__builtin_ia32_maskloadps128.
	(_mm_maskstore_ps): Call __builtin_ia32_maskstoreps instead of
	__builtin_ia32_maskstoreps128.

	* config/i386/i386.c (ix86_builtins): Add IX86_BUILTIN_VPERMILVARPD,
	IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256,
	IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_VPERMILPD,
	IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256,
	IX86_BUILTIN_VPERMILPS256, IX86_BUILTIN_VPERMIL2PD,
	IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256,
	IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256,
	IX86_BUILTIN_VPERM2F128PS256 and IX86_BUILTIN_VPERM2F128SI256.
	(sse_builtin_type): Add V8SF_FTYPE_V8SF_INT,
	V4DF_FTYPE_V4DF_INT, V8SI_FTYPE_V8SI_V8SI_INT,
	V8SF_FTYPE_V8SF_V8SF_V8SF_INT, V4DF_FTYPE_V4DF_V4DF_V4DF_INT,
	V4SF_FTYPE_V4SF_V4SF_V4SF_INT and V2DF_FTYPE_V2DF_V2DF_V2DF_INT.
	(bdesc_sse_args): Add __builtin_ia32_vperm2f128_pd256,
	__builtin_ia32_vperm2f128_ps256, __builtin_ia32_vperm2f128_si256,
	__builtin_ia32_vpermilpd, __builtin_ia32_vpermilps,
	__builtin_ia32_vpermilpd256, __builtin_ia32_vpermilps256,
	__builtin_ia32_vpermil2pd, __builtin_ia32_vpermil2ps,
	__builtin_ia32_vpermil2pd256 and __builtin_ia32_vpermil2ps256.
	(bdesc_2arg): Add __builtin_ia32_vpermilvarpd,
	__builtin_ia32_vpermilvarps, __builtin_ia32_vpermilvarpd256
	and __builtin_ia32_vpermilvarps256.
	(bdesc_1arg): Add IX86_BUILTIN_VPERMILPD, IX86_BUILTIN_VPERMILPS,
	IX86_BUILTIN_VPERMILPD256 and IX86_BUILTIN_VPERMILPS256.
	(ix86_init_mmx_sse_builtins): Handle bdesc_avx_4arg.  Support
	v8si_ftype_v8si_v8si_int in bdesc_2arg.  Define
	__builtin_ia32_vpermilpd, __builtin_ia32_vpermilps,
	__builtin_ia32_vpermilpd256 and __builtin_ia32_vpermilps256.
	(ix86_init_mmx_sse_builtins): Handle more AVX builtins.
	(ix86_expand_sse_operands_builtin): Support 4 arguments.  Handle
	more AVX builtins.

	* config/i386/i386.md (UNSPEC_VPERMIL): New.
	(UNSPEC_VPERMIL2): Likewise.
	(UNSPEC_VPERMIL2F128): Likewise.

	* config/i386/sse.md (vpermilbits): New.
	(avx_vpermil<mode>): Likewise.
	(avx_vpermilvar<mode>3): Likewise.
	(avx_vpermil2<mode>3): Likewise.
	(avx_vperm2f128<mode>3): Likewise.

gcc/testsuite/

2008-04-19  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.target/i386/avx-1.c: Add more tests for gmmintrin.h.

	* gcc.target/i386/avx-2.c (test_3): New.
	Add more tests for gmmintrin.h.

Index: testsuite/gcc.target/i386/avx-1.c
===================================================================
--- testsuite/gcc.target/i386/avx-1.c	(revision 134473)
+++ testsuite/gcc.target/i386/avx-1.c	(working copy)
@@ -33,6 +33,17 @@
 #define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1)
 #define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1)
 #define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1)
+#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1)
+#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1)
+#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1)
+#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1)
+#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1)
+#define __builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1)
+#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1)
 
 /* wmmintrin.h */
 #define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1)
Index: testsuite/gcc.target/i386/avx-2.c
===================================================================
--- testsuite/gcc.target/i386/avx-2.c	(revision 134473)
+++ testsuite/gcc.target/i386/avx-2.c	(working copy)
@@ -35,6 +35,11 @@
   type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
   { return func (A, B, imm1, imm2); }
 
+#define test_3(func, type, op1_type, op2_type, op3_type, imm)	\
+  type _CONCAT(_,func) (op1_type A, op2_type B,				\
+			op3_type C, int const I)		\
+  { return func (A, B, C, imm); }
+
 #define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm)	\
   type _CONCAT(_,func) (op1_type A, op2_type B,				\
 			op3_type C, op4_type D, int const I)		\
@@ -63,6 +68,17 @@ test_2 (_mm256_cmp_ps, __m256, __m256, _
 test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1)
 test_1 (_mm256_extractf128_ps, __m128, __m256, 1)
 test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1)
+test_1 (_mm_permute_pd, __m128d, __m128d, 1)
+test_1 (_mm256_permute_pd, __m256d, __m256d, 1)
+test_1 (_mm_permute_ps, __m128, __m128, 1)
+test_1 (_mm256_permute_ps, __m256, __m256, 1)
+test_3 (_mm_permute2_pd, __m128d, __m128d, __m128d, __m128d, 1)
+test_3 (_mm256_permute2_pd, __m256d, __m256d, __m256d, __m256d, 1)
+test_3 (_mm_permute2_ps, __m128, __m128, __m128, __m128, 1)
+test_3 (_mm256_permute2_ps, __m256, __m256, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1)
 
 /* wmmintrin.h */
 test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 134473)
+++ config/i386/i386.md	(working copy)
@@ -200,6 +200,9 @@
 
    ; For AVX support
    (UNSPEC_PCMP			166)
+   (UNSPEC_VPERMIL		167)
+   (UNSPEC_VPERMIL2		168)
+   (UNSPEC_VPERMIL2F128		169)
   ])
 
 (define_constants
Index: config/i386/gmmintrin.h
===================================================================
--- config/i386/gmmintrin.h	(revision 134473)
+++ config/i386/gmmintrin.h	(working copy)
@@ -521,48 +521,51 @@ _mm256_zeroupper (void)
   __builtin_ia32_vzeroupper ();
 }
 
-#if 0
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permutevar_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_vpermilpd128 ((__v2df)__A, (__v2df)__B);
+  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
+						(__v2df)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permutevar_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__A, (__v4df)__B);
+  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
+						   (__v4df)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permutevar_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_vpermilps128 ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
+					       (__v4sf)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permutevar_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__A, (__v8sf)__B);
+  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
+						  (__v8sf)__B);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permute_pd (__m128d __X, const int __C)
 {
-  return (__m128d) __builtin_ia32_vpermilpd128 ((__v2df)__X, __C);
+  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permute_pd (__m256d __X, const int __C)
 {
-  return (__m256d) __builtin_ia32_vpermilps256 ((__v4df)__X, __C);
+  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permute_ps (__m128 __X, const int __C)
 {
-  return (__m128) __builtin_ia32_vpermilps128 ((__v4sf)__X, __C);
+  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -574,10 +577,10 @@ _mm256_permute_ps (__m256 __X, const int
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permute2_pd (__m128d __X, __m128d __Y, __m128d __C, const int __I)
 {
-  return (__m128d) __builtin_ia32_vpermil2pd128 ((__v2df)__X,
-						 (__v2df)__Y,
-						 (__v2df)__C,
-						 __I);
+  return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+					      (__v2df)__Y,
+					      (__v2df)__C,
+					      __I);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -592,10 +595,10 @@ _mm256_permute2_pd (__m256d __X, __m256d
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_permute2_ps (__m128 __X, __m128 __Y, __m128 __C, const int __I)
 {
-  return (__m128) __builtin_ia32_vpermil2ps128 ((__v4sf)__X,
-						(__v4sf)__Y,
-						(__v4sf)__C,
-						__I);
+  return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+					     (__v4sf)__Y,
+					     (__v4sf)__C,
+					     __I);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -607,44 +610,40 @@ _mm256_permute2_ps (__m256 __X, __m256 _
 						__I);
 }
 #else
-#define _mm_permute_pd(X, C)					\
-  ((__m128d) __builtin_ia32_vpermilpd128 ((__v2df)(__m128d)(X),	\
-					  (int)(C)))
-
-#define _mm256_permute_pd(X, C)					\
-  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	\
-					  (int)(C)))
-
-#define _mm_permute_ps(X, C)					\
-  ((__m128) __builtin_ia32_vpermilps128 ((__v4sf)(__m128)(X),	\
-					 (int)(C)))
-
-#define _mm256_permute_ps(X, C)					\
-  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X),	\
-					 (int)(C)))
-
-#define _mm_permute2_pd(X, Y, C, I)				\
-  ((__m128d) __builtin_ia32_vpermil2pd128 ((__v2df)(__m128d)(X),\
-					   (__v2df)(__m128d)(Y),\
-					   (__v2df)(__m128d)(C),\
-					   (int)(I)))
+#define _mm_permute_pd(X, C)						\
+  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
 
-#define _mm256_permute2_pd(X, Y, C, I)				\
-  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4sf)(__m256d)(X),\
-					   (__v4df)(__m256d)(Y),\
-					   (__v4df)(__m256d)(C),\
-					   (int)(I)))
+#define _mm256_permute_pd(X, C)						\
+  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))
 
-#define _mm_permute2_ps(X, Y, C, I)				\
-  ((__m128) __builtin_ia32_vpermil2ps128 ((__v4sf)(__m128)(X),	\
-					  (__v4sf)(__m128)(Y),  \
-					  (__v4sf)(__m128)(C),  \
-					  (int)(I)))
+#define _mm_permute_ps(X, C)						\
+  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
 
-#define _mm256_permute2_ps(X, Y, C, I)				\
-  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),	\
-					  (__v8sf)(__m256)(Y),  \
-					  (__v8sf)(__m256)(C),  \
+#define _mm256_permute_ps(X, C)						\
+  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
+
+#define _mm_permute2_pd(X, Y, C, I)					\
+  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),		\
+					(__v2df)(__m128d)(Y),		\
+					(__v2df)(__m128d)(C),		\
+					(int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I)					\
+  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),	\
+					   (__v4df)(__m256d)(Y),	\
+					   (__v4df)(__m256d)(C),	\
+					   (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I)					\
+  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),		\
+				       (__v4sf)(__m128)(Y),		\
+				       (__v4sf)(__m128)(C),		\
+				       (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I)					\
+  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),		\
+					  (__v8sf)(__m256)(Y),  	\
+					  (__v8sf)(__m256)(C),		\
 					  (int)(I)))
 #endif
 
@@ -652,43 +651,44 @@ _mm256_permute2_ps (__m256 __X, __m256 _
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
 {
-  return (__m256d) __builtin_ia32_vperm2f128_pd ((__v4df)__X,
-						 (__v4df)__y,
-						 __C);
+  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
+						    (__v4df)__Y,
+						    __C);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
 {
-  return (__m256) __builtin_ia32_vperm2f128_ps ((__v8sf)__X,
-						(__v8sf)__y,
-						__C);
+  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
+						   (__v8sf)__Y,
+						   __C);
 }
 
 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
 {
   return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
-						    (__v8si)__y,
+						    (__v8si)__Y,
 						    __C);
 }
 #else
-#define _mm256_permute2f128_pd(X, Y, C)				\
-  ((__m256d) __builtin_ia32_vperm2f128_pd ((__v4df)(__m256d)(X),\
-					   (__v4df)(__m256d)(Y),\
-					   (int)(C)))
-
-#define _mm256_permute2f128_ps(X, Y, C)				\
-  ((__m256) __builtin_ia32_vperm2f128_ps ((__v8sf)(__m256)(X),	\
-					  (__v8sf)(__m256)(Y),  \
-					  (int)(C)))
-
-#define _mm256_permute2f128_si256(X, Y, C)			\
-  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),\
-					      (__v8si)(__m256i)(Y),\
+#define _mm256_permute2f128_pd(X, Y, C)					\
+  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
+					      (__v4df)(__m256d)(Y),	\
+					      (int)(C)))
+
+#define _mm256_permute2f128_ps(X, Y, C)					\
+  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
+					     (__v8sf)(__m256)(Y),	\
+					     (int)(C)))
+
+#define _mm256_permute2f128_si256(X, Y, C)				\
+  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
+					      (__v8si)(__m256i)(Y),	\
 					      (int)(C)))
 #endif
 
+#if 0
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_broadcast_ss (float const *__X)
 {
@@ -835,14 +835,14 @@ _mm256_storeu_si256 (__m256i *__P, __m25
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskload_pd (double const *__P, __m128d __M)
 {
-  return (__m128d) __builtin_ia32_maskloadpd128 ((__v2df)__P,
-						 (__v2df)__M);
+  return (__m128d) __builtin_ia32_maskloadpd ((__v2df)__P,
+					      (__v2df)__M);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
 {
-  __builtin_ia32_maskstorepd128 (__P, (__v2df)__M, (__v2df)__A);
+  __builtin_ia32_maskstorepd (__P, (__v2df)__M, (__v2df)__A);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -861,14 +861,14 @@ _mm256_maskstore_pd (double *__P, __m256
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskload_ps (float const *__P, __m128 __M)
 {
-  return (__m128) __builtin_ia32_maskloadps128 ((__v4sf)__P,
-						(__v4sf)__M);
+  return (__m128) __builtin_ia32_maskloadps ((__v4sf)__P,
+					     (__v4sf)__M);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
 {
-  __builtin_ia32_maskstoreps128 (__P, (__v4sf)__M, (__v4sf)__A);
+  __builtin_ia32_maskstoreps (__P, (__v4sf)__M, (__v4sf)__A);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 134473)
+++ config/i386/sse.md	(working copy)
@@ -80,6 +80,9 @@
 ;; Mapping of immediate bits for blend instructions
 (define_mode_attr blendbits [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
 
+;; Mapping of immediate bits for vpermil instructions
+(define_mode_attr vpermilbits [(V8SF "255") (V4SF "255") (V4DF "15") (V2DF "3")])
+
 ;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics.
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -8640,3 +8643,50 @@
   [(set_attr "type" "sse")
    (set_attr "memory" "none")
    (set_attr "mode" "OI")])
+
+(define_insn "avx_vpermil<mode>"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P 1 "register_operand" "xm")
+	   (match_operand:SI 2 "const_0_to_<vpermilbits>_operand" "n")]
+	  UNSPEC_VPERMIL))]
+  "TARGET_AVX"
+  "vpermilp<avxmodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vpermilvar<mode>3"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P 1 "register_operand" "x")
+	   (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_VPERMIL))]
+  "TARGET_AVX"
+  "vpermilp<avxmodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vpermil2<mode>3"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P 1 "register_operand" "x,x")
+	   (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "x,xm")
+	   (match_operand:AVXMODEF2P 3 "nonimmediate_operand" "xm,x")
+	   (match_operand:SI 4 "const_0_to_3_operand" "n,n")]
+	  UNSPEC_VPERMIL2))]
+  "TARGET_AVX"
+  "vpermil2p<avxmodesuffixf2c>\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vperm2f128<mode>3"
+  [(set (match_operand:AVX256MODE 0 "register_operand" "=x")
+	(unspec:AVX256MODE
+	  [(match_operand:AVX256MODE 1 "register_operand" "x")
+	   (match_operand:AVX256MODE 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n")]
+	  UNSPEC_VPERMIL2F128))]
+  "TARGET_AVX"
+  "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "mode" "<avxvecmode>")])
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 134473)
+++ config/i386/i386.c	(working copy)
@@ -17897,6 +17897,21 @@ enum ix86_builtins
   IX86_BUILTIN_EXTRACTF128SI256,
   IX86_BUILTIN_VZEROALL,
   IX86_BUILTIN_VZEROUPPER,
+  IX86_BUILTIN_VPERMILVARPD,
+  IX86_BUILTIN_VPERMILVARPS,
+  IX86_BUILTIN_VPERMILVARPD256,
+  IX86_BUILTIN_VPERMILVARPS256,
+  IX86_BUILTIN_VPERMILPD,
+  IX86_BUILTIN_VPERMILPS,
+  IX86_BUILTIN_VPERMILPD256,
+  IX86_BUILTIN_VPERMILPS256,
+  IX86_BUILTIN_VPERMIL2PD,
+  IX86_BUILTIN_VPERMIL2PS,
+  IX86_BUILTIN_VPERMIL2PD256,
+  IX86_BUILTIN_VPERMIL2PS256,
+  IX86_BUILTIN_VPERM2F128PD256,
+  IX86_BUILTIN_VPERM2F128PS256,
+  IX86_BUILTIN_VPERM2F128SI256,
 
   /* TFmode support builtins.  */
   IX86_BUILTIN_INFQ,
@@ -18247,9 +18262,11 @@ enum sse_builtin_type
   V4DF_FTYPE_V4SI,
   V4DF_FTYPE_V4SF,
   V4SF_FTYPE_V4DF,
+  V8SF_FTYPE_V8SF_INT,
   V4SI_FTYPE_V8SI_INT,
   V4SF_FTYPE_V8SF_INT,
   V2DF_FTYPE_V4DF_INT,
+  V4DF_FTYPE_V4DF_INT,
   V4SF_FTYPE_V4SF_INT,
   V2DI_FTYPE_V2DI_INT,
   V2DF_FTYPE_V2DF_INT,
@@ -18259,13 +18276,18 @@ enum sse_builtin_type
   V4SF_FTYPE_V4SF_V4SF_V4SF,
   V2DF_FTYPE_V2DF_V2DF_V2DF,
   V16QI_FTYPE_V16QI_V16QI_INT,
+  V8SI_FTYPE_V8SI_V8SI_INT,
   V8HI_FTYPE_V8HI_V8HI_INT,
   V8SF_FTYPE_V8SF_V8SF_INT,
   V4SI_FTYPE_V4SI_V4SI_INT,
   V4DF_FTYPE_V4DF_V4DF_INT,
   V4SF_FTYPE_V4SF_V4SF_INT,
   V2DI_FTYPE_V2DI_V2DI_INT,
-  V2DF_FTYPE_V2DF_V2DF_INT
+  V2DF_FTYPE_V2DF_V2DF_INT,
+  V8SF_FTYPE_V8SF_V8SF_V8SF_INT,
+  V4DF_FTYPE_V4DF_V4DF_V4DF_INT,
+  V4SF_FTYPE_V4SF_V4SF_V4SF_INT,
+  V2DF_FTYPE_V2DF_V2DF_V2DF_INT
 };
 
 /* SSE builtins with variable number of arguments.  */
@@ -18326,6 +18348,17 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v2df3,  "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4sf3,  "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4df3,  "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v8sf3,  "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF_INT },
 };
 
 static const struct builtin_description bdesc_2arg[] =
@@ -18638,8 +18671,12 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, 0 },
   { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, 0 },
   { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3,  "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3,  "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, 0 },
 };
 
 static const struct builtin_description bdesc_1arg[] =
@@ -19511,10 +19548,18 @@ ix86_init_mmx_sse_builtins (void)
     = build_function_type_list (V4DF_type_node,
 				V4DF_type_node, V4DF_type_node,
 				NULL_TREE);
+  tree v8sf_ftype_v8sf_int
+    = build_function_type_list (V8SF_type_node,
+				V8SF_type_node, integer_type_node,
+				NULL_TREE);
   tree v4si_ftype_v8si_int
     = build_function_type_list (V4SI_type_node,
 				V8SI_type_node, integer_type_node,
 				NULL_TREE);
+  tree v4df_ftype_v4df_int
+    = build_function_type_list (V4DF_type_node,
+				V4DF_type_node, integer_type_node,
+				NULL_TREE);
   tree v4sf_ftype_v8sf_int
     = build_function_type_list (V4SF_type_node,
 				V8SF_type_node, integer_type_node,
@@ -19538,11 +19583,36 @@ ix86_init_mmx_sse_builtins (void)
 				V4DF_type_node, V4DF_type_node,
 				V4DF_type_node,
 				NULL_TREE);
+  tree v8si_ftype_v8si_v8si_int
+    = build_function_type_list (V8SI_type_node,
+				V8SI_type_node, V8SI_type_node,
+				integer_type_node,
+				NULL_TREE);
   tree v4df_ftype_v4df_v4df_int
     = build_function_type_list (V4DF_type_node,
 				V4DF_type_node, V4DF_type_node,
 				integer_type_node,
 				NULL_TREE);
+  tree v8sf_ftype_v8sf_v8sf_v8sf_int
+    = build_function_type_list (V8SF_type_node,
+				V8SF_type_node, V8SF_type_node,
+				V8SF_type_node, integer_type_node,
+				NULL_TREE);
+  tree v4df_ftype_v4df_v4df_v4df_int
+    = build_function_type_list (V4DF_type_node,
+				V4DF_type_node, V4DF_type_node,
+				V4DF_type_node, integer_type_node,
+				NULL_TREE);
+  tree v4sf_ftype_v4sf_v4sf_v4sf_int
+    = build_function_type_list (V4SF_type_node,
+				V4SF_type_node, V4SF_type_node,
+				V4SF_type_node, integer_type_node,
+				NULL_TREE);
+  tree v2df_ftype_v2df_v2df_v2df_int
+    = build_function_type_list (V2DF_type_node,
+				V2DF_type_node, V2DF_type_node,
+				V2DF_type_node, integer_type_node,
+				NULL_TREE);
 
   tree ftype;
 
@@ -19617,24 +19687,30 @@ ix86_init_mmx_sse_builtins (void)
 	case V4SF_FTYPE_V4DF:
 	  type = v4sf_ftype_v4df;
 	  break;
+	case V8SF_FTYPE_V8SF_INT:
+	  type = v8sf_ftype_v8sf_int;
+	  break;
 	case V4SI_FTYPE_V8SI_INT:
 	  type = v4si_ftype_v8si_int;
 	  break;
-	case V4SF_FTYPE_V8SF_INT:
-	  type = v4sf_ftype_v8sf_int;
-	  break;
-	case V2DF_FTYPE_V4DF_INT:
-	  type = v2df_ftype_v4df_int;
+	case V4DF_FTYPE_V4DF_INT:
+	  type = v4df_ftype_v4df_int;
 	  break;
 	case V4SF_FTYPE_V4SF_INT:
 	  type = v4sf_ftype_v4sf_int;
 	  break;
+	case V4SF_FTYPE_V8SF_INT:
+	  type = v4sf_ftype_v8sf_int;
+	  break;
 	case V2DI_FTYPE_V2DI_INT:
 	  type = v2di_ftype_v2di_int;
 	  break;
 	case V2DF_FTYPE_V2DF_INT:
 	  type = v2df_ftype_v2df_int;
 	  break;
+	case V2DF_FTYPE_V4DF_INT:
+	  type = v2df_ftype_v4df_int;
+	  break;
 	case V16QI_FTYPE_V16QI_V16QI_V16QI:
 	  type = v16qi_ftype_v16qi_v16qi_v16qi;
 	  break;
@@ -19653,6 +19729,9 @@ ix86_init_mmx_sse_builtins (void)
 	case V16QI_FTYPE_V16QI_V16QI_INT:
 	  type = v16qi_ftype_v16qi_v16qi_int;
 	  break;
+	case V8SI_FTYPE_V8SI_V8SI_INT:
+	  type = v8si_ftype_v8si_v8si_int;
+	  break;
 	case V8HI_FTYPE_V8HI_V8HI_INT:
 	  type = v8hi_ftype_v8hi_v8hi_int;
 	  break;
@@ -19674,6 +19753,18 @@ ix86_init_mmx_sse_builtins (void)
 	case V2DF_FTYPE_V2DF_V2DF_INT:
 	  type = v2df_ftype_v2df_v2df_int;
 	  break;
+	case V8SF_FTYPE_V8SF_V8SF_V8SF_INT:
+	  type = v8sf_ftype_v8sf_v8sf_v8sf_int;
+	  break;
+	case V4DF_FTYPE_V4DF_V4DF_V4DF_INT:
+	  type = v4df_ftype_v4df_v4df_v4df_int;
+	  break;
+	case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+	  type = v4sf_ftype_v4sf_v4sf_v4sf_int;
+	  break;
+	case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+	  type = v2df_ftype_v2df_v2df_v2df_int;
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
@@ -20277,7 +20368,7 @@ ix86_expand_sse_operands_builtin (enum i
     {
       rtx op;
       enum machine_mode mode;
-    } args[3];
+    } args[4];
   bool last_arg_constant = false;
   const struct insn_data *insn_p = &insn_data[icode];
   enum machine_mode tmode = insn_p->operand[0].mode;
@@ -20292,7 +20383,9 @@ ix86_expand_sse_operands_builtin (enum i
     case V4SF_FTYPE_V4DF:
       nargs = 1;
       break;
+    case V8SF_FTYPE_V8SF_INT:
     case V4SI_FTYPE_V8SI_INT:
+    case V4DF_FTYPE_V4DF_INT:
     case V4SF_FTYPE_V8SF_INT:
     case V2DF_FTYPE_V4DF_INT:
     case V4SF_FTYPE_V4SF_INT:
@@ -20310,6 +20403,7 @@ ix86_expand_sse_operands_builtin (enum i
       break;
     case V16QI_FTYPE_V16QI_V16QI_INT:
     case V8HI_FTYPE_V8HI_V8HI_INT:
+    case V8SI_FTYPE_V8SI_V8SI_INT:
     case V8SF_FTYPE_V8SF_V8SF_INT: 
     case V4SI_FTYPE_V4SI_V4SI_INT:
     case V4DF_FTYPE_V4DF_V4DF_INT:
@@ -20319,6 +20413,13 @@ ix86_expand_sse_operands_builtin (enum i
       nargs = 3;
       last_arg_constant = true;
       break;
+    case V8SF_FTYPE_V8SF_V8SF_V8SF_INT:
+    case V4DF_FTYPE_V4DF_V4DF_V4DF_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+      nargs = 4;
+      last_arg_constant = true;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -20349,10 +20450,16 @@ ix86_expand_sse_operands_builtin (enum i
 	      case CODE_FOR_sse4_1_roundss:
 	      case CODE_FOR_sse4_1_blendps:
 	      case CODE_FOR_avx_blendpd256:
+	      case CODE_FOR_avx_vpermilv4df:
 		error ("the last argument must be a 4-bit immediate");
 		return const0_rtx;
 
 	      case CODE_FOR_sse4_1_blendpd:
+	      case CODE_FOR_avx_vpermilv2df:
+	      case CODE_FOR_avx_vpermil2v2df3:
+	      case CODE_FOR_avx_vpermil2v4sf3:
+	      case CODE_FOR_avx_vpermil2v4df3:
+	      case CODE_FOR_avx_vpermil2v8sf3:
 		error ("the last argument must be a 2-bit immediate");
 		return const0_rtx;
 
@@ -20366,8 +20473,8 @@ ix86_expand_sse_operands_builtin (enum i
 	      case CODE_FOR_avx_cmpssv4sf3:
 	      case CODE_FOR_avx_cmppdv2df3:
 	      case CODE_FOR_avx_cmppsv4sf3:
-	      case CODE_FOR_avx_cmppsv8sf3:
 	      case CODE_FOR_avx_cmppdv4df3:
+	      case CODE_FOR_avx_cmppsv8sf3:
 		error ("the last argument must be a 5-bit immediate");
 		return const0_rtx;
 
@@ -20409,6 +20516,10 @@ ix86_expand_sse_operands_builtin (enum i
       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
 			     args[2].op);
       break;
+    case 4:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			     args[2].op, args[3].op);
+      break;
     default:
       gcc_unreachable ();
     }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]