This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[AVX]: Add AVX vpermilXXX builtins
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sat, 19 Apr 2008 15:21:49 -0700
- Subject: [AVX]: Add AVX vpermilXXX builtins
Hi,
I am checking in this patch to add AVX vpermilXXX builtins.
H.J.
----
gcc/
2008-04-19 H.J. Lu <hongjiu.lu@intel.com>
* config/i386/gmmintrin.h (_mm_permutevar_pd): Call
__builtin_ia32_vpermilvarpd instead of __builtin_ia32_vpermilpd128.
(_mm256_permutevar_pd): Call __builtin_ia32_vpermilvarpd256
instead of __builtin_ia32_vpermilpd256.
(_mm_permutevar_ps): Call __builtin_ia32_vpermilvarps instead
of __builtin_ia32_vpermilps128.
(_mm256_permutevar_ps): Call __builtin_ia32_vpermilvarps256
instead of __builtin_ia32_vpermilps256.
(_mm_permute_pd): Call __builtin_ia32_vpermilpd instead of
__builtin_ia32_vpermilpd128.
(_mm256_permute_pd): Fix a typo.
(_mm_permute_ps): Call __builtin_ia32_vpermilps instead of
__builtin_ia32_vpermilps128.
(_mm_permute2_pd): Call __builtin_ia32_vpermil2pd instead of
__builtin_ia32_vpermil2pd128.
(_mm_permute2_ps): Call __builtin_ia32_vpermil2ps instead of
__builtin_ia32_vpermil2ps128.
(_mm_maskload_pd): Call __builtin_ia32_maskloadpd instead of
__builtin_ia32_maskloadpd128.
(_mm_maskstore_pd): Call __builtin_ia32_maskstorepd instead of
__builtin_ia32_maskstorepd128.
(_mm_maskload_ps): Call __builtin_ia32_maskloadps instead of
__builtin_ia32_maskloadps128.
(_mm_maskstore_ps): Call __builtin_ia32_maskstoreps instead of
__builtin_ia32_maskstoreps128.
* config/i386/i386.c (ix86_builtins): Add IX86_BUILTIN_VPERMILVARPD,
IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256,
IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_VPERMILPD,
IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256,
IX86_BUILTIN_VPERMILPS256, IX86_BUILTIN_VPERMIL2PD,
IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256,
IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256,
IX86_BUILTIN_VPERM2F128PS256 and IX86_BUILTIN_VPERM2F128SI256.
(sse_builtin_type): Add V8SF_FTYPE_V8SF_INT,
V4DF_FTYPE_V4DF_INT, V8SI_FTYPE_V8SI_V8SI_INT,
V8SF_FTYPE_V8SF_V8SF_V8SF_INT, V4DF_FTYPE_V4DF_V4DF_V4DF_INT,
V4SF_FTYPE_V4SF_V4SF_V4SF_INT and V2DF_FTYPE_V2DF_V2DF_V2DF_INT.
(bdesc_sse_args): Add __builtin_ia32_vperm2f128_pd256,
__builtin_ia32_vperm2f128_ps256, __builtin_ia32_vperm2f128_si256,
__builtin_ia32_vpermilpd, __builtin_ia32_vpermilps,
__builtin_ia32_vpermilpd256, __builtin_ia32_vpermilps256,
__builtin_ia32_vpermil2pd, __builtin_ia32_vpermil2ps,
__builtin_ia32_vpermil2pd256 and __builtin_ia32_vpermil2ps256.
(bdesc_2arg): Add __builtin_ia32_vpermilvarpd,
__builtin_ia32_vpermilvarps, __builtin_ia32_vpermilvarpd256
and __builtin_ia32_vpermilvarps256.
(bdesc_1arg): Add IX86_BUILTIN_VPERMILPD, IX86_BUILTIN_VPERMILPS,
IX86_BUILTIN_VPERMILPD256 and IX86_BUILTIN_VPERMILPS256.
(ix86_init_mmx_sse_builtins): Handle the new sse_builtin_type
entries. Support v8si_ftype_v8si_v8si_int in bdesc_sse_args. Define
__builtin_ia32_vpermilpd, __builtin_ia32_vpermilps,
__builtin_ia32_vpermilpd256 and __builtin_ia32_vpermilps256.
(ix86_init_mmx_sse_builtins): Handle more AVX builtins.
(ix86_expand_sse_operands_builtin): Support 4 arguments. Handle
more AVX builtins.
* config/i386/i386.md (UNSPEC_VPERMIL): New.
(UNSPEC_VPERMIL2): Likewise.
(UNSPEC_VPERMIL2F128): Likewise.
* config/i386/sse.md (vpermilbits): New.
(avx_vpermil<mode>): Likewise.
(avx_vpermilvar<mode>3): Likewise.
(avx_vpermil2<mode>3): Likewise.
(avx_vperm2f128<mode>3): Likewise.
gcc/testsuite/
2008-04-19 H.J. Lu <hongjiu.lu@intel.com>
* gcc.target/i386/avx-1.c: Add more tests for gmmintrin.h.
* gcc.target/i386/avx-2.c (test_3): New.
Add more tests for gmmintrin.h.
Index: testsuite/gcc.target/i386/avx-1.c
===================================================================
--- testsuite/gcc.target/i386/avx-1.c (revision 134473)
+++ testsuite/gcc.target/i386/avx-1.c (working copy)
@@ -33,6 +33,17 @@
#define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1)
#define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1)
#define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1)
+#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1)
+#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1)
+#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1)
+#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1)
+#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1)
+#define __builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1)
+#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1)
+#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1)
+#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1)
/* wmmintrin.h */
#define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1)
Index: testsuite/gcc.target/i386/avx-2.c
===================================================================
--- testsuite/gcc.target/i386/avx-2.c (revision 134473)
+++ testsuite/gcc.target/i386/avx-2.c (working copy)
@@ -35,6 +35,11 @@
type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
{ return func (A, B, imm1, imm2); }
+#define test_3(func, type, op1_type, op2_type, op3_type, imm) \
+ type _CONCAT(_,func) (op1_type A, op2_type B, \
+ op3_type C, int const I) \
+ { return func (A, B, C, imm); }
+
#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, \
op3_type C, op4_type D, int const I) \
@@ -63,6 +68,17 @@ test_2 (_mm256_cmp_ps, __m256, __m256, _
test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1)
test_1 (_mm256_extractf128_ps, __m128, __m256, 1)
test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1)
+test_1 (_mm_permute_pd, __m128d, __m128d, 1)
+test_1 (_mm256_permute_pd, __m256d, __m256d, 1)
+test_1 (_mm_permute_ps, __m128, __m128, 1)
+test_1 (_mm256_permute_ps, __m256, __m256, 1)
+test_3 (_mm_permute2_pd, __m128d, __m128d, __m128d, __m128d, 1)
+test_3 (_mm256_permute2_pd, __m256d, __m256d, __m256d, __m256d, 1)
+test_3 (_mm_permute2_ps, __m128, __m128, __m128, __m128, 1)
+test_3 (_mm256_permute2_ps, __m256, __m256, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1)
+test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1)
+test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1)
/* wmmintrin.h */
test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 134473)
+++ config/i386/i386.md (working copy)
@@ -200,6 +200,9 @@
; For AVX support
(UNSPEC_PCMP 166)
+ (UNSPEC_VPERMIL 167)
+ (UNSPEC_VPERMIL2 168)
+ (UNSPEC_VPERMIL2F128 169)
])
(define_constants
Index: config/i386/gmmintrin.h
===================================================================
--- config/i386/gmmintrin.h (revision 134473)
+++ config/i386/gmmintrin.h (working copy)
@@ -521,48 +521,51 @@ _mm256_zeroupper (void)
__builtin_ia32_vzeroupper ();
}
-#if 0
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vpermilpd128 ((__v2df)__A, (__v2df)__B);
+ return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
+ (__v2df)__B);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__A, (__v4df)__B);
+ return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
+ (__v4df)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vpermilps128 ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
+ (__v4sf)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__A, (__v8sf)__B);
+ return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
+ (__v8sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
- return (__m128d) __builtin_ia32_vpermilpd128 ((__v2df)__X, __C);
+ return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
- return (__m256d) __builtin_ia32_vpermilps256 ((__v4df)__X, __C);
+ return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
- return (__m128) __builtin_ia32_vpermilps128 ((__v4sf)__X, __C);
+ return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -574,10 +577,10 @@ _mm256_permute_ps (__m256 __X, const int
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_pd (__m128d __X, __m128d __Y, __m128d __C, const int __I)
{
- return (__m128d) __builtin_ia32_vpermil2pd128 ((__v2df)__X,
- (__v2df)__Y,
- (__v2df)__C,
- __I);
+ return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+ (__v2df)__Y,
+ (__v2df)__C,
+ __I);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -592,10 +595,10 @@ _mm256_permute2_pd (__m256d __X, __m256d
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_ps (__m128 __X, __m128 __Y, __m128 __C, const int __I)
{
- return (__m128) __builtin_ia32_vpermil2ps128 ((__v4sf)__X,
- (__v4sf)__Y,
- (__v4sf)__C,
- __I);
+ return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+ (__v4sf)__Y,
+ (__v4sf)__C,
+ __I);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -607,44 +610,40 @@ _mm256_permute2_ps (__m256 __X, __m256 _
__I);
}
#else
-#define _mm_permute_pd(X, C) \
- ((__m128d) __builtin_ia32_vpermilpd128 ((__v2df)(__m128d)(X), \
- (int)(C)))
-
-#define _mm256_permute_pd(X, C) \
- ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), \
- (int)(C)))
-
-#define _mm_permute_ps(X, C) \
- ((__m128) __builtin_ia32_vpermilps128 ((__v4sf)(__m128)(X), \
- (int)(C)))
-
-#define _mm256_permute_ps(X, C) \
- ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), \
- (int)(C)))
-
-#define _mm_permute2_pd(X, Y, C, I) \
- ((__m128d) __builtin_ia32_vpermil2pd128 ((__v2df)(__m128d)(X),\
- (__v2df)(__m128d)(Y),\
- (__v2df)(__m128d)(C),\
- (int)(I)))
+#define _mm_permute_pd(X, C) \
+ ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
-#define _mm256_permute2_pd(X, Y, C, I) \
- ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4sf)(__m256d)(X),\
- (__v4df)(__m256d)(Y),\
- (__v4df)(__m256d)(C),\
- (int)(I)))
+#define _mm256_permute_pd(X, C) \
+ ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))
-#define _mm_permute2_ps(X, Y, C, I) \
- ((__m128) __builtin_ia32_vpermil2ps128 ((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), \
- (__v4sf)(__m128)(C), \
- (int)(I)))
+#define _mm_permute_ps(X, C) \
+ ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
-#define _mm256_permute2_ps(X, Y, C, I) \
- ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \
- (__v8sf)(__m256)(Y), \
- (__v8sf)(__m256)(C), \
+#define _mm256_permute_ps(X, C) \
+ ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
+
+#define _mm_permute2_pd(X, Y, C, I) \
+ ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I) \
+ ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4df)(__m256d)(C), \
+ (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I) \
+ ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4sf)(__m128)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I) \
+ ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8sf)(__m256)(C), \
(int)(I)))
#endif
@@ -652,43 +651,44 @@ _mm256_permute2_ps (__m256 __X, __m256 _
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
- return (__m256d) __builtin_ia32_vperm2f128_pd ((__v4df)__X,
- (__v4df)__y,
- __C);
+ return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
+ (__v4df)__Y,
+ __C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
- return (__m256) __builtin_ia32_vperm2f128_ps ((__v8sf)__X,
- (__v8sf)__y,
- __C);
+ return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ __C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
- (__v8si)__y,
+ (__v8si)__Y,
__C);
}
#else
-#define _mm256_permute2f128_pd(X, Y, C) \
- ((__m256d) __builtin_ia32_vperm2f128_pd ((__v4df)(__m256d)(X),\
- (__v4df)(__m256d)(Y),\
- (int)(C)))
-
-#define _mm256_permute2f128_ps(X, Y, C) \
- ((__m256) __builtin_ia32_vperm2f128_ps ((__v8sf)(__m256)(X), \
- (__v8sf)(__m256)(Y), \
- (int)(C)))
-
-#define _mm256_permute2f128_si256(X, Y, C) \
- ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),\
- (__v8si)(__m256i)(Y),\
+#define _mm256_permute2f128_pd(X, Y, C) \
+ ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (int)(C)))
+
+#define _mm256_permute2f128_ps(X, Y, C) \
+ ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (int)(C)))
+
+#define _mm256_permute2f128_si256(X, Y, C) \
+ ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), \
(int)(C)))
#endif
+#if 0
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
@@ -835,14 +835,14 @@ _mm256_storeu_si256 (__m256i *__P, __m25
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
- return (__m128d) __builtin_ia32_maskloadpd128 ((__v2df)__P,
- (__v2df)__M);
+ return (__m128d) __builtin_ia32_maskloadpd ((__v2df)__P,
+ (__v2df)__M);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
- __builtin_ia32_maskstorepd128 (__P, (__v2df)__M, (__v2df)__A);
+ __builtin_ia32_maskstorepd (__P, (__v2df)__M, (__v2df)__A);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -861,14 +861,14 @@ _mm256_maskstore_pd (double *__P, __m256
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
- return (__m128) __builtin_ia32_maskloadps128 ((__v4sf)__P,
- (__v4sf)__M);
+ return (__m128) __builtin_ia32_maskloadps ((__v4sf)__P,
+ (__v4sf)__M);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
- __builtin_ia32_maskstoreps128 (__P, (__v4sf)__M, (__v4sf)__A);
+ __builtin_ia32_maskstoreps (__P, (__v4sf)__M, (__v4sf)__A);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 134473)
+++ config/i386/sse.md (working copy)
@@ -80,6 +80,9 @@
;; Mapping of immediate bits for blend instructions
(define_mode_attr blendbits [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
+;; Mapping of immediate bits for vpermil instructions
+(define_mode_attr vpermilbits [(V8SF "255") (V4SF "255") (V4DF "15") (V2DF "3")])
+
;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -8640,3 +8643,50 @@
[(set_attr "type" "sse")
(set_attr "memory" "none")
(set_attr "mode" "OI")])
+
+(define_insn "avx_vpermil<mode>"
+ [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x")
+ (unspec:AVXMODEF2P
+ [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm")
+ (match_operand:SI 2 "const_0_to_<vpermilbits>_operand" "n")]
+ UNSPEC_VPERMIL))]
+ "TARGET_AVX"
+ "vpermilp<avxmodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vpermilvar<mode>3"
+ [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x")
+ (unspec:AVXMODEF2P
+ [(match_operand:AVXMODEF2P 1 "register_operand" "x")
+ (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")]
+ UNSPEC_VPERMIL))]
+ "TARGET_AVX"
+ "vpermilp<avxmodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "sselog")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vpermil2<mode>3"
+ [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x")
+ (unspec:AVXMODEF2P
+ [(match_operand:AVXMODEF2P 1 "register_operand" "x,x")
+ (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "x,xm")
+ (match_operand:AVXMODEF2P 3 "nonimmediate_operand" "xm,x")
+ (match_operand:SI 4 "const_0_to_3_operand" "n,n")]
+ UNSPEC_VPERMIL2))]
+ "TARGET_AVX"
+ "vpermil2p<avxmodesuffixf2c>\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}"
+ [(set_attr "type" "sselog")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vperm2f128<mode>3"
+ [(set (match_operand:AVX256MODE 0 "register_operand" "=x")
+ (unspec:AVX256MODE
+ [(match_operand:AVX256MODE 1 "register_operand" "x")
+ (match_operand:AVX256MODE 2 "nonimmediate_operand" "xm")
+ (match_operand:SI 3 "const_0_to_255_operand" "n")]
+ UNSPEC_VPERMIL2F128))]
+ "TARGET_AVX"
+ "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "sselog")
+ (set_attr "mode" "<avxvecmode>")])
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 134473)
+++ config/i386/i386.c (working copy)
@@ -17897,6 +17897,21 @@ enum ix86_builtins
IX86_BUILTIN_EXTRACTF128SI256,
IX86_BUILTIN_VZEROALL,
IX86_BUILTIN_VZEROUPPER,
+ IX86_BUILTIN_VPERMILVARPD,
+ IX86_BUILTIN_VPERMILVARPS,
+ IX86_BUILTIN_VPERMILVARPD256,
+ IX86_BUILTIN_VPERMILVARPS256,
+ IX86_BUILTIN_VPERMILPD,
+ IX86_BUILTIN_VPERMILPS,
+ IX86_BUILTIN_VPERMILPD256,
+ IX86_BUILTIN_VPERMILPS256,
+ IX86_BUILTIN_VPERMIL2PD,
+ IX86_BUILTIN_VPERMIL2PS,
+ IX86_BUILTIN_VPERMIL2PD256,
+ IX86_BUILTIN_VPERMIL2PS256,
+ IX86_BUILTIN_VPERM2F128PD256,
+ IX86_BUILTIN_VPERM2F128PS256,
+ IX86_BUILTIN_VPERM2F128SI256,
/* TFmode support builtins. */
IX86_BUILTIN_INFQ,
@@ -18247,9 +18262,11 @@ enum sse_builtin_type
V4DF_FTYPE_V4SI,
V4DF_FTYPE_V4SF,
V4SF_FTYPE_V4DF,
+ V8SF_FTYPE_V8SF_INT,
V4SI_FTYPE_V8SI_INT,
V4SF_FTYPE_V8SF_INT,
V2DF_FTYPE_V4DF_INT,
+ V4DF_FTYPE_V4DF_INT,
V4SF_FTYPE_V4SF_INT,
V2DI_FTYPE_V2DI_INT,
V2DF_FTYPE_V2DF_INT,
@@ -18259,13 +18276,18 @@ enum sse_builtin_type
V4SF_FTYPE_V4SF_V4SF_V4SF,
V2DF_FTYPE_V2DF_V2DF_V2DF,
V16QI_FTYPE_V16QI_V16QI_INT,
+ V8SI_FTYPE_V8SI_V8SI_INT,
V8HI_FTYPE_V8HI_V8HI_INT,
V8SF_FTYPE_V8SF_V8SF_INT,
V4SI_FTYPE_V4SI_V4SI_INT,
V4DF_FTYPE_V4DF_V4DF_INT,
V4SF_FTYPE_V4SF_V4SF_INT,
V2DI_FTYPE_V2DI_V2DI_INT,
- V2DF_FTYPE_V2DF_V2DF_INT
+ V2DF_FTYPE_V2DF_V2DF_INT,
+ V8SF_FTYPE_V8SF_V8SF_V8SF_INT,
+ V4DF_FTYPE_V4DF_V4DF_V4DF_INT,
+ V4SF_FTYPE_V4SF_V4SF_V4SF_INT,
+ V2DF_FTYPE_V2DF_V2DF_V2DF_INT
};
/* SSE builtins with variable number of arguments. */
@@ -18326,6 +18348,17 @@ static const struct builtin_description
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF_INT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF_INT },
};
static const struct builtin_description bdesc_2arg[] =
@@ -18638,8 +18671,12 @@ static const struct builtin_description
{ OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, 0 },
{ OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, 0 },
{ OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, 0 },
- { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, 0 },
- { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, 0 },
};
static const struct builtin_description bdesc_1arg[] =
@@ -19511,10 +19548,18 @@ ix86_init_mmx_sse_builtins (void)
= build_function_type_list (V4DF_type_node,
V4DF_type_node, V4DF_type_node,
NULL_TREE);
+ tree v8sf_ftype_v8sf_int
+ = build_function_type_list (V8SF_type_node,
+ V8SF_type_node, integer_type_node,
+ NULL_TREE);
tree v4si_ftype_v8si_int
= build_function_type_list (V4SI_type_node,
V8SI_type_node, integer_type_node,
NULL_TREE);
+ tree v4df_ftype_v4df_int
+ = build_function_type_list (V4DF_type_node,
+ V4DF_type_node, integer_type_node,
+ NULL_TREE);
tree v4sf_ftype_v8sf_int
= build_function_type_list (V4SF_type_node,
V8SF_type_node, integer_type_node,
@@ -19538,11 +19583,36 @@ ix86_init_mmx_sse_builtins (void)
V4DF_type_node, V4DF_type_node,
V4DF_type_node,
NULL_TREE);
+ tree v8si_ftype_v8si_v8si_int
+ = build_function_type_list (V8SI_type_node,
+ V8SI_type_node, V8SI_type_node,
+ integer_type_node,
+ NULL_TREE);
tree v4df_ftype_v4df_v4df_int
= build_function_type_list (V4DF_type_node,
V4DF_type_node, V4DF_type_node,
integer_type_node,
NULL_TREE);
+ tree v8sf_ftype_v8sf_v8sf_v8sf_int
+ = build_function_type_list (V8SF_type_node,
+ V8SF_type_node, V8SF_type_node,
+ V8SF_type_node, integer_type_node,
+ NULL_TREE);
+ tree v4df_ftype_v4df_v4df_v4df_int
+ = build_function_type_list (V4DF_type_node,
+ V4DF_type_node, V4DF_type_node,
+ V4DF_type_node, integer_type_node,
+ NULL_TREE);
+ tree v4sf_ftype_v4sf_v4sf_v4sf_int
+ = build_function_type_list (V4SF_type_node,
+ V4SF_type_node, V4SF_type_node,
+ V4SF_type_node, integer_type_node,
+ NULL_TREE);
+ tree v2df_ftype_v2df_v2df_v2df_int
+ = build_function_type_list (V2DF_type_node,
+ V2DF_type_node, V2DF_type_node,
+ V2DF_type_node, integer_type_node,
+ NULL_TREE);
tree ftype;
@@ -19617,24 +19687,30 @@ ix86_init_mmx_sse_builtins (void)
case V4SF_FTYPE_V4DF:
type = v4sf_ftype_v4df;
break;
+ case V8SF_FTYPE_V8SF_INT:
+ type = v8sf_ftype_v8sf_int;
+ break;
case V4SI_FTYPE_V8SI_INT:
type = v4si_ftype_v8si_int;
break;
- case V4SF_FTYPE_V8SF_INT:
- type = v4sf_ftype_v8sf_int;
- break;
- case V2DF_FTYPE_V4DF_INT:
- type = v2df_ftype_v4df_int;
+ case V4DF_FTYPE_V4DF_INT:
+ type = v4df_ftype_v4df_int;
break;
case V4SF_FTYPE_V4SF_INT:
type = v4sf_ftype_v4sf_int;
break;
+ case V4SF_FTYPE_V8SF_INT:
+ type = v4sf_ftype_v8sf_int;
+ break;
case V2DI_FTYPE_V2DI_INT:
type = v2di_ftype_v2di_int;
break;
case V2DF_FTYPE_V2DF_INT:
type = v2df_ftype_v2df_int;
break;
+ case V2DF_FTYPE_V4DF_INT:
+ type = v2df_ftype_v4df_int;
+ break;
case V16QI_FTYPE_V16QI_V16QI_V16QI:
type = v16qi_ftype_v16qi_v16qi_v16qi;
break;
@@ -19653,6 +19729,9 @@ ix86_init_mmx_sse_builtins (void)
case V16QI_FTYPE_V16QI_V16QI_INT:
type = v16qi_ftype_v16qi_v16qi_int;
break;
+ case V8SI_FTYPE_V8SI_V8SI_INT:
+ type = v8si_ftype_v8si_v8si_int;
+ break;
case V8HI_FTYPE_V8HI_V8HI_INT:
type = v8hi_ftype_v8hi_v8hi_int;
break;
@@ -19674,6 +19753,18 @@ ix86_init_mmx_sse_builtins (void)
case V2DF_FTYPE_V2DF_V2DF_INT:
type = v2df_ftype_v2df_v2df_int;
break;
+ case V8SF_FTYPE_V8SF_V8SF_V8SF_INT:
+ type = v8sf_ftype_v8sf_v8sf_v8sf_int;
+ break;
+ case V4DF_FTYPE_V4DF_V4DF_V4DF_INT:
+ type = v4df_ftype_v4df_v4df_v4df_int;
+ break;
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+ type = v4sf_ftype_v4sf_v4sf_v4sf_int;
+ break;
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+ type = v2df_ftype_v2df_v2df_v2df_int;
+ break;
default:
gcc_unreachable ();
}
@@ -20277,7 +20368,7 @@ ix86_expand_sse_operands_builtin (enum i
{
rtx op;
enum machine_mode mode;
- } args[3];
+ } args[4];
bool last_arg_constant = false;
const struct insn_data *insn_p = &insn_data[icode];
enum machine_mode tmode = insn_p->operand[0].mode;
@@ -20292,7 +20383,9 @@ ix86_expand_sse_operands_builtin (enum i
case V4SF_FTYPE_V4DF:
nargs = 1;
break;
+ case V8SF_FTYPE_V8SF_INT:
case V4SI_FTYPE_V8SI_INT:
+ case V4DF_FTYPE_V4DF_INT:
case V4SF_FTYPE_V8SF_INT:
case V2DF_FTYPE_V4DF_INT:
case V4SF_FTYPE_V4SF_INT:
@@ -20310,6 +20403,7 @@ ix86_expand_sse_operands_builtin (enum i
break;
case V16QI_FTYPE_V16QI_V16QI_INT:
case V8HI_FTYPE_V8HI_V8HI_INT:
+ case V8SI_FTYPE_V8SI_V8SI_INT:
case V8SF_FTYPE_V8SF_V8SF_INT:
case V4SI_FTYPE_V4SI_V4SI_INT:
case V4DF_FTYPE_V4DF_V4DF_INT:
@@ -20319,6 +20413,13 @@ ix86_expand_sse_operands_builtin (enum i
nargs = 3;
last_arg_constant = true;
break;
+ case V8SF_FTYPE_V8SF_V8SF_V8SF_INT:
+ case V4DF_FTYPE_V4DF_V4DF_V4DF_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+ nargs = 4;
+ last_arg_constant = true;
+ break;
default:
gcc_unreachable ();
}
@@ -20349,10 +20450,16 @@ ix86_expand_sse_operands_builtin (enum i
case CODE_FOR_sse4_1_roundss:
case CODE_FOR_sse4_1_blendps:
case CODE_FOR_avx_blendpd256:
+ case CODE_FOR_avx_vpermilv4df:
error ("the last argument must be a 4-bit immediate");
return const0_rtx;
case CODE_FOR_sse4_1_blendpd:
+ case CODE_FOR_avx_vpermilv2df:
+ case CODE_FOR_avx_vpermil2v2df3:
+ case CODE_FOR_avx_vpermil2v4sf3:
+ case CODE_FOR_avx_vpermil2v4df3:
+ case CODE_FOR_avx_vpermil2v8sf3:
error ("the last argument must be a 2-bit immediate");
return const0_rtx;
@@ -20366,8 +20473,8 @@ ix86_expand_sse_operands_builtin (enum i
case CODE_FOR_avx_cmpssv4sf3:
case CODE_FOR_avx_cmppdv2df3:
case CODE_FOR_avx_cmppsv4sf3:
- case CODE_FOR_avx_cmppsv8sf3:
case CODE_FOR_avx_cmppdv4df3:
+ case CODE_FOR_avx_cmppsv8sf3:
error ("the last argument must be a 5-bit immediate");
return const0_rtx;
@@ -20409,6 +20516,10 @@ ix86_expand_sse_operands_builtin (enum i
pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
args[2].op);
break;
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ args[2].op, args[3].op);
+ break;
default:
gcc_unreachable ();
}