This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Add vpermil2p{s,d} as part of XOP for upcoming AMD Orochi processor
- From: Sebastian Pop <sebpop at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Cc: Uros Bizjak <ubizjak at gmail dot com>, Richard Henderson <rth at redhat dot com>, Jan Hubicka <jh at suse dot cz>
- Date: Sat, 13 Feb 2010 05:53:33 -0600
- Subject: [PATCH] Add vpermil2p{s,d} as part of XOP for upcoming AMD Orochi processor
Hi,
this patch adds back the support for the vpermil2p{s,d} insns: these
were initially part of AVX, they were reverted from GCC by HJ Lu,
and finally these insns appeared in AMD's manual 6 as part of XOP:
http://support.amd.com/us/Processor_TechDocs/43479.pdf
The attached patch passed bootstrap and test on amd64-linux.
Ok for trunk?
Thanks,
Sebastian Pop
--
AMD / Open Source Compiler Engineering / GNU Tools
From e9a3cce8f23a5c21d0c15c195f2fc832c2e455ca Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Sat, 6 Feb 2010 10:12:52 -0600
Subject: [PATCH] Add support for vpermil2p* in XOP.
2010-02-13 Sebastian Pop <sebastian.pop@amd.com>
* config/i386/i386-builtin-types.def
(V2DF_FTYPE_V2DF_V2DF_V2DI_INT): Declared.
(V4DF_FTYPE_V4DF_V4DF_V4DI_INT): Declared.
(V4SF_FTYPE_V4SF_V4SF_V4SI_INT): Declared.
(V8SF_FTYPE_V8SF_V8SF_V8SI_INT): Declared.
* config/i386/i386.c (enum ix86_builtins): Add IX86_BUILTIN_VPERMIL2PD,
IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256, and
IX86_BUILTIN_VPERMIL2PS256.
(MULTI_ARG_4_DF2_DI_I): Defined.
(MULTI_ARG_4_DF2_DI_I1): Defined.
(MULTI_ARG_4_SF2_SI_I): Defined.
(MULTI_ARG_4_SF2_SI_I1): Defined.
(bdesc_multi_arg): Add __builtin_ia32_vpermil2pd,
__builtin_ia32_vpermil2ps, __builtin_ia32_vpermil2pd256, and
__builtin_ia32_vpermil2ps256.
(ix86_expand_multi_arg_builtin): Handle MULTI_ARG_4_DF2_DI_I,
MULTI_ARG_4_DF2_DI_I1, MULTI_ARG_4_SF2_SI_I, and
MULTI_ARG_4_SF2_SI_I1. Handle builtins with 4 arguments.
(ix86_expand_args_builtin): Handle MULTI_ARG_4_DF2_DI_I,
MULTI_ARG_4_DF2_DI_I1, MULTI_ARG_4_SF2_SI_I, and
MULTI_ARG_4_SF2_SI_I1. Handle CODE_FOR_xop_vpermil2v2df3,
CODE_FOR_xop_vpermil2v4sf3, CODE_FOR_xop_vpermil2v4df3, and
CODE_FOR_xop_vpermil2v8sf3.
* config/i386/i386.md (UNSPEC_VPERMIL2): Declared.
* config/i386/sse.md (xop_vpermil2<mode>3): New insn pattern.
* config/i386/xopintrin.h (_mm_permute2_pd): New.
(_mm256_permute2_pd): New.
(_mm_permute2_ps): New.
(_mm256_permute2_ps): New.
* gcc.target/i386/sse-14.c: Add tests for _mm_permute2_pd,
_mm256_permute2_pd, _mm_permute2_ps, and _mm256_permute2_ps.
* gcc.target/i386/xop-vpermil2pd-1.c: New.
* gcc.target/i386/xop-vpermil2pd-256-1.c: New.
* gcc.target/i386/xop-vpermil2ps-1.c: New.
* gcc.target/i386/xop-vpermil2ps-256-1.c: New.
---
gcc/config/i386/i386-builtin-types.def | 4 +
gcc/config/i386/i386.c | 36 +++++++++++
gcc/config/i386/i386.md | 11 ++--
gcc/config/i386/sse.md | 14 ++++
gcc/config/i386/xopintrin.h | 64 ++++++++++++++++++++
gcc/testsuite/gcc.target/i386/sse-14.c | 4 +
gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c | 55 +++++++++++++++++
.../gcc.target/i386/xop-vpermil2pd-256-1.c | 56 +++++++++++++++++
gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c | 62 +++++++++++++++++++
.../gcc.target/i386/xop-vpermil2ps-256-1.c | 62 +++++++++++++++++++
10 files changed, 363 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 5fec964..10310e2 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -311,6 +311,7 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, V16QI)
DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI, INT)
DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT)
DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, DI, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, UINT, UINT)
DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, INT)
@@ -319,11 +320,13 @@ DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, V32QI)
DEF_FUNCTION_TYPE (V4DF, V4DF, V2DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DF)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT)
DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI)
DEF_FUNCTION_TYPE (V4HI, V4HI, HI, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, FLOAT, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V2DI)
@@ -335,6 +338,7 @@ DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V8HI)
DEF_FUNCTION_TYPE (V8SF, V8SF, V4SF, INT)
DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, INT)
DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT)
DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT)
DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT)
DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5bc4a64..ac5ee3d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20958,6 +20958,10 @@ enum ix86_builtins
IX86_BUILTIN_VPERMILPS,
IX86_BUILTIN_VPERMILPD256,
IX86_BUILTIN_VPERMILPS256,
+ IX86_BUILTIN_VPERMIL2PD,
+ IX86_BUILTIN_VPERMIL2PS,
+ IX86_BUILTIN_VPERMIL2PD256,
+ IX86_BUILTIN_VPERMIL2PS256,
IX86_BUILTIN_VPERM2F128PD256,
IX86_BUILTIN_VPERM2F128PS256,
IX86_BUILTIN_VPERM2F128SI256,
@@ -22147,6 +22151,10 @@ static const struct builtin_description bdesc_args[] =
};
/* FMA4 and XOP. */
+#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
+#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
+#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
+#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
@@ -22389,6 +22397,11 @@ static const struct builtin_description bdesc_multi_arg[] =
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
+
};
/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
@@ -22769,6 +22782,14 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
switch (m_type)
{
+ case MULTI_ARG_4_DF2_DI_I:
+ case MULTI_ARG_4_DF2_DI_I1:
+ case MULTI_ARG_4_SF2_SI_I:
+ case MULTI_ARG_4_SF2_SI_I1:
+ nargs = 4;
+ last_arg_constant = true;
+ break;
+
case MULTI_ARG_3_SF:
case MULTI_ARG_3_DF:
case MULTI_ARG_3_SF2:
@@ -22912,6 +22933,10 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
break;
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+ break;
+
default:
gcc_unreachable ();
}
@@ -23530,6 +23555,13 @@ ix86_expand_args_builtin (const struct builtin_description *d,
nargs = 3;
nargs_constant = 2;
break;
+ case MULTI_ARG_4_DF2_DI_I:
+ case MULTI_ARG_4_DF2_DI_I1:
+ case MULTI_ARG_4_SF2_SI_I:
+ case MULTI_ARG_4_SF2_SI_I1:
+ nargs = 4;
+ nargs_constant = 1;
+ break;
case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
nargs = 4;
nargs_constant = 2;
@@ -23599,6 +23631,10 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case CODE_FOR_sse4_1_blendpd:
case CODE_FOR_avx_vpermilv2df:
+ case CODE_FOR_xop_vpermil2v2df3:
+ case CODE_FOR_xop_vpermil2v4sf3:
+ case CODE_FOR_xop_vpermil2v4df3:
+ case CODE_FOR_xop_vpermil2v8sf3:
error ("the last argument must be a 2-bit immediate");
return const0_rtx;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b4a8a83..924433f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -219,11 +219,12 @@
; For AVX support
(UNSPEC_PCMP 166)
(UNSPEC_VPERMIL 167)
- (UNSPEC_VPERMIL2F128 168)
- (UNSPEC_MASKLOAD 169)
- (UNSPEC_MASKSTORE 170)
- (UNSPEC_CAST 171)
- (UNSPEC_VTESTP 172)
+ (UNSPEC_VPERMIL2 168)
+ (UNSPEC_VPERMIL2F128 169)
+ (UNSPEC_MASKLOAD 170)
+ (UNSPEC_MASKSTORE 171)
+ (UNSPEC_CAST 172)
+ (UNSPEC_VTESTP 173)
])
(define_constants
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 50b1b14..1056716 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11539,6 +11539,20 @@
(set_attr "length_immediate" "1")
(set_attr "mode" "TI")])
+(define_insn "xop_vpermil2<mode>3"
+ [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x")
+ (unspec:AVXMODEF2P
+ [(match_operand:AVXMODEF2P 1 "register_operand" "x")
+ (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "%x")
+ (match_operand:<avxpermvecmode> 3 "nonimmediate_operand" "xm")
+ (match_operand:SI 4 "const_0_to_3_operand" "n")]
+ UNSPEC_VPERMIL2))]
+ "TARGET_XOP"
+ "vpermil2p<xopmodesuffixf2c>\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}"
+ [(set_attr "type" "sse4arg")
+ (set_attr "length_immediate" "1")
+ (set_attr "mode" "<MODE>")])
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define_insn "*avx_aesenc"
[(set (match_operand:V2DI 0 "register_operand" "=x")
diff --git a/gcc/config/i386/xopintrin.h b/gcc/config/i386/xopintrin.h
index 803417a..30ce72d 100644
--- a/gcc/config/i386/xopintrin.h
+++ b/gcc/config/i386/xopintrin.h
@@ -766,6 +766,70 @@ _mm256_frcz_pd (__m256d __A)
return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
}
+/* PERMIL2 */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
+{
+ return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+ (__v2df)__Y,
+ (__v2di)__C,
+ __I);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
+{
+ return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
+ (__v4df)__Y,
+ (__v4di)__C,
+ __I);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
+{
+ return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+ (__v4sf)__Y,
+ (__v4si)__C,
+ __I);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
+{
+ return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ (__v8si)__C,
+ __I);
+}
+#else
+#define _mm_permute2_pd(X, Y, C, I) \
+ ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128d)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I) \
+ ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256d)(C), \
+ (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I) \
+ ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I) \
+ ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256)(C), \
+ (int)(I)))
+#endif /* __OPTIMIZE__ */
+
#endif /* __XOP__ */
#endif /* _XOPMMINTRIN_H_INCLUDED */
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index c3f72e4..96a3f21 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -162,6 +162,10 @@ test_1 ( _mm_roti_epi8, __m128i, __m128i, 1)
test_1 ( _mm_roti_epi16, __m128i, __m128i, 1)
test_1 ( _mm_roti_epi32, __m128i, __m128i, 1)
test_1 ( _mm_roti_epi64, __m128i, __m128i, 1)
+test_3 (_mm_permute2_pd, __m128d, __m128d, __m128d, __m128d, 1)
+test_3 (_mm256_permute2_pd, __m256d, __m256d, __m256d, __m256d, 1)
+test_3 (_mm_permute2_ps, __m128, __m128, __m128, __m128, 1)
+test_3 (_mm256_permute2_ps, __m256, __m256, __m256, __m256, 1)
/* lwpintrin.h */
test_2 ( __lwpval32, void, unsigned int, unsigned int, 1)
diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c
new file mode 100644
index 0000000..c7f0594
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c
@@ -0,0 +1,55 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O2 -mxop" } */
+
+#include "xop-check.h"
+
+#ifndef ZERO_MATCH
+#define ZERO_MATCH 2
+#endif
+
+static double
+select2dp(double *src1, double *src2, long long sel)
+{
+ double tmp = 0.0;
+
+ if ((sel & 0x3) == 0) tmp = src1[0];
+ if ((sel & 0x3) == 1) tmp = src1[1];
+ if ((sel & 0x3) == 2) tmp = src2[0];
+ if ((sel & 0x3) == 3) tmp = src2[1];
+
+ return tmp;
+}
+
+static double
+sel_and_condzerodp(double *src1, double *src2, long long sel, int imm8)
+{
+ double tmp;
+
+ tmp = select2dp(src1, src2, sel & 0x3);
+
+ if (((imm8 & 0x3) == 2) && ((sel & 0x4) == 0x4)) tmp = 0;
+ if (((imm8 & 0x3) == 3) && ((sel & 0x4) == 0x0)) tmp = 0;
+
+ return tmp;
+}
+
+void static
+xop_test ()
+{
+ union128d s1, s2, u;
+ union128i_q s3;
+ double e[2];
+
+ s1.x = _mm_set_pd (1, 2);
+ s2.x = _mm_set_pd (3, 4);
+ s3.x = _mm_set_epi64x (1, 2);
+ u.x = _mm_permute2_pd(s1.x, s2.x, s3.x, ZERO_MATCH);
+
+ e[0] = sel_and_condzerodp (s1.a, s2.a, (s3.a[0] & 0xe)>>1, ZERO_MATCH);
+ e[1] = sel_and_condzerodp (s1.a, s2.a, (s3.a[1] & 0xe)>>1, ZERO_MATCH);
+
+ if (check_union128d (u, e))
+ abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c
new file mode 100644
index 0000000..90012db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c
@@ -0,0 +1,56 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O2 -mxop" } */
+
+#include "xop-check.h"
+
+#ifndef ZERO_MATCH
+#define ZERO_MATCH 1
+#endif
+
+static double
+select2dp(double *src1, double *src2, long long sel)
+{
+ double tmp = 3.414;
+
+ if ((sel & 0x3) == 0) tmp = src1[0];
+ if ((sel & 0x3) == 1) tmp = src1[1];
+ if ((sel & 0x3) == 2) tmp = src2[0];
+ if ((sel & 0x3) == 3) tmp = src2[1];
+
+ return tmp;
+}
+
+static double
+sel_and_condzerodp(double *src1, double *src2, long long sel, int imm8)
+{
+ double tmp;
+
+ tmp = select2dp(src1, src2, sel);
+
+ if (((imm8 & 0x3) == 2) && ((sel & 0x4) == 0x4)) tmp = 0;
+ if (((imm8 & 0x3) == 3) && ((sel & 0x4) == 0x0)) tmp = 0;
+
+ return tmp;
+}
+
+void static
+xop_test ()
+{
+ union256d u, s1, s2;
+ double e[4] = {0.0};
+ union256i_q s3;
+
+ s1.x = _mm256_set_pd (1, 2, 3, 4);
+ s2.x = _mm256_set_pd (5, 6, 7, 8);
+ s3.x = _mm256_set_epi64x (0, 1, 2, 3);
+ u.x = _mm256_permute2_pd(s1.x, s2.x, s3.x, ZERO_MATCH);
+
+ e[0] = sel_and_condzerodp (s1.a, s2.a, (s3.a[0] & 0xe)>>1, ZERO_MATCH);
+ e[1] = sel_and_condzerodp (s1.a, s2.a, (s3.a[1] & 0xe)>>1, ZERO_MATCH);
+ e[2] = sel_and_condzerodp (s1.a + 2, s2.a + 2, (s3.a[2] & 0xe)>>1, ZERO_MATCH);
+ e[3] = sel_and_condzerodp (s1.a + 2, s2.a + 2, (s3.a[3] & 0xe)>>1, ZERO_MATCH);
+
+ if (check_union256d (u, e))
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c
new file mode 100644
index 0000000..be47564
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O2 -mxop" } */
+
+#include "xop-check.h"
+
+#ifndef ZERO_MATCH
+#define ZERO_MATCH 1
+#endif
+
+static float
+select2sp(float *src1, float *src2, int sel)
+{
+ float tmp;
+
+ if ((sel & 0x7) == 0) tmp = src1[0];
+ if ((sel & 0x7) == 1) tmp = src1[1];
+ if ((sel & 0x7) == 2) tmp = src1[2];
+ if ((sel & 0x7) == 3) tmp = src1[3];
+ if ((sel & 0x7) == 4) tmp = src2[0];
+ if ((sel & 0x7) == 5) tmp = src2[1];
+ if ((sel & 0x7) == 6) tmp = src2[2];
+ if ((sel & 0x7) == 7) tmp = src2[3];
+
+ return tmp;
+}
+static float
+sel_and_condzerosp(float *src1, float *src2, int sel, int imm8)
+{
+ float tmp;
+
+ tmp = select2sp(src1, src2, sel & 0x7);
+
+ if (((imm8 & 0x3) == 2) && ((sel & 0x8) == 0x8)) tmp = 0;
+ if (((imm8 & 0x3) == 3) && ((sel & 0x8) == 0x0)) tmp = 0;
+
+ return tmp;
+}
+
+void static
+xop_test ()
+{
+ int i;
+ union128 source1, source2, u;
+ union128i_d source3;
+ float s1[4] = {1, 2, 3, 4};
+ float s2[4] = {5, 6, 7, 8};
+ int s3[4] = {0, 1, 0, 1};
+ float e[4];
+
+ source1.x = _mm_loadu_ps(s1);
+ source2.x = _mm_loadu_ps(s2);
+ source3.x = _mm_loadu_si128((__m128i*) s3);
+ u.x = _mm_permute2_ps(source1.x, source2.x, source3.x, ZERO_MATCH);
+
+ for (i = 0; i < 4; ++i) {
+ e[i] = sel_and_condzerosp(&s1[i & 0x4], &s2[i & 0x4], s3[i] & 0xf, ZERO_MATCH & 0x3);
+ }
+
+ if (check_union128 (u, e))
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c
new file mode 100644
index 0000000..4a5fcc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O2 -mxop" } */
+
+#include "xop-check.h"
+
+#ifndef ZERO_MATCH
+#define ZERO_MATCH 3
+#endif
+
+static float
+select2sp(float *src1, float *src2, int sel)
+{
+ float tmp;
+
+ if ((sel & 0x7) == 0) tmp = src1[0];
+ if ((sel & 0x7) == 1) tmp = src1[1];
+ if ((sel & 0x7) == 2) tmp = src1[2];
+ if ((sel & 0x7) == 3) tmp = src1[3];
+ if ((sel & 0x7) == 4) tmp = src2[0];
+ if ((sel & 0x7) == 5) tmp = src2[1];
+ if ((sel & 0x7) == 6) tmp = src2[2];
+ if ((sel & 0x7) == 7) tmp = src2[3];
+
+ return tmp;
+}
+static float
+sel_and_condzerosp(float *src1, float *src2, int sel, int imm8)
+{
+ float tmp;
+
+ tmp = select2sp(src1, src2, sel & 0x7);
+
+ if (((imm8 & 0x3) == 2) && ((sel & 0x8) == 0x8)) tmp = 0;
+ if (((imm8 & 0x3) == 3) && ((sel & 0x8) == 0x0)) tmp = 0;
+
+ return tmp;
+}
+
+void static
+xop_test ()
+{
+ int i;
+ union256 source1, source2, u;
+ union256i_d source3;
+ float s1[8]={1, 2, 3, 4, 5, 6, 7, 8};
+ float s2[8]={9, 10, 11, 12, 13, 14, 15, 16};
+ int s3[8]={11, 2, 3, 15, 5, 12, 7, 8};
+ float e[8];
+
+ source1.x = _mm256_loadu_ps(s1);
+ source2.x = _mm256_loadu_ps(s2);
+ source3.x = _mm256_loadu_si256((__m256i*) s3);
+ u.x = _mm256_permute2_ps(source1.x, source2.x, source3.x, ZERO_MATCH);
+
+ for (i = 0; i < 8; ++i) {
+ e[i] = sel_and_condzerosp(&s1[i & 0x4], &s2[i & 0x4], s3[i] & 0xf, ZERO_MATCH & 0x3);
+ }
+
+ if (check_union256(u, e))
+ abort ();
+}
--
1.6.3.3