This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [i386] Replace builtins with vector extensions
- From: Marc Glisse <marc dot glisse at inria dot fr>
- To: Kirill Yukhin <kirill dot yukhin at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>, Uros Bizjak <ubizjak at gmail dot com>
- Date: Sat, 26 Jul 2014 19:34:25 +0200 (CEST)
- Subject: Re: [i386] Replace builtins with vector extensions
- Authentication-results: sourceware.org; auth=none
- References: <alpine dot DEB dot 2 dot 02 dot 1404112137530 dot 19663 at stedding dot saclay dot inria dot fr> <alpine dot DEB dot 2 dot 10 dot 1404281322180 dot 3620 at laptop-mg dot saclay dot inria dot fr> <alpine dot DEB dot 2 dot 10 dot 1405171529460 dot 3642 at laptop-mg dot saclay dot inria dot fr> <alpine dot DEB dot 2 dot 10 dot 1406281230560 dot 9234 at laptop-mg dot saclay dot inria dot fr> <20140703101605 dot GA12583 at msticlxl57 dot ims dot intel dot com> <alpine dot DEB dot 2 dot 10 dot 1407031613450 dot 2526 at laptop-mg dot saclay dot inria dot fr> <20140708111402 dot GB14139 at msticlxl57 dot ims dot intel dot com>
On Tue, 8 Jul 2014, Kirill Yukhin wrote:
Hello Marc.
On 04 Jul 21:11, Marc Glisse wrote:
On Thu, 3 Jul 2014, Kirill Yukhin wrote:
like combining 2 shuffles unless the result is the identity. And
expanding shuffles that can be done in a single instruction works
well.
But I am happy not doing them yet. To be very specific, could you
list which intrinsics you would like to remove from the posted
patch?
I am not a x86 maintainer, however while such a replacements produce
correct semantics and probably enable optimizations, I support your patch.
Probably you could try such your approach on AVX2, AVX-512 whose intrinsics
are well covered by tests?
I did some AVX and AVX512F intrinsics, and it still passes the testsuite
(on my old pre-AVX x86_64-linux-gnu).
2014-07-26 Marc Glisse <marc.glisse@inria.fr>
* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
instead of builtins.
* config/i386/avxintrin.h (_mm256_add_pd, _mm256_add_ps,
_mm256_div_pd, _mm256_div_ps, _mm256_mul_pd, _mm256_mul_ps,
_mm256_sub_pd, _mm256_sub_ps): Likewise.
* config/i386/avx512fintrin.h (_mm512_add_pd, _mm512_add_ps,
_mm512_sub_pd, _mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps,
_mm512_div_pd, _mm512_div_ps): Likewise.
* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
_mm_loadh_pd, _mm_loadl_pd): Likewise.
(_mm_sqrt_sd): Fix comment.
--
Marc Glisse
Index: gcc/config/i386/avx512fintrin.h
===================================================================
--- gcc/config/i386/avx512fintrin.h (revision 213083)
+++ gcc/config/i386/avx512fintrin.h (working copy)
@@ -10598,26 +10598,21 @@ _mm512_maskz_sqrt_ps (__mmask16 __U, __m
(__v16sf)
_mm512_setzero_ps (),
(__mmask16) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A + __B;
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
(__v8df) __B,
(__v8df) __W,
(__mmask8) __U,
@@ -10633,26 +10628,21 @@ _mm512_maskz_add_pd (__mmask8 __U, __m51
(__v8df)
_mm512_setzero_pd (),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A + __B;
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
(__v16sf) __B,
(__v16sf) __W,
(__mmask16) __U,
@@ -10668,26 +10658,21 @@ _mm512_maskz_add_ps (__mmask16 __U, __m5
(__v16sf)
_mm512_setzero_ps (),
(__mmask16) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A - __B;
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
(__v8df) __B,
(__v8df) __W,
(__mmask8) __U,
@@ -10703,26 +10688,21 @@ _mm512_maskz_sub_pd (__mmask8 __U, __m51
(__v8df)
_mm512_setzero_pd (),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A - __B;
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
(__v16sf) __B,
(__v16sf) __W,
(__mmask16) __U,
@@ -10738,26 +10718,21 @@ _mm512_maskz_sub_ps (__mmask16 __U, __m5
(__v16sf)
_mm512_setzero_ps (),
(__mmask16) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A * __B;
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
(__v8df) __B,
(__v8df) __W,
(__mmask8) __U,
@@ -10773,26 +10748,21 @@ _mm512_maskz_mul_pd (__mmask8 __U, __m51
(__v8df)
_mm512_setzero_pd (),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A * __B;
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
(__v16sf) __B,
(__v16sf) __W,
(__mmask16) __U,
@@ -10808,26 +10778,21 @@ _mm512_maskz_mul_ps (__mmask16 __U, __m5
(__v16sf)
_mm512_setzero_ps (),
(__mmask16) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_pd (__m512d __M, __m512d __V)
{
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
- (__v8df) __V,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __M / __V;
}
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
{
return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
(__v8df) __V,
(__v8df) __W,
(__mmask8) __U,
@@ -10843,26 +10808,21 @@ _mm512_maskz_div_pd (__mmask8 __U, __m51
(__v8df)
_mm512_setzero_pd (),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __A / __B;
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
(__v16sf) __B,
(__v16sf) __W,
(__mmask16) __U,
Index: gcc/config/i386/avxintrin.h
===================================================================
--- gcc/config/i386/avxintrin.h (revision 213083)
+++ gcc/config/i386/avxintrin.h (working copy)
@@ -117,27 +117,27 @@ typedef double __m256d __attribute__ ((_
/* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling) */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling) */
#define _CMP_TRUE_US 0x1f
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+ return __A + __B;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+ return __A + __B;
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
@@ -211,27 +211,27 @@ extern __inline __m256 __attribute__((__
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
(__v8sf)__Y,
(__v8sf)__M);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+ return __A / __B;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+ return __A / __B;
}
/* Dot product instructions with mask-defined summing and zeroing parts
of result. */
#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
@@ -288,27 +288,27 @@ _mm256_min_pd (__m256d __A, __m256d __B)
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+ return __A * __B;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+ return __A * __B;
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
@@ -336,27 +336,27 @@ _mm256_shuffle_ps (__m256 __A, __m256 __
(__v4df)(__m256d)(B), (int)(N)))
#define _mm256_shuffle_ps(A, B, N) \
((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(N)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+ return __A - __B;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+ return __A - __B;
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 213083)
+++ gcc/config/i386/emmintrin.h (working copy)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
__builtin_ia32_storeupd (__P, __A);
}
/* Stores the lower DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
- *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+ *__P = __A[0];
}
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
- return __builtin_ia32_vec_ext_v2df (__A, 0);
+ return __A[0];
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
_mm_store_sd (__P, __A);
}
/* Stores the upper DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
- *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+ *__P = __A[1];
}
/* Store the lower DPFP value across two words.
The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
_mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
_mm_cvtsi128_si32 (__m128i __A)
{
return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}
#ifdef __x86_64__
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
- return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+ return __A[0];
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
- return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+ return __A[0];
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+ return __A + __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+ return __A - __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+ return __A * __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+ return __A / __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}
-/* Return pair {sqrt (A[0), B[1]}. */
+/* Return pair {sqrt (B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
__v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
__builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
- *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+ *(long long *)__P = __B[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i __B)
{
- return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+ return (__m64) __B[0];
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
return _mm_set_epi64 ((__m64)0LL, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
@@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
- return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+ return __extension__ (__m128d){ __A[0], __B[0] };
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
- return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+ return __extension__ (__m128d){ __B[0], __A[1] };
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
return __builtin_ia32_movmskpd ((__v2df)__A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h (revision 213083)
+++ gcc/config/i386/xmmintrin.h (working copy)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
_mm_max_ss (__m128 __A, __m128 __B)
{
return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}
/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+ return __A + __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+ return __A - __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+ return __A * __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+ return __A / __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
/* Stores the lower SPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
- *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+ *__P = __A[0];
}
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
- return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+ return __A[0];
}
/* Store four SPFP values. The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
*(__v4sf *)__P = (__v4sf)__A;
}
/* Store four SPFP values. The address need not be 16-byte aligned. */