This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: PATCH: Add SSE4.1 support
- From: "H. J. Lu" <hjl at lucon dot org>
- To: Uros Bizjak <ubizjak at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Henderson <rth at redhat dot com>, Jan Hubicka <jh at suse dot cz>
- Date: Tue, 22 May 2007 07:07:10 -0700
- Subject: Re: PATCH: Add SSE4.1 support
- References: <5787cf470705220152w79640c8bt5d9987da300bdf1a@mail.gmail.com>
On Tue, May 22, 2007 at 10:52:11AM +0200, Uros Bizjak wrote:
> Hello!
>
> >This update adds prefix_extra to SSE4.1 instructions for longer
> >opcodes. I also removed ix86_expand_sse4_unpack since a new approach
> >is needed for the vectorizer to fully support SSE4.1.
>
> >2007-04-24 H.J. Lu <hongjiu.lu@intel.com>
> > Richard Henderson <rth@redhat.com>
> ...
> > * config/i386/smmintrin.h: New. The SSE4.1 intrinsic header
> > file.
>
> In order to test this header for compile-time failures, inlined
> versions for all #defined functions should be added (guarded with
> #ifdef __OPTIMIZE__), as was recently changed in SVN mainline.
I am enclosing an incremental patch here. I have run my SSE4.1 tests
against the new header at -O0, -O1, -O2 on both Linux/ia32 and
Linux/Intel64.
>
> Following that change, please update following files in the testsuite:
>
> gcc.target/i386/sse-12.c
> gcc.target/i386/sse-13.c
> gcc.target/i386/sse-14.c
> g++.dg/other/i386-2.C
I can't add them since not all assemblers support SSE4. I can add
some new SSE4 tests after "proc check_effective_target_sse4" is added.
>
> This will test your header as much as possible for optimized and
> unoptimized builds.
>
> Considering that this is mostly new functionality (already reviewed by
> rth), that it will be thoroughly tested by provided tests [1] and that
> no other i386 maintainer raised any issue with latest version of
> patch, it is OK for mainline.
>
> [1] http://gcc.gnu.org/ml/gcc-patches/2007-05/msg01361.html
I will check in the combined patch shortly.
Thanks.
H.J.
----
--- gcc/config/i386/smmintrin.h.sni-macro 2007-05-22 06:29:54.000000000 -0700
+++ gcc/config/i386/smmintrin.h 2007-05-22 06:50:52.000000000 -0700
@@ -66,8 +66,18 @@
/* Integer blend instructions - select data from 2 sources using
constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+ (__v8hi)__Y,
+ __M);
+}
+#else
#define _mm_blend_epi16(X, Y, M) \
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(X), (__v8hi)(Y), (M)))
+#endif
static __inline __m128i __attribute__((__always_inline__))
_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
@@ -80,8 +90,18 @@ _mm_blendv_epi8 (__m128i __X, __m128i __
/* Single precision floating point blend instructions - select data
from 2 sources using constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+#else
#define _mm_blend_ps(X, Y, M) \
((__m128) __builtin_ia32_blendps ((__v4sf)(X), (__v4sf)(Y), (M)))
+#endif
static __inline __m128 __attribute__((__always_inline__))
_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
@@ -94,8 +114,18 @@ _mm_blendv_ps (__m128 __X, __m128 __Y, _
/* Double precision floating point blend instructions - select data
from 2 sources using constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
#define _mm_blend_pd(X, Y, M) \
((__m128d) __builtin_ia32_blendpd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
static __inline __m128d __attribute__((__always_inline__))
_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
@@ -108,11 +138,29 @@ _mm_blendv_pd (__m128d __X, __m128d __Y,
/* Dot product instructions with mask-defined summing and zeroing parts
of result. */
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
#define _mm_dp_ps(X, Y, M) \
((__m128) __builtin_ia32_dpps ((__v4sf)(X), (__v4sf)(Y), (M)))
#define _mm_dp_pd(X, Y, M) \
((__m128d) __builtin_ia32_dppd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
/* Packed integer 64-bit comparison, zeroing or filling with ones
corresponding parts of result. */
@@ -224,14 +272,35 @@ _mm_testnzc_si128 (__m128i __M, __m128i
element selected by index N. The bits [7-6] of N define S
index, the bits [5-4] define D index, and bits [3-0] define
zeroing mask for D. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+ return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+ (__v4sf)__S,
+ __N);
+}
+#else
#define _mm_insert_ps(D, S, N) \
((__m128) __builtin_ia32_insertps128 ((__v4sf)(D), (__v4sf)(S), (N)))
+#endif
/* Helper macro to create the N value for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
/* Extract binary representation of single precision float from packed
single precision array element of X selected by index N. */
+
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+ union { int i; float f; } __tmp;
+ __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+ return __tmp.i;
+}
+#else
#define _mm_extract_ps(X, N) \
(__extension__ \
({ \
@@ -240,6 +309,7 @@ _mm_testnzc_si128 (__m128i __M, __m128i
__tmp.i; \
}) \
)
+#endif
/* Extract binary representation of single precision float into
D from packed single precision array element of S selected
@@ -256,6 +326,30 @@ _mm_testnzc_si128 (__m128i __M, __m128i
/* Insert integer, S, into packed integer array element of D
selected by index N. */
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+ __S, __N);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+ __S, __N);
+}
+
+#ifdef __x86_64__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+ __S, __N);
+}
+#endif
+#else
#define _mm_insert_epi8(D, S, N) \
((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(D), (S), (N)))
@@ -266,10 +360,32 @@ _mm_testnzc_si128 (__m128i __M, __m128i
#define _mm_insert_epi64(D, S, N) \
((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(D), (S), (N)))
#endif
+#endif
/* Extract integer from packed integer array element of X selected by
index N. */
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+static __inline long long __attribute__((__always_inline__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
#define _mm_extract_epi8(X, N) \
__builtin_ia32_vec_ext_v16qi ((__v16qi) X, (N))
#define _mm_extract_epi32(X, N) \
@@ -279,6 +395,7 @@ _mm_testnzc_si128 (__m128i __M, __m128i
#define _mm_extract_epi64(X, N) \
((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(X), (N)))
#endif
+#endif
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
@@ -289,18 +406,52 @@ _mm_minpos_epu16 (__m128i __X)
}
/* Packed/scalar double precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_sd(__m128d __D, __m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+ (__v2df)__V,
+ __M);
+}
+#else
#define _mm_round_pd(V, M) \
((__m128d) __builtin_ia32_roundpd ((__v2df)(V), (M)))
#define _mm_round_sd(D, V, M) \
((__m128d) __builtin_ia32_roundsd ((__v2df)(D), (__v2df)(V), (M)))
+#endif
/* Packed/scalar single precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+ (__v4sf)__V,
+ __M);
+}
+#else
#define _mm_round_ps(V, M) \
((__m128) __builtin_ia32_roundps ((__v4sf)(V), (M)))
#define _mm_round_ss(D, V, M) \
((__m128) __builtin_ia32_roundss ((__v4sf)(D), (__v4sf)(V), (M)))
+#endif
/* Macros for ceil/floor intrinsics. */
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
@@ -402,8 +553,18 @@ _mm_packus_epi32 (__m128i __X, __m128i _
/* Sum absolute 8-bit integer difference of adjacent groups of 4
byte integers in the first 2 operands. Starting offsets within
operands are determined by the 3rd mask operand. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+ (__v16qi)__Y, __M);
+}
+#else
#define _mm_mpsadbw_epu8(X, Y, M) \
((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(X), (__v16qi)(Y), (M)))
+#endif
/* Load double quadword using non-temporal aligned hint. */
static __inline __m128i __attribute__((__always_inline__))