This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: PATCH: Add SSE4.1 support
- From: "H. J. Lu" <hjl at lucon dot org>
- To: Uros Bizjak <ubizjak at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Henderson <rth at redhat dot com>, Jan Hubicka <jh at suse dot cz>
- Date: Tue, 22 May 2007 07:07:10 -0700
- Subject: Re: PATCH: Add SSE4.1 support
- References: <5787cf470705220152w79640c8bt5d9987da300bdf1a@mail.gmail.com>
On Tue, May 22, 2007 at 10:52:11AM +0200, Uros Bizjak wrote:
> Hello!
>
> >This update adds prefix_extra to SSE4.1 instructions for longer
> >opcodes. I also removed ix86_expand_sse4_unpack since a new approach
> >is needed for the vectorizer to fully support SSE4.1.
>
> >2007-04-24 H.J. Lu <hongjiu.lu@intel.com>
> > Richard Henderson <rth@redhat.com>
> ...
> > * config/i386/smmintrin.h: New. The SSE4.1 intrinsic header
> > file.
>
> In order to test this header for compile-time failures, inlined
> versions for all #defined functions should be added (guarded with
> #ifdef __OPTIMIZE__), as was recently changed in SVN mainline.
I am enclosing an incremental patch here. I have run my SSE4.1 tests
against the new header at -O0, -O1, -O2 on both Linux/ia32 and
Linux/Intel64.
>
> Following that change, please update following files in the testsuite:
>
> gcc.target/i386/sse-12.c
> gcc.target/i386/sse-13.c
> gcc.target/i386/sse-14.c
> g++.dg/other/i386-2.C
I can't add them since not all assemblers support SSE4. I can add
some new SSE4 tests after "proc check_effective_target_sse4" is added.
>
> This will test your header as much as possible for optimized and
> unoptimized builds.
>
> Considering that this is mostly new functionality (already reviewed by
> rth), that it will be thoroughly tested by provided tests [1] and that
> no other i386 maintainer raised any issue with latest version of
> patch, it is OK for mainline.
>
> [1] http://gcc.gnu.org/ml/gcc-patches/2007-05/msg01361.html
I will check in the combined patch shortly.
Thanks.
H.J.
----
--- gcc/config/i386/smmintrin.h.sni-macro 2007-05-22 06:29:54.000000000 -0700
+++ gcc/config/i386/smmintrin.h 2007-05-22 06:50:52.000000000 -0700
@@ -66,8 +66,18 @@
/* Integer blend instructions - select data from 2 sources using
constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+ (__v8hi)__Y,
+ __M);
+}
+#else
#define _mm_blend_epi16(X, Y, M) \
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(X), (__v8hi)(Y), (M)))
+#endif
static __inline __m128i __attribute__((__always_inline__))
_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
@@ -80,8 +90,18 @@ _mm_blendv_epi8 (__m128i __X, __m128i __
/* Single precision floating point blend instructions - select data
from 2 sources using constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+#else
#define _mm_blend_ps(X, Y, M) \
((__m128) __builtin_ia32_blendps ((__v4sf)(X), (__v4sf)(Y), (M)))
+#endif
static __inline __m128 __attribute__((__always_inline__))
_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
@@ -94,8 +114,18 @@ _mm_blendv_ps (__m128 __X, __m128 __Y, _
/* Double precision floating point blend instructions - select data
from 2 sources using constant/variable mask. */
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
#define _mm_blend_pd(X, Y, M) \
((__m128d) __builtin_ia32_blendpd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
static __inline __m128d __attribute__((__always_inline__))
_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
@@ -108,11 +138,29 @@ _mm_blendv_pd (__m128d __X, __m128d __Y,
/* Dot product instructions with mask-defined summing and zeroing parts
of result. */
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
#define _mm_dp_ps(X, Y, M) \
((__m128) __builtin_ia32_dpps ((__v4sf)(X), (__v4sf)(Y), (M)))
#define _mm_dp_pd(X, Y, M) \
((__m128d) __builtin_ia32_dppd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
/* Packed integer 64-bit comparison, zeroing or filling with ones
corresponding parts of result. */
@@ -224,14 +272,35 @@ _mm_testnzc_si128 (__m128i __M, __m128i
element selected by index N. The bits [7-6] of N define S
index, the bits [5-4] define D index, and bits [3-0] define
zeroing mask for D. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+ return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+ (__v4sf)__S,
+ __N);
+}
+#else
#define _mm_insert_ps(D, S, N) \
((__m128) __builtin_ia32_insertps128 ((__v4sf)(D), (__v4sf)(S), (N)))
+#endif
/* Helper macro to create the N value for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
/* Extract binary representation of single precision float from packed
single precision array element of X selected by index N. */
+
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+ union { int i; float f; } __tmp;
+ __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+ return __tmp.i;
+}
+#else
#define _mm_extract_ps(X, N) \
(__extension__ \
({ \
@@ -240,6 +309,7 @@ _mm_testnzc_si128 (__m128i __M, __m128i
__tmp.i; \
}) \
)
+#endif
/* Extract binary representation of single precision float into
D from packed single precision array element of S selected
@@ -256,6 +326,30 @@ _mm_testnzc_si128 (__m128i __M, __m128i
/* Insert integer, S, into packed integer array element of D
selected by index N. */
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+ __S, __N);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+ __S, __N);
+}
+
+#ifdef __x86_64__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+ __S, __N);
+}
+#endif
+#else
#define _mm_insert_epi8(D, S, N) \
((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(D), (S), (N)))
@@ -266,10 +360,32 @@ _mm_testnzc_si128 (__m128i __M, __m128i
#define _mm_insert_epi64(D, S, N) \
((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(D), (S), (N)))
#endif
+#endif
/* Extract integer from packed integer array element of X selected by
index N. */
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+static __inline long long __attribute__((__always_inline__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
#define _mm_extract_epi8(X, N) \
__builtin_ia32_vec_ext_v16qi ((__v16qi) X, (N))
#define _mm_extract_epi32(X, N) \
@@ -279,6 +395,7 @@ _mm_testnzc_si128 (__m128i __M, __m128i
#define _mm_extract_epi64(X, N) \
((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(X), (N)))
#endif
+#endif
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
@@ -289,18 +406,52 @@ _mm_minpos_epu16 (__m128i __X)
}
/* Packed/scalar double precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_sd(__m128d __D, __m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+ (__v2df)__V,
+ __M);
+}
+#else
#define _mm_round_pd(V, M) \
((__m128d) __builtin_ia32_roundpd ((__v2df)(V), (M)))
#define _mm_round_sd(D, V, M) \
((__m128d) __builtin_ia32_roundsd ((__v2df)(D), (__v2df)(V), (M)))
+#endif
/* Packed/scalar single precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+ (__v4sf)__V,
+ __M);
+}
+#else
#define _mm_round_ps(V, M) \
((__m128) __builtin_ia32_roundps ((__v4sf)(V), (M)))
#define _mm_round_ss(D, V, M) \
((__m128) __builtin_ia32_roundss ((__v4sf)(D), (__v4sf)(V), (M)))
+#endif
/* Macros for ceil/floor intrinsics. */
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
@@ -402,8 +553,18 @@ _mm_packus_epi32 (__m128i __X, __m128i _
/* Sum absolute 8-bit integer difference of adjacent groups of 4
byte integers in the first 2 operands. Starting offsets within
operands are determined by the 3rd mask operand. */
+
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+ (__v16qi)__Y, __M);
+}
+#else
#define _mm_mpsadbw_epu8(X, Y, M) \
((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(X), (__v16qi)(Y), (M)))
+#endif
/* Load double quadword using non-temporal aligned hint. */
static __inline __m128i __attribute__((__always_inline__))