This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[x86, 2/n] Replace builtins with vector extensions
- From: Marc Glisse <marc dot glisse at inria dot fr>
- To: gcc-patches at gcc dot gnu dot org
- Cc: ubizjak at gmail dot com
- Date: Sat, 18 Oct 2014 13:52:08 +0200 (CEST)
- Subject: [x86, 2/n] Replace builtins with vector extensions
- Authentication-results: sourceware.org; auth=none
Hello,
this time, +-* for 128 bit integer vectors. I am using an unsigned type so
the compiler knows that we expect wrapping. I don't know why Intel's
description of mullo insists that the multiplication is signed, that only
matters for the high part...
Next parts (waiting for approval for this one) should be:
- same thing with 256 and 512 bit integer vectors
- & | ^ (integer only)
Maybe (or it can wait until the next release):
- < > == abs min max (integer only)
2014-10-20 Marc Glisse <marc.glisse@inria.fr>
* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
typedefs.
(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
_mm_mullo_epi16): Use vector extensions instead of builtins.
* config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
--
Marc Glisse
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 216422)
+++ gcc/config/i386/emmintrin.h (working copy)
@@ -32,23 +32,27 @@
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */
/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
(((fp1) << 1) | (fp0))
@@ -999,39 +1003,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1047,39 +1051,39 @@ _mm_adds_epu8 (__m128i __A, __m128i __B)
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
@@ -1107,21 +1111,21 @@ _mm_madd_epi16 (__m128i __A, __m128i __B
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
Index: gcc/config/i386/smmintrin.h
===================================================================
--- gcc/config/i386/smmintrin.h (revision 216422)
+++ gcc/config/i386/smmintrin.h (working copy)
@@ -318,21 +318,21 @@ extern __inline __m128i __attribute__((_
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
}
/* Packed integer 32-bit multiplication with truncation of upper
halves of results. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+ return (__m128i) ((__v4su)__X * (__v4su)__Y);
}
/* Packed integer 32-bit multiplication of 2 pairs of operands
with two 64-bit results. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
}