This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[x86, 3/n] Replace builtins with vector extensions
- From: Marc Glisse <marc dot glisse at inria dot fr>
- To: gcc-patches at gcc dot gnu dot org
- Cc: ubizjak at gmail dot com
- Date: Sat, 8 Nov 2014 13:47:25 +0100 (CET)
- Subject: [x86, 3/n] Replace builtins with vector extensions
- Authentication-results: sourceware.org; auth=none
Hello,
this patch mechanically extends the +, - and * operators to integer vectors of
size 256 and 512 bits (the previous patch only handled 128-bit vectors).
Regtested together with the next patch.
2014-11-10 Marc Glisse <marc.glisse@inria.fr>
* config/i386/avxintrin.h (__v4du, __v8su, __v16hu, __v32qu):
New typedefs.
* config/i386/avx512fintrin.h (__v8du, __v16su, __v32hu, __v64qu):
Likewise.
(_mm512_mullo_epi32, _mm512_add_epi64, _mm512_sub_epi64,
_mm512_add_epi32, _mm512_sub_epi32): Use vector extensions
instead of builtins.
* config/i386/avx2intrin.h (_mm256_add_epi8, _mm256_add_epi16,
_mm256_add_epi32, _mm256_add_epi64, _mm256_mullo_epi16,
_mm256_mullo_epi32, _mm256_sub_epi8, _mm256_sub_epi16,
_mm256_sub_epi32, _mm256_sub_epi64): Likewise.
* config/i386/avx512bwintrin.h (_mm512_mullo_epi16, _mm512_add_epi8,
_mm512_sub_epi8, _mm512_sub_epi16, _mm512_add_epi16): Likewise.
* config/i386/avx512dqintrin.h (_mm512_mullo_epi64): Likewise.
* config/i386/avx512vldqintrin.h (_mm256_mullo_epi64, _mm_mullo_epi64):
Likewise.
--
Marc Glisse
Index: config/i386/avx2intrin.h
===================================================================
--- config/i386/avx2intrin.h (revision 217249)
+++ config/i386/avx2intrin.h (working copy)
@@ -97,42 +97,42 @@ extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+ return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A + (__v8su)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A + (__v4du)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}
extern __inline __m256i
@@ -548,28 +548,28 @@ extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A * (__v8su)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}
extern __inline __m256i
@@ -778,42 +778,42 @@ extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+ return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A - (__v8su)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A - (__v4du)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}
extern __inline __m256i
Index: config/i386/avx512bwintrin.h
===================================================================
--- config/i386/avx512bwintrin.h (revision 217249)
+++ config/i386/avx512bwintrin.h (working copy)
@@ -457,25 +457,21 @@ _mm512_maskz_mulhi_epu16 (__mmask32 __U,
(__v32hi) __B,
(__v32hi)
_mm512_setzero_hi (),
(__mmask32) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A * (__v32hu) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v32hi) __W,
@@ -666,25 +662,21 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __
(__v64qi) __B,
(__v64qi)
_mm512_setzero_qi(),
(__mmask64) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi)
- _mm512_setzero_qi (),
- (__mmask64) -1);
+ return (__m512i) ((__v64qu) __A + (__v64qu) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
(__v64qi) __B,
(__v64qi) __W,
@@ -699,25 +691,21 @@ _mm512_maskz_add_epi8 (__mmask64 __U, __
(__v64qi) __B,
(__v64qi)
_mm512_setzero_qi (),
(__mmask64) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi)
- _mm512_setzero_qi (),
- (__mmask64) -1);
+ return (__m512i) ((__v64qu) __A - (__v64qu) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
(__v64qi) __B,
(__v64qi) __W,
@@ -897,25 +885,21 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, _
(__v64qi) __B,
(__v64qi)
_mm512_setzero_qi (),
(__mmask64) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A - (__v32hu) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v32hi) __W,
@@ -996,25 +980,21 @@ _mm512_maskz_subs_epu16 (__mmask32 __U,
(__v32hi) __B,
(__v32hi)
_mm512_setzero_hi (),
(__mmask32) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A + (__v32hu) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v32hi) __W,
Index: config/i386/avx512dqintrin.h
===================================================================
--- config/i386/avx512dqintrin.h (revision 217249)
+++ config/i386/avx512dqintrin.h (working copy)
@@ -218,25 +218,21 @@ _mm512_maskz_broadcast_i32x8 (__mmask16
__A,
(__v16si)
_mm512_setzero_si512 (),
__M);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di) __W,
Index: config/i386/avx512fintrin.h
===================================================================
--- config/i386/avx512fintrin.h (revision 217249)
+++ config/i386/avx512fintrin.h (working copy)
@@ -31,23 +31,27 @@
#ifndef __AVX512F__
#pragma GCC push_options
#pragma GCC target("avx512f")
#define __DISABLE_AVX512F__
#endif /* __AVX512F__ */
/* Internal data types for implementing the intrinsics. */
typedef double __v8df __attribute__ ((__vector_size__ (64)));
typedef float __v16sf __attribute__ ((__vector_size__ (64)));
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
@@ -508,25 +512,21 @@ __attribute__ ((__gnu_inline__, __always
_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
__builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
(__mmask16) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A * (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si)
_mm512_setzero_si512 (),
@@ -635,25 +635,21 @@ _mm512_maskz_srlv_epi32 (__mmask16 __U,
(__v16si) __Y,
(__v16si)
_mm512_setzero_si512 (),
(__mmask16) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A + (__v8du) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di) __W,
(__mmask8) __U);
@@ -667,25 +663,21 @@ _mm512_maskz_add_epi64 (__mmask8 __U, __
(__v8di) __B,
(__v8di)
_mm512_setzero_si512 (),
(__mmask8) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A - (__v8du) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
(__v8di) __B,
(__v8di) __W,
(__mmask8) __U);
@@ -795,25 +787,21 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, _
(__v8di) __Y,
(__v8di)
_mm512_setzero_si512 (),
(__mmask8) __U);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A + (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si) __W,
(__mmask16) __U);
@@ -858,25 +846,21 @@ _mm512_maskz_mul_epi32 (__mmask8 __M, __
(__v16si) __Y,
(__v8di)
_mm512_setzero_si512 (),
__M);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A - (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
(__v16si) __B,
(__v16si) __W,
(__mmask16) __U);
Index: config/i386/avx512vldqintrin.h
===================================================================
--- config/i386/avx512vldqintrin.h (revision 217249)
+++ config/i386/avx512vldqintrin.h (working copy)
@@ -537,25 +537,21 @@ _mm_maskz_broadcast_i32x2 (__mmask8 __M,
__A,
(__v4si)
_mm_setzero_si128 (),
__M);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+ return (__m256i) ((__v4du) __A * (__v4du) __B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
__m256i __B)
{
return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
(__v4di) __B,
(__v4di) __W,
@@ -570,25 +566,21 @@ _mm256_maskz_mullo_epi64 (__mmask8 __U,
(__v4di) __B,
(__v4di)
_mm256_setzero_si256 (),
(__mmask8) __U);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i) ((__v2du) __A * (__v2du) __B);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
__m128i __B)
{
return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
(__v2di) __B,
(__v2di) __W,
Index: config/i386/avxintrin.h
===================================================================
--- config/i386/avxintrin.h (revision 217249)
+++ config/i386/avxintrin.h (working copy)
@@ -34,23 +34,27 @@
#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */
/* Internal data types for implementing the intrinsics. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef float __m256 __attribute__ ((__vector_size__ (32),
__may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
__may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
__may_alias__));