This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
- From: Allan Sandfeld Jensen <linux at carewolf dot com>
- To: gcc-patches at gcc dot gnu dot org, Jakub Jelinek <jakub at redhat dot com>
- Date: Mon, 24 Apr 2017 15:15:11 +0200
- Subject: Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
- Authentication-results: sourceware.org; auth=none
- References: <201704221338.46300.linux@carewolf.com> <201704240933.09704.linux@carewolf.com> <20170424074349.GG1809@tucnak>
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -667,7 +667,7 @@ extern __inline __m256i
> >
> > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > _mm256_slli_epi16 (__m256i __A, int __B)
> > {
> >
> > - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > _mm256_setzero_si256();
> >
> > }
>
> What is the advantage of doing that when you replace one operation with
> several (&, <, ?:, <<)?
> I'd say instead we should fold the builtins if in the gimple fold target
> hook we see the shift count constant and can decide based on that.
> Or we could use __builtin_constant_p (__B) to decide whether to use
> the generic vector shifts or builtin, but that means larger IL.
>
Okay, I have tried that, and I have also made it more obvious how the
intrinsics can become non-immediate shifts.
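To make the pattern easier to see outside the diff context, each
immediate-shift intrinsic now boils down to the following (a minimal
stand-alone sketch; "slli_epi32_sketch" is a made-up name for
illustration, and the real definitions below carry the usual
gnu_inline/always_inline/artificial attributes):

#include <emmintrin.h>

/* Sketch of the new _mm_slli_epi32 pattern.  */
static __inline __m128i
slli_epi32_sketch (__m128i __A, int __B)
{
  if (__builtin_constant_p (__B))
    /* Constant count: use the generic vector shift, which gimple can
       fold further.  Counts of 32 or more must give zero to match the
       pslld semantics; the unsigned compare also catches negative
       counts.  */
    return ((unsigned int) __B < 32)
           ? (__m128i) ((__v4si) __A << __B)
           : _mm_setzero_si128 ();
  /* Non-constant count: fall back to the non-immediate form.  */
  return _mm_sll_epi32 (__A, _mm_cvtsi32_si128 (__B));
}

The patch applies this same pattern to all the SSE2 and AVX2 logical
shift intrinsics, and moves the _mm_cvtsi32_si128 and _mm_sll_*/_mm_srl_*
definitions up so they are declared before first use.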
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b58f5050db0..b9406550fc5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2017-04-22 Allan Sandfeld Jensen <sandfeld@kde.org>
+
+ * config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
+ Use vector extensions instead of builtins.
+ * config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
+ Use vector extensions instead of builtins.
+
2017-04-21 Uros Bizjak <ubizjak@gmail.com>
* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 82f170a3d61..64ba52b244e 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi16 (__m256i __A, int __B)
-{
- return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
@@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi32 (__m256i __A, int __B)
+_mm256_slli_epi16 (__m256i __A, int __B)
{
- return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
+ return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m256i
@@ -693,9 +688,11 @@ _mm256_sll_epi32 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi64 (__m256i __A, int __B)
+_mm256_slli_epi32 (__m256i __A, int __B)
{
- return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 32) ? (__m256i)((__v8si)__A << __B) : _mm256_setzero_si256();
+ return _mm256_sll_epi32(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m256i
@@ -707,6 +704,15 @@ _mm256_sll_epi64 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 64) ? (__m256i)((__v4di)__A << __B) : _mm256_setzero_si256();
+ return _mm256_sll_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
@@ -756,13 +762,6 @@ _mm256_srli_si256 (__m256i __A, const int __N)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi16 (__m256i __A, int __B)
-{
- return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
@@ -770,9 +769,11 @@ _mm256_srl_epi16 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi32 (__m256i __A, int __B)
+_mm256_srli_epi16 (__m256i __A, int __B)
{
- return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 16) ? (__m256i)((__v16hu)__A >> __B) : _mm256_setzero_si256();
+ return _mm256_srl_epi16(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m256i
@@ -784,9 +785,11 @@ _mm256_srl_epi32 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi64 (__m256i __A, int __B)
+_mm256_srli_epi32 (__m256i __A, int __B)
{
- return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 32) ? (__m256i)((__v8su)__A >> __B) : _mm256_setzero_si256();
+ return _mm256_srl_epi32(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m256i
@@ -798,6 +801,15 @@ _mm256_srl_epi64 (__m256i __A, __m128i __B)
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 64) ? (__m256i)((__v4du)__A >> __B) : _mm256_setzero_si256();
+ return _mm256_srl_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
return (__m256i) ((__v32qu)__A - (__v32qu)__B);
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 828f4a07a9b..419041e2acb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -903,6 +903,28 @@ _mm_cvtss_sd (__m128d __A, __m128 __B)
return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+ return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+ return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+ return _mm_set_epi64x (0, __A);
+}
+#endif
+
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
@@ -1138,21 +1160,75 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 16) ? (__m128i)((__v8hi)__A << __B) : _mm_setzero_si128();
+ return _mm_sll_epi16(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 32) ? (__m128i)((__v4si)__A << __B) : _mm_setzero_si128();
+ return _mm_sll_epi32(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 64) ? (__m128i)((__v2di)__A << __B) : _mm_setzero_si128();
+ return _mm_sll_epi64(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1205,67 +1281,25 @@ _mm_slli_si128 (__m128i __A, const int __N)
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 16) ? (__m128i)((__v8hu)__A >> __B) : _mm_setzero_si128();
+ return _mm_srl_epi16(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 32) ? (__m128i)((__v4su)__A >> __B) : _mm_setzero_si128();
+ return _mm_srl_epi32(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi16 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi32 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi64 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi16 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi32 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi16 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi32 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi64 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+ if (__builtin_constant_p(__B))
+ return ((unsigned int)__B < 64) ? (__m128i)((__v2du)__A >> __B) : _mm_setzero_si128();
+ return _mm_srl_epi64(__A, _mm_cvtsi32_si128(__B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1497,28 +1531,6 @@ _mm_mfence (void)
__builtin_ia32_mfence ();
}
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi32_si128 (int __A)
-{
- return _mm_set_epi32 (0, 0, 0, __A);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64_si128 (long long __A)
-{
- return _mm_set_epi64x (0, __A);
-}
-
-/* Microsoft intrinsic. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64x_si128 (long long __A)
-{
- return _mm_set_epi64x (0, __A);
-}
-#endif
-
/* Casts between various SP, DP, INT vector types. Note that these do no
conversion of values, they just change the type. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6f4dc8d5095..a4470730ac6 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2017-04-22 Allan Sandfeld Jensen <sandfeld@kde.org>
+
+ * gcc.target/i386/sse2-shifts-1.c: New testcase of shift intrinsics
+ producing the intended instructions.
+ * gcc.target/i386/sse2-shifts-2.c: New testcase of shift intrinsics
+ being folded.
+
2017-04-21 Janus Weil <janus@gcc.gnu.org>
PR fortran/80392
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
new file mode 100644
index 00000000000..a2305cf042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+ return _mm_slli_epi16(a, 9);
+}
+
+__m128i test2(__m128i a)
+{
+ return _mm_slli_epi32(a, 13);
+}
+
+__m128i test3(__m128i a)
+{
+ return _mm_slli_epi64(a, 17);
+}
+
+__m128i test4(__m128i a)
+{
+ return _mm_srli_epi16(a, 9);
+}
+
+__m128i test5(__m128i a)
+{
+ return _mm_srli_epi32(a, 13);
+}
+
+__m128i test6(__m128i a)
+{
+ return _mm_srli_epi64(a, 7);
+}
+
+__m128i test7(__m128i a)
+{
+ return _mm_srai_epi16(a, 3);
+}
+
+__m128i test8(__m128i a)
+{
+ return _mm_srai_epi32(a, 6);
+}
+
+/* { dg-final { scan-assembler "psllw" } } */
+/* { dg-final { scan-assembler "pslld" } } */
+/* { dg-final { scan-assembler "psllq" } } */
+/* { dg-final { scan-assembler "psrlw" } } */
+/* { dg-final { scan-assembler "psrld" } } */
+/* { dg-final { scan-assembler "psrlq" } } */
+/* { dg-final { scan-assembler "psraw" } } */
+/* { dg-final { scan-assembler "psrad" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
new file mode 100644
index 00000000000..ce05a7dc44e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+ a = _mm_slli_epi16(a, 2);
+ return _mm_slli_epi16(a, 3);
+}
+/* { dg-final { scan-assembler "psllw.*5"} } */
+
+__m128i test3(__m128i a)
+{
+ a = _mm_srli_epi16(a, 4);
+ return _mm_srli_epi16(a, 9);
+}
+/* { dg-final { scan-assembler-times "psrlw" 1} } */
+
+__m128i test4(__m128i a)
+{
+ a = _mm_setr_epi32(128, 255, 86, 23);
+ return _mm_srli_epi32(a, 8);
+}
+/* { dg-final { scan-assembler-not "psrld"} } */
+
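For completeness, a caller that passes a non-constant count now takes
the fallback path, e.g. (hypothetical user code, not part of the
testsuite):

#include <emmintrin.h>

__m128i
shift_by_n (__m128i a, int n)
{
  /* n is not a compile-time constant, so _mm_slli_epi16 goes through
     _mm_sll_epi16 and should emit psllw with an xmm count operand.  */
  return _mm_slli_epi16 (a, n);
}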