This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Add missing _mm{256,512}_zext* intrinsics (PRs target/83250, target/91340)


Hi!

The following patch adds 9 missing intrinsics, which are like _mm*_cast*,
except that they set the upper bits to zero instead of leaving them undefined.
The implementation uses code that the combine pass manages to optimize well.
The only problem is that the 512-bit intrinsics are supposed to be avx512f
only, while some of the intrinsics they would ideally use (the 64x2 and 32x8
inserts) are avx512dq.  As a result, _mm512_zextpd128_pd512 is implemented
with a ps insert and _mm512_zextps256_ps512 with a pd insert, so we emit
vmovaps resp. vmovapd for them instead of vmovapd resp. vmovaps.

I've also discovered that for AVX, there is no test coverage of the various
cast intrinsics, so I've added that too.

The PR has some details on other possible expansions.  It would be nice to
optimize those definitions into the same code as well, but that will require
some extra define_insn_and_split patterns; I think that can be done
incrementally.  Once that is done, perhaps we could change
_mm512_zextpd128_pd512/_mm512_zextps256_ps512 so that they actually generate
the right ps vs. pd variant of the move.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-08-12  Jakub Jelinek  <jakub@redhat.com>

	PR target/83250
	PR target/91340
	* config/i386/avxintrin.h (_mm256_zextpd128_pd256,
	_mm256_zextps128_ps256, _mm256_zextsi128_si256): New intrinsics.
	* config/i386/avx512fintrin.h (_mm512_zextpd128_pd512,
	_mm512_zextps128_ps512, _mm512_zextsi128_si512, _mm512_zextpd256_pd512,
	_mm512_zextps256_ps512, _mm512_zextsi256_si512): Likewise.

	* gcc.target/i386/avx-typecast-1.c: New test.
	* gcc.target/i386/avx-typecast-2.c: New test.
	* gcc.target/i386/avx512f-typecast-2.c: New test.

--- gcc/config/i386/avxintrin.h.jj	2019-08-05 12:25:34.476667673 +0200
+++ gcc/config/i386/avxintrin.h	2019-08-12 14:33:07.905601186 +0200
@@ -1484,6 +1484,26 @@ _mm256_castsi128_si256 (__m128i __A)
   return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
 }
 
+/* Similarly, but with zero extension instead of undefined values.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextpd128_pd256 (__m128d __A)
+{
+  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextps128_ps256 (__m128 __A)
+{
+  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextsi128_si256 (__m128i __A)
+{
+  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
+}
+
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_set_m128 ( __m128 __H, __m128 __L)
 {
--- gcc/config/i386/avx512fintrin.h.jj	2019-07-12 09:34:49.524385009 +0200
+++ gcc/config/i386/avx512fintrin.h	2019-08-12 14:36:52.281169281 +0200
@@ -15437,6 +15437,48 @@ _mm512_castsi256_si512 (__m256i __A)
   return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A);
 }
 
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd128_pd512 (__m128d __A)
+{
+  return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps128_ps512 (__m128 __A)
+{
+  return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi128_si512 (__m128i __A)
+{
+  return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd256_pd512 (__m256d __A)
+{
+  return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps256_ps512 (__m256 __A)
+{
+  return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi256_si512 (__m256i __A)
+{
+  return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
 extern __inline __mmask16
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B)
--- gcc/testsuite/gcc.target/i386/avx-typecast-1.c.jj	2019-08-12 15:12:51.597209881 +0200
+++ gcc/testsuite/gcc.target/i386/avx-typecast-1.c	2019-08-12 15:12:47.334274860 +0200
@@ -0,0 +1,83 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+extern int memcmp (const void *, const void *, __SIZE_TYPE__);
+
+void
+avx_test (void)
+{
+  union256i_d  a, ad;
+  union256  b, bd;
+  union256d  c, cd;
+  union128i_d  d, dd;
+  union128  e, ed;
+  union128d  f, fd;
+  int i;
+
+  for (i = 0; i < 8; i++)
+    {
+      a.a[i] = 7146908634 + i;
+      b.a[i] = 45.12f + i;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      c.a[i] = 41234512513451345.0905 + i;
+      d.a[i] = 109534 + i;
+      e.a[i] = 85034.095f + i;
+    }
+
+  for (i = 0; i < 2; i++)
+    f.a[i] = 41234512451345.0905 + i;
+
+  bd.x = _mm256_castpd_ps (c.x);
+  if (memcmp (bd.a, c.a, 32))
+    abort ();
+
+  ad.x = _mm256_castpd_si256 (c.x);
+  if (memcmp (ad.a, c.a, 32))
+    abort ();
+
+  cd.x = _mm256_castps_pd (b.x);
+  if (memcmp (cd.a, b.a, 32))
+    abort ();
+
+  ad.x = _mm256_castps_si256 (b.x);
+  if (memcmp (ad.a, b.a, 32))
+    abort ();
+
+  bd.x = _mm256_castsi256_ps (a.x);
+  if (memcmp (bd.a, a.a, 32))
+    abort ();
+
+  cd.x = _mm256_castsi256_pd (a.x);
+  if (memcmp (cd.a, a.a, 32))
+    abort ();
+
+  fd.x = _mm256_castpd256_pd128 (c.x);
+  if (memcmp (fd.a, c.a, 16))
+    abort ();
+
+  ed.x = _mm256_castps256_ps128 (b.x);
+  if (memcmp (ed.a, b.a, 16))
+    abort ();
+
+  dd.x = _mm256_castsi256_si128 (a.x);
+  if (memcmp (dd.a, a.a, 16))
+    abort ();
+
+  cd.x = _mm256_castpd128_pd256 (f.x);
+  if (memcmp (cd.a, f.a, 16))
+    abort ();
+
+  bd.x = _mm256_castps128_ps256 (e.x);
+  if (memcmp (bd.a, e.a, 16))
+    abort ();
+
+  ad.x = _mm256_castsi128_si256 (d.x);
+  if (memcmp (ad.a, d.a, 16))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-typecast-2.c.jj	2019-08-12 15:12:55.056157156 +0200
+++ gcc/testsuite/gcc.target/i386/avx-typecast-2.c	2019-08-12 15:14:57.108296731 +0200
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+extern int memcmp (const void *, const void *, __SIZE_TYPE__);
+
+void
+avx_test (void)
+{
+  union256i_d ad, zero;
+  union256 bd;
+  union256d cd;
+  union128i_d d;
+  union128 e;
+  union128d f;
+  int i;
+
+  for (i = 0; i < 8; i++)
+    zero.a[i] = 0;
+
+  for (i = 0; i < 4; i++)
+    {
+      d.a[i] = 109534 + i;
+      e.a[i] = 85034.095f + i;
+    }
+
+  for (i = 0; i < 2; i++)
+    f.a[i] = 41234512451345.0905 + i;
+
+  cd.x = _mm256_zextpd128_pd256 (f.x);
+  if (memcmp (cd.a, f.a, 16)
+      || memcmp (&cd.a[2], &zero.a, 16))
+    abort ();
+
+  bd.x = _mm256_zextps128_ps256 (e.x);
+  if (memcmp (bd.a, e.a, 16)
+      || memcmp (&bd.a[4], &zero.a, 16))
+    abort ();
+
+  ad.x = _mm256_zextsi128_si256 (d.x);
+  if (memcmp (ad.a, d.a, 16)
+      || memcmp (&ad.a[4], &zero.a, 16))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c.jj	2019-08-12 14:38:41.389500441 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c	2019-08-12 14:47:10.291717937 +0200
@@ -0,0 +1,71 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+extern int memcmp (const void *, const void *, __SIZE_TYPE__);
+
+void
+avx512f_test (void)
+{
+  union512i_d ad, zero;
+  union512 bd;
+  union512d cd;
+  union256i_d d;
+  union256 e;
+  union256d f;
+  union128i_d g;
+  union128 h;
+  union128d k;
+  int i;
+
+  for (i = 0; i < 16; i++)
+    zero.a[i] = 0;
+
+  for (i = 0; i < 8; i++)
+    {
+      d.a[i] = 109534 + i;
+      e.a[i] = 85034.095f + i;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      f.a[i] = 41234512451345.0905 + i;
+      g.a[i] = 71469086341 + i;
+      h.a[i] = 45.1264f + i;
+    }
+
+  for (i = 0; i < 2; i++)
+    k.a[i] = 7146908634.576 + i;
+
+  cd.x = _mm512_zextpd128_pd512 (k.x);
+  if (memcmp (cd.a, k.a, 16)
+      || memcmp (&cd.a[2], &zero.a, 48))
+    abort ();
+
+  bd.x = _mm512_zextps128_ps512 (h.x);
+  if (memcmp (bd.a, h.a, 16)
+      || memcmp (&bd.a[4], &zero.a, 48))
+    abort ();
+
+  ad.x = _mm512_zextsi128_si512 (g.x);
+  if (memcmp (ad.a, g.a, 16)
+      || memcmp (&ad.a[4], &zero.a, 48))
+    abort ();
+
+  cd.x = _mm512_zextpd256_pd512 (f.x);
+  if (memcmp (cd.a, f.a, 32)
+      || memcmp (&cd.a[4], &zero.a, 32))
+    abort ();
+
+  bd.x = _mm512_zextps256_ps512 (e.x);
+  if (memcmp (bd.a, e.a, 32)
+      || memcmp (&bd.a[8], &zero.a, 32))
+    abort ();
+
+  ad.x = _mm512_zextsi256_si512 (d.x);
+  if (memcmp (ad.a, d.a, 32)
+      || memcmp (&ad.a[8], &zero.a, 32))
+    abort ();
+}

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]