This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Add _mm256_{load,store}u2_m128{,d,i} intrinsics (PR target/91341)


On Mon, Aug 5, 2019 at 9:30 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch adds a couple of intrinsics that both ICC and clang
> have, but GCC doesn't.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> They emit optimal code except for the last one, _mm256_storeu2_m128i,
> where we emit
>         vmovups %xmm0, (%rsi)
>         vextractf128    $0x1, %ymm0, %xmm0
>         vmovups %xmm0, (%rdi)
> instead of
>         vmovups %xmm0, (%rsi)
>         vextractf128    $0x1, %ymm0, (%rdi)
> That is because _mm256_extractf128_si256 is implemented as a V8SImode
> pattern, but __m128i is V2DImode, and we don't have a pattern like:
>         (set (match_operand:V2DI 0 ("nonimmediate_operand") ("=xm, vm"))
>           (subreg:V2DI
>             (vec_select:V4SI (match_operand:V8SI 1 ("register_operand") ("x, v"))
>                 (parallel [
>                         (const_int 4 [0x4])
>                         (const_int 5 [0x5])
>                         (const_int 6 [0x6])
>                         (const_int 7 [0x7])
>                     ])) 0))
> Shall we add that (and just for this mode combination, or using iterators
> for others)?  Unfortunately the builtin that would use V2DI in the
> vec_select instead of V4SI is AVX2 and so can't be used in this case.

Let's leave this for now. We already have similar cases of subreg
mismatches (not only with xmm regs) that result in unmerged memory
operands, and they are fairly benign.

> 2019-08-05  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/91341
>         * config/i386/avxintrin.h (_mm256_loadu2_m128, _mm256_storeu2_m128,
>         _mm256_loadu2_m128d, _mm256_storeu2_m128d, _mm256_loadu2_m128i,
>         _mm256_storeu2_m128i): New functions.
>
>         * gcc.target/i386/avx-loadu2-m128-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128-2.c: New test.
>         * gcc.target/i386/avx-loadu2-m128d-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128d-2.c: New test.
>         * gcc.target/i386/avx-loadu2-m128i-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128i-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128d-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128d-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128i-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128i-2.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/avxintrin.h.jj      2019-01-01 12:37:32.417724576 +0100
> +++ gcc/config/i386/avxintrin.h 2019-08-04 16:39:10.091659072 +0200
> @@ -1520,6 +1520,48 @@ _mm256_setr_m128i (__m128i __L, __m128i
>    return _mm256_set_m128i (__H, __L);
>  }
>
> +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128 (float const *__PH, float const *__PL)
> +{
> +  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
> +                              _mm_loadu_ps (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
> +{
> +  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
> +  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
> +}
> +
> +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128d (double const *__PH, double const *__PL)
> +{
> +  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
> +                              _mm_loadu_pd (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
> +{
> +  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
> +  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
> +}
> +
> +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
> +{
> +  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
> +                                 _mm_loadu_si128 (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
> +{
> +  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
> +  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
> +}
> +
>  #ifdef __DISABLE_AVX__
>  #undef __DISABLE_AVX__
>  #pragma GCC pop_options
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c.jj        2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c   2019-08-04 16:50:01.315810000 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovups\t" } } */
> +/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256
> +foo (float const *hi, float const *lo)
> +{
> +  return _mm256_loadu2_m128 (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c.jj        2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c   2019-08-04 16:59:50.002899417 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256 u;
> +  float e[8] = { 1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f };
> +  float f[8] = { -24.75f, -18.75f, 12.0f, 0.0f, -9.5f, 13.25f, -24.75f, -18.75f };
> +
> +  u.x = _mm256_loadu2_m128 (e + 1, e + 3);
> +  if (check_union256 (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c.jj       2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c  2019-08-04 17:03:13.548818465 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovupd\t" } } */
> +/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256d
> +foo (double const *hi, double const *lo)
> +{
> +  return _mm256_loadu2_m128d (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c.jj       2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c  2019-08-04 17:05:00.342201999 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256d u;
> +  double e[8] = { 1.5, -9.5, 13.25, -24.75, -18.75, 12.0, 0.0, 9.0 };
> +  double f[4] = { 12.0, 0.0, -9.5, 13.25 };
> +
> +  u.x = _mm256_loadu2_m128d (e + 1, e + 5);
> +  if (check_union256d (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c.jj       2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c  2019-08-04 17:06:44.386628690 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
> +/* { dg-final { scan-assembler "\tvinsert\[fi]128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256i
> +foo (__m128i_u const *hi, __m128i_u const *lo)
> +{
> +  return _mm256_loadu2_m128i (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c.jj       2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c  2019-08-04 17:11:04.864691481 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256i_d u;
> +  int e[8] = { 1, -9, 13, -24, -18, 12, 0, 9 };
> +  int f[8] = { -24, -18, 12, 0, -9, 13, -24, -18 };
> +
> +  u.x = _mm256_loadu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 3));
> +  if (check_union256i_d (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c.jj       2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c  2019-08-04 17:15:14.546917455 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovups\t" } } */
> +/* { dg-final { scan-assembler "\tvextractf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (float *hi, float *lo, __m256 a)
> +{
> +  _mm256_storeu2_m128 (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c.jj       2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c  2019-08-04 17:19:36.590956577 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  float e[12] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
> +  float f[12] = { -1.0f, -18.75f, 12.0f, 0.0f, 9.0f, -1.0f, 1.5f, -9.5f, 13.25f, -24.75f, -1.0f, -1.0f };
> +  int i;
> +  __m256 x = _mm256_set_ps (1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f);
> +  _mm256_storeu2_m128 (e + 1, e + 6, x);
> +  for (i = 0; i < 12; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c.jj      2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c 2019-08-04 17:34:55.951056592 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovup\[sd]\t" } } */
> +/* { dg-final { scan-assembler "\tvextractf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (double *hi, double *lo, __m256d a)
> +{
> +  _mm256_storeu2_m128d (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c.jj      2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c 2019-08-04 17:35:17.505730678 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  double e[8] = { -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0 };
> +  double f[8] = { -1.0, 13.25, -24.75, -1.0, 1.5, -9.5, -1.0, -1.0 };
> +  int i;
> +  __m256d x = _mm256_set_pd (1.5, -9.5, 13.25, -24.75);
> +  _mm256_storeu2_m128d (e + 1, e + 4, x);
> +  for (i = 0; i < 8; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c.jj      2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c 2019-08-04 17:42:55.207811439 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmov(dqu|ups)\t" } } */
> +/* { dg-final { scan-assembler "\tvextract\[if]128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (__m128i_u *hi, __m128i_u *lo, __m256i a)
> +{
> +  _mm256_storeu2_m128i (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c.jj      2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c 2019-08-04 17:43:30.488278278 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  int e[12] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
> +  int f[12] = { -1, -18, 12, 0, 9, -1, 1, -9, 13, -24, -1, -1 };
> +  int i;
> +  __m256i x = _mm256_set_epi32 (1, -9, 13, -24, -18, 12, 0, 9);
> +  _mm256_storeu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 6), x);
> +  for (i = 0; i < 12; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
>
>         Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]