This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH] Fix -masm=intel output for AVX512{F,VL} gathers (PR target/79299)

From: Jakub Jelinek <jakub at redhat dot com>
To: Uros Bizjak <ubizjak at gmail dot com>, Kirill Yukhin <kirill dot yukhin at gmail dot com>, Andrew Senkevich <andrew dot n dot senkevich at gmail dot com>
Cc: gcc-patches at gcc dot gnu dot org
Date: Tue, 31 Jan 2017 19:22:05 +0100
Subject: [PATCH] Fix -masm=intel output for AVX512{F,VL} gathers (PR target/79299)
Authentication-results: sourceware.org; auth=none
Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!

As mentioned in the PR and shown by the testcases, many of the AVX512{F,VL}
gathers fail to assemble with binutils.

Unlike AVX2 gathers, gas for some strange reason requires the memory operand
to be {Q,XMM,YMM,ZMM}WORD depending on the sizes of all the memory locations
together (and when not masked), rather than just using always DWORD or QWORD
depending on whether the argument is described as vm32{x,y,z} or
vm64{x,y,z} and for some AVX512{F,VL} that is what is actually emitted.
I have only a year-old ICC around and that emits DWORD/QWORD instead.

Anyway, the following patch honors what gas accepts; if we want to change
gas, we likely want to do that afterwards (but keep compatibility), then add a
configure check whether gas supports the correct stuff and then decide based
on that.

There is also a fix for an inconsistency in the destination register:
%5, %x0%{%1%}|%t0%{%1%}, %...
where we were using %xmm? in AT&T, but %ymm? in Intel mode.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-01-31  Jakub Jelinek  <jakub@redhat.com>

	PR target/79299
	* config/i386/sse.md (xtg_mode, gatherq_mode): New mode attrs.
	(*avx512f_gathersi<mode>, *avx512f_gathersi<mode>_2,
	*avx512f_gatherdi<mode>, *avx512f_gatherdi<mode>_2): Use them,
	fix -masm=intel patterns.

	* gcc.target/i386/avx512vl-pr79299-1.c: New test.
	* gcc.target/i386/avx512vl-pr79299-2.c: New test.

--- gcc/config/i386/sse.md.jj	2017-01-26 13:22:55.000000000 +0100
+++ gcc/config/i386/sse.md	2017-01-31 14:33:15.389332480 +0100
@@ -811,6 +811,12 @@ (define_mode_attr concat_tg_mode
   [(V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t")
    (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")])
 
+;; Tie mode of assembler operand to mode iterator
+(define_mode_attr xtg_mode
+  [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") (V4SF "x") (V2DF "x")
+   (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t")
+   (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")])
+
 ;; Half mask mode for unpacks
 (define_mode_attr HALFMASKMODE
   [(DI "SI") (SI "HI")])
@@ -19041,6 +19047,12 @@ (define_insn "*avx2_gatherdi<mode>_4"
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Memory operand override for -masm=intel of the v*gatherq* patterns.
+(define_mode_attr gatherq_mode
+  [(V4SI "q") (V2DI "x") (V4SF "q") (V2DF "x")
+   (V8SI "x") (V4DI "t") (V8SF "x") (V4DF "t")
+   (V16SI "t") (V8DI "g") (V16SF "t") (V8DF "g")])
+
 (define_expand "<avx512>_gathersi<mode>"
   [(parallel [(set (match_operand:VI48F 0 "register_operand")
 		   (unspec:VI48F
@@ -19074,7 +19086,7 @@ (define_insn "*avx512f_gathersi<mode>"
 	  UNSPEC_GATHER))
    (clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))]
   "TARGET_AVX512F"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %g6}"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %<xtg_mode>6}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
@@ -19093,7 +19105,7 @@ (define_insn "*avx512f_gathersi<mode>_2"
 	  UNSPEC_GATHER))
    (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
   "TARGET_AVX512F"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<xtg_mode>5}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
@@ -19133,9 +19145,7 @@ (define_insn "*avx512f_gatherdi<mode>"
    (clobber (match_scratch:QI 2 "=&Yk"))]
   "TARGET_AVX512F"
 {
-  if (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) == 4)
-    return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %t6}";
-  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %g6}";
+  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %<gatherq_mode>6}";
 }
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
@@ -19159,11 +19169,11 @@ (define_insn "*avx512f_gatherdi<mode>_2"
   if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
     {
       if (<MODE_SIZE> != 64)
-	return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%t0%{%1%}, %g5}";
+	return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %<gatherq_mode>5}";
       else
-	return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %g5}";
+	return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %t5}";
     }
-  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}";
+  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<gatherq_mode>5}";
 }
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
--- gcc/testsuite/gcc.target/i386/avx512vl-pr79299-1.c.jj	2017-01-31 13:15:20.919592886 +0100
+++ gcc/testsuite/gcc.target/i386/avx512vl-pr79299-1.c	2017-01-31 14:33:38.425028304 +0100
@@ -0,0 +1,91 @@
+/* PR target/79299 */
+/* { dg-do assemble { target avx512vl } } */
+/* { dg-options "-Ofast -mavx512vl -masm=intel" } */
+
+#define N 1024
+
+unsigned long long a[N];
+unsigned int b[N], c[N], d[N], e[N], f[N];
+unsigned long long g[N], h[N], j[N], k[N];
+float l[N], m[N], n[N], o[N];
+double p[N], q[N], r[N], s[N];
+
+void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    d[i] = c[a[i]];
+  for (i = 0; i < N; i++)
+    e[i] = f[i] ? f[i] : c[a[i]];
+}
+
+void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    d[i] = c[b[i]];
+  for (i = 0; i < N; i++)
+    e[i] = f[i] ? f[i] : c[b[i]];
+}
+
+void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    h[i] = g[a[i]];
+  for (i = 0; i < N; i++)
+    j[i] = k[i] != 0.0 ? k[i] : g[a[i]];
+}
+
+void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    h[i] = g[b[i]];
+  for (i = 0; i < N; i++)
+    j[i] = k[i] != 0.0 ? k[i] : g[b[i]];
+}
+
+void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    m[i] = l[a[i]];
+  for (i = 0; i < N; i++)
+    n[i] = o[i] ? o[i] : l[a[i]];
+}
+
+void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    m[i] = c[b[i]];
+  for (i = 0; i < N; i++)
+    n[i] = o[i] ? o[i] : c[b[i]];
+}
+
+void
+f7 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    q[i] = p[a[i]];
+  for (i = 0; i < N; i++)
+    r[i] = s[i] != 0.0 ? s[i] : p[a[i]];
+}
+
+void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    q[i] = p[b[i]];
+  for (i = 0; i < N; i++)
+    r[i] = s[i] != 0.0 ? s[i] : p[b[i]];
+}
--- gcc/testsuite/gcc.target/i386/avx512vl-pr79299-2.c.jj	2017-01-31 14:19:22.435366309 +0100
+++ gcc/testsuite/gcc.target/i386/avx512vl-pr79299-2.c	2017-01-31 14:33:46.465922128 +0100
@@ -0,0 +1,293 @@
+/* PR target/79299 */
+/* { dg-do assemble { target avx512vl } } */
+/* { dg-options "-Ofast -mavx512vl -masm=intel" } */
+
+#include <immintrin.h>
+
+__m512
+f1 (__m512i x, void const *y)
+{
+  return _mm512_i32gather_ps (x, y, 1);
+}
+
+__m512
+f2 (__m512 x, __mmask16 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i32gather_ps (x, y, z, w, 1);
+}
+
+__m512d
+f3 (__m256i x, void const *y)
+{
+  return _mm512_i32gather_pd (x, y, 1);
+}
+
+__m512d
+f4 (__m512d x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm512_mask_i32gather_pd (x, y, z, w, 1);
+}
+
+__m256
+f5 (__m512i x, void const *y)
+{
+  return _mm512_i64gather_ps (x, y, 1);
+}
+
+__m256
+f6 (__m256 x, __mmask16 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i64gather_ps (x, y, z, w, 1);
+}
+
+__m512d
+f7 (__m512i x, void const *y)
+{
+  return _mm512_i64gather_pd (x, y, 1);
+}
+
+__m512d
+f8 (__m512d x, __mmask8 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i64gather_pd (x, y, z, w, 1);
+}
+
+__m512i
+f9 (__m512i x, void const *y)
+{
+  return _mm512_i32gather_epi32 (x, y, 1);
+}
+
+__m512i
+f10 (__m512i x, __mmask16 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i32gather_epi32 (x, y, z, w, 1);
+}
+
+__m512i
+f11 (__m256i x, void const *y)
+{
+  return _mm512_i32gather_epi64 (x, y, 1);
+}
+
+__m512i
+f12 (__m512i x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm512_mask_i32gather_epi64 (x, y, z, w, 1);
+}
+
+__m256i
+f13 (__m512i x, void const *y)
+{
+  return _mm512_i64gather_epi32 (x, y, 1);
+}
+
+__m256i
+f14 (__m256i x, __mmask16 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i64gather_epi32 (x, y, z, w, 1);
+}
+
+__m512i
+f15 (__m512i x, void const *y)
+{
+  return _mm512_i64gather_epi64 (x, y, 1);
+}
+
+__m512i
+f16 (__m512i x, __mmask8 y, __m512i z, void const *w)
+{
+  return _mm512_mask_i64gather_epi64 (x, y, z, w, 1);
+}
+
+__m256
+f17 (__m256 x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i32gather_ps (x, y, z, w, 1);
+}
+
+__m128
+f18 (__m128 x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_ps (x, y, z, w, 1);
+}
+
+__m256d
+f19 (__m256d x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm256_mmask_i32gather_pd (x, y, z, w, 1);
+}
+
+__m128d
+f20 (__m128d x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_pd (x, y, z, w, 1);
+}
+
+__m128
+f21 (__m128 x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_ps (x, y, z, w, 1);
+}
+
+__m128
+f22 (__m128 x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_ps (x, y, z, w, 1);
+}
+
+__m256d
+f23 (__m256d x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_pd (x, y, z, w, 1);
+}
+
+__m128d
+f24 (__m128d x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_pd (x, y, z, w, 1);
+}
+
+__m256i
+f25 (__m256i x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i32gather_epi32 (x, y, z, w, 1);
+}
+
+__m128i
+f26 (__m128i x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_epi32 (x, y, z, w, 1);
+}
+
+__m256i
+f27 (__m256i x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm256_mmask_i32gather_epi64 (x, y, z, w, 1);
+}
+
+__m128i
+f28 (__m128i x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_epi64 (x, y, z, w, 1);
+}
+
+__m128i
+f29 (__m128i x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_epi32 (x, y, z, w, 1);
+}
+
+__m128i
+f30 (__m128i x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_epi32 (x, y, z, w, 1);
+}
+
+__m256i
+f31 (__m256i x, __mmask8 y, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_epi64 (x, y, z, w, 1);
+}
+
+__m128i
+f32 (__m128i x, __mmask8 y, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_epi64 (x, y, z, w, 1);
+}
+
+__m256
+f33 (__m256 x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i32gather_ps (x, -1, z, w, 1);
+}
+
+__m128
+f34 (__m128 x, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_ps (x, -1, z, w, 1);
+}
+
+__m256d
+f35 (__m256d x, __m128i z, void const *w)
+{
+  return _mm256_mmask_i32gather_pd (x, -1, z, w, 1);
+}
+
+__m128d
+f36 (__m128d x, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_pd (x, -1, z, w, 1);
+}
+
+__m128
+f37 (__m128 x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_ps (x, -1, z, w, 1);
+}
+
+__m128
+f38 (__m128 x, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_ps (x, -1, z, w, 1);
+}
+
+__m256d
+f39 (__m256d x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_pd (x, -1, z, w, 1);
+}
+
+__m128d
+f40 (__m128d x, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_pd (x, -1, z, w, 1);
+}
+
+__m256i
+f41 (__m256i x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i32gather_epi32 (x, -1, z, w, 1);
+}
+
+__m128i
+f42 (__m128i x, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_epi32 (x, -1, z, w, 1);
+}
+
+__m256i
+f43 (__m256i x, __m128i z, void const *w)
+{
+  return _mm256_mmask_i32gather_epi64 (x, -1, z, w, 1);
+}
+
+__m128i
+f44 (__m128i x, __m128i z, void const *w)
+{
+  return _mm_mmask_i32gather_epi64 (x, -1, z, w, 1);
+}
+
+__m128i
+f45 (__m128i x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_epi32 (x, -1, z, w, 1);
+}
+
+__m128i
+f46 (__m128i x, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_epi32 (x, -1, z, w, 1);
+}
+
+__m256i
+f47 (__m256i x, __m256i z, void const *w)
+{
+  return _mm256_mmask_i64gather_epi64 (x, -1, z, w, 1);
+}
+
+__m128i
+f48 (__m128i x, __m128i z, void const *w)
+{
+  return _mm_mmask_i64gather_epi64 (x, -1, z, w, 1);
+}

	Jakub
Follow-Ups:
- Re: [PATCH] Fix -masm=intel output for AVX512{F,VL} gathers (PR target/79299)
  - From: Rainer Orth
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]