[PATCH] Fix AVX512* wrong-code due to *<extract_type>_vinsert<shuffletype><extract_suf>_0 (PR target/90991)

Jakub Jelinek <jakub@redhat.com>
Wed Jun 26 08:11:00 GMT 2019


Hi!

The following testcase is miscompiled starting with my PR85480 change.
It is perfectly fine to use the "xm" or "vm" constraints for the source
operand when the other operand is "C", because we rely on the AVX/AVX512
behavior that most 128-bit and 256-bit vector instructions clear the upper
bits of the corresponding 512-bit destination register.  But if the source
is in memory, we need to check whether it might be misaligned and use an
unaligned load in that case, rather than always emitting aligned loads.
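
For reference, a reduced sketch of the problem (it mirrors f4 in the new
testcase below; the function name here is just for illustration): the
unaligned load gets folded into the insert-into-zero pattern, and before
this fix the pattern always emitted an aligned vmovaps for it, which can
fault at runtime on a misaligned pointer.

  #include <x86intrin.h>

  __m512
  foo (void *a)
  {
    /* _mm_loadu_ps is an unaligned load; inserting the low 128 bits into
       a zero vector is matched by the *_vinsert*_0 pattern in sse.md.  */
    return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_loadu_ps (a), 0);
  }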

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk and 9.2?

Note, I have a follow-up patch to improve avx_vec_concat<mode>; I just need
to test it first.

2019-06-26  Jakub Jelinek  <jakub@redhat.com>

	PR target/90991
	* config/i386/sse.md
	(*<extract_type>_vinsert<shuffletype><extract_suf>_0): Use vmovupd,
	vmovups, vmovdqu, vmovdqu32 or vmovdqu64 instead of the aligned
	insns if operands[2] is misaligned_operand.

	* gcc.target/i386/avx512dq-pr90991-1.c: New test.

--- gcc/config/i386/sse.md.jj	2019-06-21 08:43:17.734263742 +0200
+++ gcc/config/i386/sse.md	2019-06-25 22:36:12.955476294 +0200
@@ -13744,15 +13744,29 @@ (define_insn "*<extract_type>_vinsert<sh
   switch (<MODE>mode)
     {
     case E_V8DFmode:
-      return "vmovapd\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return "vmovupd\t{%2, %x0|%x0, %2}";
+      else
+	return "vmovapd\t{%2, %x0|%x0, %2}";
     case E_V16SFmode:
-      return "vmovaps\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return "vmovups\t{%2, %x0|%x0, %2}";
+      else
+	return "vmovaps\t{%2, %x0|%x0, %2}";
     case E_V8DImode:
-      return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
-				    : "vmovdqa\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
+				      : "vmovdqu\t{%2, %x0|%x0, %2}";
+      else
+	return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
+				      : "vmovdqa\t{%2, %x0|%x0, %2}";
     case E_V16SImode:
-      return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
-				    : "vmovdqa\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
+				      : "vmovdqu\t{%2, %x0|%x0, %2}";
+      else
+	return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
+				      : "vmovdqa\t{%2, %x0|%x0, %2}";
     default:
       gcc_unreachable ();
     }
--- gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c.jj	2019-06-25 23:17:44.831272146 +0200
+++ gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c	2019-06-25 23:27:27.732357035 +0200
@@ -0,0 +1,47 @@
+/* PR target/90991 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512dq -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovups\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovupd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+
+#include <x86intrin.h>
+
+__m512
+f1 (void *a)
+{
+  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_load_ps (a), 0);
+}
+
+__m512d
+f2 (void *a)
+{
+  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_load_pd (a), 0);
+}
+
+__m512i
+f3 (void *a)
+{
+  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_load_si128 (a), 0);
+}
+
+__m512
+f4 (void *a)
+{
+  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_loadu_ps (a), 0);
+}
+
+__m512d
+f5 (void *a)
+{
+  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_loadu_pd (a), 0);
+}
+
+__m512i
+f6 (void *a)
+{
+  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_loadu_si128 (a), 0);
+}

	Jakub


