[PATCH] Optimize AVX512 vpcmpeq* against 0 into vptestnm* rather than vptestm* (PR target/85832, PR target/86036)

Jakub Jelinek jakub@redhat.com
Mon Jun 4 13:08:00 GMT 2018


Hi!

On Wed, May 23, 2018 at 08:45:19AM +0200, Jakub Jelinek wrote:
> As mentioned in the PR, vptestm* instructions with the same input operand used
> twice perform the same comparison as vpcmpeq* against zero vector, with the
> advantage that a register holding CONST0_RTX (mode) is not needed.
> 
> 2018-05-23  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR target/85832
> 	* config/i386/sse.md (<avx512>_eq<mode>3<mask_scalar_merge_name>_1):
> 	Add (=Yk,v,C) variant using vptestm insn.  Use TARGET_AVX512BW
> 	in test instead of TARGET_AVX512F for VI12_AVX512VL iterator.
> 
> 	* gcc.target/i386/avx512f-pr85832.c: New test.
> 	* gcc.target/i386/avx512vl-pr85832.c: New test.
> 	* gcc.target/i386/avx512bw-pr85832.c: New test.
> 	* gcc.target/i386/avx512vlbw-pr85832.c: New test.

Unfortunately, I neither added an executable testcase nor tested the patch
under sde, so I missed that e.g. vpcmpeqw with a zero vector as one of its
operands doesn't do what vptestmw with the other argument repeated does; it
does exactly the opposite.  vpcmpeqw sets bits in the mask register for
elements that are equal to 0, whereas vptestmw sets bits in the mask register
for elements where the bitwise AND of the first and second arguments (i.e. of
the repeated argument with itself) is non-zero.  Fortunately there is
vptestnmw, which does what we want.

Bootstrapped/regtested on x86_64-linux and i686-linux, and additionally ran
the testcase under sde.  Ok for trunk?

2018-06-04  Jakub Jelinek  <jakub@redhat.com>

	PR target/85832
	PR target/86036
	* config/i386/sse.md (<avx512>_eq<mode>3<mask_scalar_merge_name>_1):
	Use vptestnm rather than vptestm in (=Yk,v,C) variant.

	* gcc.target/i386/avx512f-pr85832.c: Expect vptestnm rather than
	vptestm.
	* gcc.target/i386/avx512vl-pr85832.c: Likewise.
	* gcc.target/i386/avx512vlbw-pr85832.c: Likewise.
	* gcc.target/i386/avx512bw-pr85832.c: Likewise.
	* gcc.target/i386/avx512bw-pr86036.c: New test.

--- gcc/config/i386/sse.md.jj	2018-05-31 20:53:41.933453308 +0200
+++ gcc/config/i386/sse.md	2018-06-04 10:29:02.667720644 +0200
@@ -11287,7 +11287,7 @@ (define_insn "<avx512>_eq<mode>3<mask_sc
   "TARGET_AVX512BW && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
   "@
    vpcmpeq<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}
-   vptestm<ssemodesuffix>\t{%1, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %1}"
+   vptestnm<ssemodesuffix>\t{%1, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %1}"
   [(set_attr "type" "ssecmp")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "evex")
@@ -11302,7 +11302,7 @@ (define_insn "<avx512>_eq<mode>3<mask_sc
   "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
   "@
    vpcmpeq<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}
-   vptestm<ssemodesuffix>\t{%1, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %1}"
+   vptestnm<ssemodesuffix>\t{%1, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %1}"
   [(set_attr "type" "ssecmp")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "evex")
--- gcc/testsuite/gcc.target/i386/avx512f-pr85832.c.jj	2018-05-25 14:35:23.123416639 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr85832.c	2018-06-04 11:00:00.773880446 +0200
@@ -1,8 +1,8 @@
 /* PR target/85832 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f -mno-avx512vl -mno-avx512bw -masm=att" } */
-/* { dg-final { scan-assembler-times {\mvptestmd\M} 1 } } */
-/* { dg-final { scan-assembler-times {\mvptestmq\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmq\M} 1 } } */
 
 #include <x86intrin.h>
 
--- gcc/testsuite/gcc.target/i386/avx512vl-pr85832.c.jj	2018-05-25 14:35:23.123416639 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-pr85832.c	2018-06-04 11:00:09.995895313 +0200
@@ -1,8 +1,8 @@
 /* PR target/85832 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512vl -mno-avx512bw -masm=att" } */
-/* { dg-final { scan-assembler-times {\mvptestmd\M} 2 } } */
-/* { dg-final { scan-assembler-times {\mvptestmq\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmq\M} 2 } } */
 
 #include <x86intrin.h>
 
--- gcc/testsuite/gcc.target/i386/avx512vlbw-pr85832.c.jj	2018-05-25 14:35:23.124416640 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vlbw-pr85832.c	2018-06-04 11:00:06.020888898 +0200
@@ -1,8 +1,8 @@
 /* PR target/85832 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512vl -mavx512bw -masm=att" } */
-/* { dg-final { scan-assembler-times {\mvptestmb\M} 2 } } */
-/* { dg-final { scan-assembler-times {\mvptestmw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmw\M} 2 } } */
 
 #include <x86intrin.h>
 
--- gcc/testsuite/gcc.target/i386/avx512bw-pr85832.c.jj	2018-05-25 14:35:23.124416640 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr85832.c	2018-06-04 10:59:53.015867934 +0200
@@ -1,8 +1,8 @@
 /* PR target/85832 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw -mno-avx512vl -masm=att" } */
-/* { dg-final { scan-assembler-times {\mvptestmb\M} 1 } } */
-/* { dg-final { scan-assembler-times {\mvptestmw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmb\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvptestnmw\M} 1 } } */
 
 #include <x86intrin.h>
 
--- gcc/testsuite/gcc.target/i386/avx512bw-pr86036.c.jj	2018-06-04 11:04:24.860193859 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr86036.c	2018-06-04 11:18:47.618218756 +0200
@@ -0,0 +1,48 @@
+/* PR target/86036 */
+/* { dg-do run } */
+/* { dg-options "-O -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+typedef unsigned short V __attribute__ ((vector_size (64)));
+
+__attribute__((noipa)) V
+foo (V a)
+{
+  return a >= 3;
+}
+
+__attribute__((noipa)) V
+bar (V a)
+{
+  return a != 0;
+}
+
+__attribute__((noipa)) V
+baz (V a)
+{
+  return a == 0;
+}
+
+void
+TEST (void)
+{
+  V a = (V) { 3, 17, 2, 0, 9, 1, 2, 3, 0, 0, 0, 3, 3, 3, 3, 3,
+	      9, 16387, 9, 3, 3, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3, 3 };
+  V b = foo (a);
+  V c = (V) { -1, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, -1, -1, -1, -1,
+	      -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1 };
+  if (__builtin_memcmp (&b, &c, sizeof (b)))
+    abort ();
+  V d = bar (a);
+  V e = (V) { -1, -1, -1, 0, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1,
+	      -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1 };
+  if (__builtin_memcmp (&d, &e, sizeof (d)))
+    abort ();
+  V f = baz (a);
+  V g = ~e;
+  if (__builtin_memcmp (&f, &g, sizeof (f)))
+    abort ();
+}


	Jakub



More information about the Gcc-patches mailing list