This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: PATCH: PR target/42542: Vectorizer produces incorrect results on max of signed intergers


On Mon, Jan 4, 2010 at 6:17 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Dec 30, 2009 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>
>> x86 integer vector subtraction insns don't set EFLAGS and there is no
>> parallel unsigned V4SI saturating subtraction. We can't easily detect
>> unsigned underflow. We can only enable umaxv4si3 and uminv4si3 for
>> SSE4.1 and XOP. ?Tested on Linux/Intel Core i7. ?OK for trunk and 4.3/4.4?
>
> I see no other way to fixup the underflow. Some comments below:
>
>> ? [(set (match_operand:SSEMODE24 0 "register_operand" "")
>> ? ? ? ?(umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
>> ? ? ? ? ? ? ? ? ? ? ? ?(match_operand:SSEMODE24 2 "register_operand" "")))]
>> - ?"TARGET_SSE2"
>> + ?"(TARGET_SSE2 && <MODE>mode == V8HImode)
>> + ? || TARGET_SSE4_1
>> + ? || TARGET_XOP"
>> ?{
>> ? if (TARGET_SSE4_1)
>> ? ? ix86_fixup_binary_operands_no_copy (UMIN, <MODE>mode, operands);
>
> Please split above pattern into two non-macroized patterns using V8HI
> and V4SI modes.
>
>> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c
>> index dbb154d..8da7fb0 100644
>> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c
>> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c
>> @@ -52,5 +52,5 @@ int main (void)
>> ? return 0;
>> ?}
>>
>> -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail { vect_no_int_add || vect_no_int_max } } } } */
>> +/* { dg-final { if { ![ istarget i?86-*-* ] && ![ istarget x86_64-*-* ] } { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail { vect_no_int_add || vect_no_int_max } } } } } */
>> ?/* { dg-final { cleanup-tree-dump "vect" } } */
>
> Please also split i.e. vect_no_uint_max (and probably some kind of
> vect_no_ushort_max) from vect_no_int_max procedure (defined in
> testsuite/lib/target_supports.exp) and use them where appropriate in
> vectorizer testsuite.
>
> The patch is OK with these changes, but you will need separate
> approval for generic, non-target dependant testsuite changes. Please
> wait some time before the patch is backported to 4.4. and 4.3.
>

This is the x86 patch I checked. I will submit a separate patch for
generic, non-target dependant testsuite changes. Before that change
is checked in, gcc.dg/vect/vect-reduc-1.c will fail on x86.

Thanks.


-- 
H.J.
gcc/

2010-01-04  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/42542
	* config/i386/i386.c (ix86_expand_int_vcond): Don't convert
	GTU to GT for V4SI and V2DI.

	* config/i386/sse.md (umaxv4si3): Enabled for SSE4.1 and XOP.
	(umin<mode>3): Removed.
	(uminv8hi3): New.
	(uminv4si3): Likewise.

gcc/testsuite/

2010-01-04  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/42542
	* gcc.target/i386/pr42542-1.c: New.
	* gcc.target/i386/pr42542-1a.c: Likewise.
	* gcc.target/i386/pr42542-1b.c: Likewise.
	* gcc.target/i386/pr42542-2.c: Likewise.
	* gcc.target/i386/pr42542-2a.c: Likewise.
	* gcc.target/i386/pr42542-2b.c: Likewise.
	* gcc.target/i386/pr42542-3.c: Likewise.
	* gcc.target/i386/pr42542-3a.c: Likewise.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 3e15b9d..fdcc6b0 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -16252,37 +16252,6 @@ ix86_expand_int_vcond (rtx operands[])
 
 	  switch (mode)
 	    {
-	    case V4SImode:
-	    case V2DImode:
-		{
-		  rtx t1, t2, mask;
-
-		  /* Perform a parallel modulo subtraction.  */
-		  t1 = gen_reg_rtx (mode);
-		  emit_insn ((mode == V4SImode
-			      ? gen_subv4si3
-			      : gen_subv2di3) (t1, cop0, cop1));
-
-		  /* Extract the original sign bit of op0.  */
-		  mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
-						  true, false);
-		  t2 = gen_reg_rtx (mode);
-		  emit_insn ((mode == V4SImode
-			      ? gen_andv4si3
-			      : gen_andv2di3) (t2, cop0, mask));
-
-		  /* XOR it back into the result of the subtraction.
-		     This results in the sign bit set iff we saw
-		     unsigned underflow.  */
-		  x = gen_reg_rtx (mode);
-		  emit_insn ((mode == V4SImode
-			      ? gen_xorv4si3
-			      : gen_xorv2di3) (x, t1, t2));
-
-		  code = GT;
-		}
-	      break;
-
 	    case V16QImode:
 	    case V8HImode:
 	      /* Perform a parallel unsigned saturating subtraction.  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9bbea80..787fc98 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6138,7 +6138,7 @@
   [(set (match_operand:V4SI 0 "register_operand" "")
 	(umax:V4SI (match_operand:V4SI 1 "register_operand" "")
 		   (match_operand:V4SI 2 "register_operand" "")))]
-  "TARGET_SSE2"
+  "TARGET_SSE4_1 || TARGET_XOP"
 {
   if (TARGET_SSE4_1)
     ix86_fixup_binary_operands_no_copy (UMAX, V4SImode, operands);
@@ -6195,14 +6195,39 @@
     }
 })
 
-(define_expand "umin<mode>3"
-  [(set (match_operand:SSEMODE24 0 "register_operand" "")
-	(umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
-			(match_operand:SSEMODE24 2 "register_operand" "")))]
+(define_expand "uminv8hi3"
+  [(set (match_operand:V8HI 0 "register_operand" "")
+	(umin:V8HI (match_operand:V8HI 1 "register_operand" "")
+		   (match_operand:V8HI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1)
-    ix86_fixup_binary_operands_no_copy (UMIN, <MODE>mode, operands);
+    ix86_fixup_binary_operands_no_copy (UMIN, V8HImode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      xops[0] = operands[0];
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+      xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
+})
+
+(define_expand "uminv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "")
+	(umin:V4SI (match_operand:V4SI 1 "register_operand" "")
+		   (match_operand:V4SI 2 "register_operand" "")))]
+  "TARGET_SSE4_1 || TARGET_XOP"
+{
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMIN, V4SImode, operands);
   else
     {
       rtx xops[6];
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-1.c b/gcc/testsuite/gcc.target/i386/pr42542-1.c
new file mode 100644
index 0000000..6e115c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-1.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -msse2 -ftree-vectorize" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+unsigned int v1[] __attribute__ ((aligned(16))) =
+{
+  0x80000000, 1, 0xa0000000, 2,
+  3, 0xd0000000, 0xf0000000, 0xe0000000
+};
+unsigned int v2[] __attribute__ ((aligned(16))) =
+{
+  4, 0xb0000000, 5, 0xc0000000,
+  0xd0000000, 6, 7, 8
+};
+
+unsigned int max[] =
+{
+  0x80000000, 0xb0000000, 0xa0000000, 0xc0000000,
+  0xd0000000, 0xd0000000, 0xf0000000, 0xe0000000
+};
+
+unsigned int min[] =
+{
+  4, 1, 5, 2,
+  3, 6, 7, 8
+};
+
+unsigned int res[16] __attribute__ ((aligned(16)));
+
+extern void abort (void);
+
+void
+find_max (void)
+{
+  int i;
+
+  for (i = 0; i < 8; i++)
+    res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
+}
+
+void
+find_min (void)
+{
+  int i;
+
+  for (i = 0; i < 8; i++)
+    res[i] = v1[i] > v2[i] ? v2[i] : v1[i];
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int err = 0;
+
+  find_max ();
+  for (i = 0; i < 8; i++)
+    if (res[i] != max[i])
+      err++;
+
+  find_min ();
+  for (i = 0; i < 8; i++)
+    if (res[i] != min[i])
+      err++;
+
+  if (err)
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-1a.c b/gcc/testsuite/gcc.target/i386/pr42542-1a.c
new file mode 100644
index 0000000..cd77175
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-1a.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O1 -msse4.1 -ftree-vectorize" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "pr42542-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-1b.c b/gcc/testsuite/gcc.target/i386/pr42542-1b.c
new file mode 100644
index 0000000..7651f07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-1b.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -msse4.1 -ftree-vectorize" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "pr42542-1.c"
+
+/* { dg-final { scan-assembler "pmaxud" } } */
+/* { dg-final { scan-assembler "pminud" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-2.c b/gcc/testsuite/gcc.target/i386/pr42542-2.c
new file mode 100644
index 0000000..fc59534
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-2.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -msse2 -ftree-vectorize" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+unsigned short v1[] __attribute__ ((aligned(16))) =
+{
+  0x8000, 0x9000, 1, 10, 0xa000, 0xb000, 2, 20,
+  3, 30, 0xd000, 0xe000, 0xf000, 0xe000, 25, 30
+};
+unsigned short v2[] __attribute__ ((aligned(16))) =
+{
+  4, 40, 0xb000, 0x8000, 5, 50, 0xc000, 0xf000,
+  0xd000, 0xa000, 6, 65, 7, 75, 0xe000, 0xc000
+};
+
+unsigned short max[] =
+{
+  0x8000, 0x9000, 0xb000, 0x8000, 0xa000, 0xb000, 0xc000, 0xf000,
+  0xd000, 0xa000, 0xd000, 0xe000, 0xf000, 0xe000, 0xe000, 0xc000
+};
+
+unsigned short min[] =
+{
+  4, 40, 1, 10, 5, 50, 2, 20,
+  3, 30, 6, 65, 7, 75, 25, 30
+};
+
+unsigned short res[16] __attribute__ ((aligned(16)));
+
+extern void abort (void);
+
+void
+find_max (void)
+{
+  int i;
+
+  for (i = 0; i < 16; i++)
+    res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
+}
+
+void
+find_min (void)
+{
+  int i;
+
+  for (i = 0; i < 16; i++)
+    res[i] = v1[i] > v2[i] ? v2[i] : v1[i];
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int err = 0;
+
+  find_max ();
+  for (i = 0; i < 16; i++)
+    if (res[i] != max[i])
+      err++;
+
+  find_min ();
+  for (i = 0; i < 16; i++)
+    if (res[i] != min[i])
+      err++;
+
+  if (err)
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-2a.c b/gcc/testsuite/gcc.target/i386/pr42542-2a.c
new file mode 100644
index 0000000..bcefa9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-2a.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O1 -msse4.1 -ftree-vectorize" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "pr42542-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-2b.c b/gcc/testsuite/gcc.target/i386/pr42542-2b.c
new file mode 100644
index 0000000..ddb539b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-2b.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -msse4.1 -ftree-vectorize" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "pr42542-2.c"
+
+/* { dg-final { scan-assembler "pmaxuw" } } */
+/* { dg-final { scan-assembler "pminuw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-3.c b/gcc/testsuite/gcc.target/i386/pr42542-3.c
new file mode 100644
index 0000000..028d2f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-3.c
@@ -0,0 +1,85 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -msse2 -ftree-vectorize" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+unsigned char v1[] __attribute__ ((aligned(16))) =
+{
+  0x80, 0xd0, 0x90, 0xa0, 1, 15, 10, 15,
+  0xa0, 0xc0, 0xb0, 0xf0, 2, 25, 20, 35,
+  3, 34, 30, 36, 0xd0, 0x80, 0xe0, 0xb0,
+  0xf0, 0xe0, 0xe0, 0x80, 25, 34, 30, 40
+};
+unsigned char v2[] __attribute__ ((aligned(16))) =
+{
+  4, 44, 40, 48, 0xb0, 0x80, 0x80, 0x90,
+  5, 55, 50, 51, 0xc0, 0xb0, 0xf0, 0xd0,
+  0xd0, 0x80, 0xa0, 0xf0, 6, 61, 65, 68,
+  7, 76, 75, 81, 0xe0, 0xf0, 0xc0, 0x90
+};
+
+unsigned char max[] =
+{
+  0x80, 0xd0, 0x90, 0xa0, 0xb0, 0x80, 0x80, 0x90,
+  0xa0, 0xc0, 0xb0, 0xf0, 0xc0, 0xb0, 0xf0, 0xd0,
+  0xd0, 0x80, 0xa0, 0xf0, 0xd0, 0x80, 0xe0, 0xb0,
+  0xf0, 0xe0, 0xe0, 0x80, 0xe0, 0xf0, 0xc0, 0x90
+};
+
+unsigned char min[] =
+{
+  4, 44, 40, 48, 1, 15, 10, 15,
+  5, 55, 50, 51, 2, 25, 20, 35,
+  3, 34, 30, 36, 6, 61, 65, 68,
+  7, 76, 75, 81, 25, 34, 30, 40
+};
+
+unsigned char res[32] __attribute__ ((aligned(16)));
+
+extern void abort (void);
+
+void
+find_max (void)
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
+}
+
+void
+find_min (void)
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    res[i] = v1[i] > v2[i] ? v2[i] : v1[i];
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int err = 0;
+
+  find_max ();
+  for (i = 0; i < 32; i++)
+    if (res[i] != max[i])
+      err++;
+
+  find_min ();
+  for (i = 0; i < 32; i++)
+    if (res[i] != min[i])
+      err++;
+
+  if (err)
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr42542-3a.c b/gcc/testsuite/gcc.target/i386/pr42542-3a.c
new file mode 100644
index 0000000..754e59e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr42542-3a.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -msse2 -ftree-vectorize" } */
+
+#include "pr42542-3.c"
+
+/* { dg-final { scan-assembler "pmaxub" } } */
+/* { dg-final { scan-assembler "pminub" } } */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]