This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Optimize V8HImode UMIN reduction using PHMINPOSUW insn and some cleanup


Hi!

This patch is partly taken from my part of the PR50374 patch,
though that patch will need some further work in the vectorizer
etc.

SSE4.1 has the phminposuw insn which can be used for reduction
instead of 3 shuffles and 3 min insns:
...
-	vpsrldq	$8, %xmm0, %xmm1
-	vpminuw	%xmm1, %xmm0, %xmm0
-	vpsrldq	$4, %xmm0, %xmm1
-	vpminuw	%xmm0, %xmm1, %xmm0
-	vpsrldq	$2, %xmm0, %xmm1
-	vpminuw	%xmm0, %xmm1, %xmm0
+	vphminposuw	%xmm0, %xmm0
 	vpextrw	$0, %xmm0, %eax

E.g.
#define N 32
unsigned short b[N];
__attribute__((noinline)) unsigned short
vecumin (void)
{
  int i;
  unsigned short r = 65535;
  for (i = 0; i < N; ++i)
    if (r > b[i]) r = b[i];
  return r;
}
The function got ~12.5% faster when executing it 1000000000x
on SandyBridge.  The insn doesn't have a 256-bit counterpart
in AVX unfortunately, so it is left for V8HImode only.

The other change is just a cleanup of ix86_expand_reduc.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2011-10-13  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (reduc_umin_v8hi): New pattern.
	* config/i386/i386.c (ix86_build_const_vector): Handle
	also V32QI, V16QI, V16HI and V8HI modes.
	(emit_reduc_half): New function.
	(ix86_expand_reduc): Use phminposuw insn for V8HImode UMIN.
	Use emit_reduc_half helper function.

	* gcc.target/i386/sse4_1-phminposuw-2.c: New test.
	* gcc.target/i386/sse4_1-phminposuw-3.c: New test.
	* gcc.target/i386/avx-vphminposuw-2.c: New test.
	* gcc.target/i386/avx-vphminposuw-3.c: New test.

--- gcc/config/i386/sse.md.jj	2011-10-13 11:13:41.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-13 12:26:13.000000000 +0200
@@ -1303,6 +1303,16 @@ (define_expand "reduc_<code>_<mode>"
   DONE;
 })
 
+(define_expand "reduc_umin_v8hi"
+  [(umin:V8HI
+     (match_operand:V8HI 0 "register_operand" "")
+     (match_operand:V8HI 1 "register_operand" ""))]
+  "TARGET_SSE4_1"
+{
+  ix86_expand_reduc (gen_uminv8hi3, operands[0], operands[1]);
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel floating point comparisons
--- gcc/config/i386/i386.c.jj	2011-10-13 11:13:41.000000000 +0200
+++ gcc/config/i386/i386.c	2011-10-13 11:56:19.000000000 +0200
@@ -17008,6 +17008,10 @@ ix86_build_const_vector (enum machine_mo
 
   switch (mode)
     {
+    case V32QImode:
+    case V16QImode:
+    case V16HImode:
+    case V8HImode:
     case V8SImode:
     case V4SImode:
     case V4DImode:
@@ -33250,72 +33254,100 @@ ix86_expand_vector_extract (bool mmx_ok,
     }
 }
 
-/* Expand a vector reduction.  FN is the binary pattern to reduce;
-   DEST is the destination; IN is the input vector.  */
+/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
+   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
+   The upper bits of DEST are undefined, though they shouldn't cause
+   exceptions (some bits from src or all zeros are ok).  */
 
-void
-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+static void
+emit_reduc_half (rtx dest, rtx src, int i)
 {
-  rtx tmp1, tmp2, tmp3, tmp4, tmp5;
-  enum machine_mode mode = GET_MODE (in);
-  int i;
-
-  tmp1 = gen_reg_rtx (mode);
-  tmp2 = gen_reg_rtx (mode);
-  tmp3 = gen_reg_rtx (mode);
-
-  switch (mode)
+  rtx tem;
+  switch (GET_MODE (src))
     {
     case V4SFmode:
-      emit_insn (gen_sse_movhlps (tmp1, in, in));
-      emit_insn (fn (tmp2, tmp1, in));
-      emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
-				      const1_rtx, const1_rtx,
-				      GEN_INT (1+4), GEN_INT (1+4)));
+      if (i == 128)
+	tem = gen_sse_movhlps (dest, src, src);
+      else
+	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
+				   GEN_INT (1 + 4), GEN_INT (1 + 4));
+      break;
+    case V2DFmode:
+      tem = gen_vec_interleave_highv2df (dest, src, src);
+      break;
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+      tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
+				gen_lowpart (V1TImode, src),
+				GEN_INT (i / 2));
       break;
     case V8SFmode:
-      tmp4 = gen_reg_rtx (mode);
-      tmp5 = gen_reg_rtx (mode);
-      emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx));
-      emit_insn (fn (tmp5, tmp4, in));
-      emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12)));
-      emit_insn (fn (tmp2, tmp1, tmp5));
-      emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx));
+      if (i == 256)
+	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
+      else
+	tem = gen_avx_shufps256 (dest, src, src,
+				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
       break;
     case V4DFmode:
-      emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx));
-      emit_insn (fn (tmp2, tmp1, in));
-      emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx));
+      if (i == 256)
+	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
+      else
+	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
       break;
     case V32QImode:
     case V16HImode:
     case V8SImode:
     case V4DImode:
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, tmp1),
-				    gen_lowpart (V4DImode, in),
-				    gen_lowpart (V4DImode, in),
-				    const1_rtx));
-      tmp4 = in;
-      tmp5 = tmp1;
-      for (i = 64; i >= GET_MODE_BITSIZE (GET_MODE_INNER (mode)); i >>= 1)
-	{
-	  if (i != 64)
-	    {
-	      tmp2 = gen_reg_rtx (mode);
-	      tmp3 = gen_reg_rtx (mode);
-	    }
-	  emit_insn (fn (tmp2, tmp4, tmp5));
-	  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, tmp3),
-					 gen_lowpart (V2TImode, tmp2),
-					 GEN_INT (i)));
-	  tmp4 = tmp2;
-	  tmp5 = tmp3;
-	}
+      if (i == 256)
+	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
+				 gen_lowpart (V4DImode, src),
+				 gen_lowpart (V4DImode, src),
+				 const1_rtx);
+      else
+	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
+				  gen_lowpart (V2TImode, src),
+				  GEN_INT (i / 2));
       break;
     default:
       gcc_unreachable ();
     }
-  emit_insn (fn (dest, tmp2, tmp3));
+  emit_insn (tem);
+}
+
+/* Expand a vector reduction.  FN is the binary pattern to reduce;
+   DEST is the destination; IN is the input vector.  */
+
+void
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+  rtx half, dst, vec = in;
+  enum machine_mode mode = GET_MODE (in);
+  int i;
+
+  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
+  if (TARGET_SSE4_1
+      && mode == V8HImode
+      && fn == gen_uminv8hi3)
+    {
+      emit_insn (gen_sse4_1_phminposuw (dest, in));
+      return;
+    }
+
+  for (i = GET_MODE_BITSIZE (mode);
+       i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+       i >>= 1)
+    {
+      half = gen_reg_rtx (mode);
+      emit_reduc_half (half, vec, i);
+      if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
+	dst = dest;
+      else
+	dst = gen_reg_rtx (mode);
+      emit_insn (fn (dst, half, vec));
+      vec = dst;
+    }
 }
 
 /* Target hook for scalar_mode_supported_p.  */
--- gcc/testsuite/gcc.target/i386/sse4_1-phminposuw-2.c.jj	2011-10-13 12:43:45.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-phminposuw-2.c	2011-10-13 12:50:54.000000000 +0200
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O3 -msse4.1 -mno-avx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+extern void abort (void);
+
+#define N 1024
+short a[N], c, e;
+unsigned short b[N], d, f;
+
+__attribute__((noinline)) short
+vecsmax (void)
+{
+  int i;
+  short r = -32768;
+  for (i = 0; i < N; ++i)
+    if (r < a[i]) r = a[i];
+  return r;
+}
+
+__attribute__((noinline)) unsigned short
+vecumax (void)
+{
+  int i;
+  unsigned short r = 0;
+  for (i = 0; i < N; ++i)
+    if (r < b[i]) r = b[i];
+  return r;
+}
+
+__attribute__((noinline)) short
+vecsmin (void)
+{
+  int i;
+  short r = 32767;
+  for (i = 0; i < N; ++i)
+    if (r > a[i]) r = a[i];
+  return r;
+}
+
+__attribute__((noinline)) unsigned short
+vecumin (void)
+{
+  int i;
+  unsigned short r = 65535;
+  for (i = 0; i < N; ++i)
+    if (r > b[i]) r = b[i];
+  return r;
+}
+
+static void
+TEST (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    {
+      a[i] = i - N / 2;
+      b[i] = i + 32768 - N / 2;
+    }
+  a[N / 3] = N;
+  a[2 * N / 3] = -N;
+  b[N / 5] = 32768 + N;
+  b[4 * N / 5] = 32768 - N;
+  if (vecsmax () != N || vecsmin () != -N)
+    abort ();
+  if (vecumax () != 32768 + N || vecumin () != 32768 - N)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-phminposuw-3.c.jj	2011-10-13 12:48:49.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-phminposuw-3.c	2011-10-13 12:50:57.000000000 +0200
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx2" } */
+
+#include "sse4_1-phminposuw-2.c"
+
+/* { dg-final { scan-assembler "phminposuw\[^\n\r\]*xmm" } } */
--- gcc/testsuite/gcc.target/i386/avx-vphminposuw-2.c.jj	2011-10-13 12:45:46.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vphminposuw-2.c	2011-10-13 12:50:46.000000000 +0200
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx -mno-avx2" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-phminposuw-2.c"
--- gcc/testsuite/gcc.target/i386/avx-vphminposuw-3.c.jj	2011-10-13 12:49:40.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vphminposuw-3.c	2011-10-13 12:50:50.000000000 +0200
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx -mno-avx2" } */
+
+#include "avx-vphminposuw-2.c"
+
+/* { dg-final { scan-assembler "vphminposuw\[^\n\r\]*xmm" } } */

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]