This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC PATCH] AVX2 32-byte integer {s,u}m{in,ax} and vcond{,u} patterns


Hi!

On Fri, Sep 16, 2011 at 01:24:53PM +0200, Jakub Jelinek wrote:
> Surprisingly with -mavx2 the integer loops aren't vectorized with
> 32-byte vectors, wonder why.  But looking at the integer umin/umax/smin/smax
> 16-byte reductions they generate good code even without reduc_* patterns,
> apparently using vector shifts.

It seems that on that testcase the integer loops weren't using 32-byte
vectors because there were no expanders for 32-byte integer min/max.
The following patch adds those (and also the 32-byte integer condition
expanders vcond/vcondu, because they are related).  With this, all the
integer loops in that testcase are nicely vectorized with 32-byte vectors
with -mavx2; unfortunately, the reductions look terrible.

The problem is that AVX2 doesn't have a 32-byte whole vector shift right
(well, in theory it has one if the shift count is exactly 128 bits -
vextractf128).  For shift counts > 128 we could in theory handle it as two
instructions, vextractf128 plus a 16-byte whole vector shift with
count - 128, but reductions actually don't need the two steps: we only care
about the bottom bits after the shifts, and the upper bits can contain
anything.

So, we can either fix this by adding reduc_{smin,smax,umin,umax}_v{32q,16h,8s,4d}i
patterns (at that point I guess I should just macroize them together with
the reduc_{smin,smax,umin,umax}_v{4sf,8sf,4df}) and handling the 4 32-byte
integer modes also in ix86_expand_reduc, or come up with some new optab
(and a corresponding tree code) for an operation like whole vector shift
right, but which would allow the upper bits to be undefined and would only
allow shifts by vector size / 2, / 4, / 8 and so on, down to the element size.
Which do you prefer?

OT: it seems the AVX2 support put the avx2_<code><mode>3 and
*avx2_<code><mode>3 patterns (the former is <code><mode>3 after this patch)
in the wrong spot, in between the vec_shr_<mode> expander and the
sse2_lshrv1ti3 insn which implements what the expander expands.  Uros, would
you like to move them elsewhere?  Where exactly?

This patch has been tested on x86_64-linux and i686-linux on SandyBridge.

2011-09-16  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (ix86_build_const_vector): Handle V8SImode
	and V4DImode.
	(ix86_build_signbit_mask): Likewise.
	(ix86_expand_int_vcond): Likewise.  Handle V16HImode and
	V32QImode.
	(bdesc_args): Use CODE_FOR_{s,u}m{ax,in}v{32q,16h,8s}i3
	instead of CODE_FOR_avx2_{s,u}m{ax,in}v{32q,16h,8s}i3.
	* config/i386/sse.md (avx2_<code><mode>3 umaxmin expand): Rename
	to...
	(<code><mode>3) ... this.
	(avx2_<code><mode>3 smaxmin expand): Rename to...
	(<code><mode>3) ... this.
	(smax<mode>3, smin<mode>3): Macroize using smaxmin code iterator.
	(smaxv2di3, sminv2di3): Macroize using smaxmin code iterator and
	VI8_AVX2 mode iterator.
	(umaxv2di3, uminv2di3): Macroize using umaxmin code iterator and
	VI8_AVX2 mode iterator.
	(vcond<V_256:mode><VI_256:mode>, vcondu<V_256:mode><VI_256:mode>):
	New expanders.
	
--- gcc/config/i386/i386.c.jj	2011-09-16 11:54:27.000000000 +0200
+++ gcc/config/i386/i386.c	2011-09-16 16:46:12.000000000 +0200
@@ -16951,7 +16951,9 @@ ix86_build_const_vector (enum machine_mo
 
   switch (mode)
     {
+    case V8SImode:
     case V4SImode:
+    case V4DImode:
     case V2DImode:
       gcc_assert (vect);
     case V8SFmode:
@@ -16992,6 +16994,7 @@ ix86_build_signbit_mask (enum machine_mo
   /* Find the sign bit, sign extended to 2*HWI.  */
   switch (mode)
     {
+    case V8SImode:
     case V4SImode:
     case V8SFmode:
     case V4SFmode:
@@ -17001,6 +17004,7 @@ ix86_build_signbit_mask (enum machine_mo
       lo = 0x80000000, hi = lo < 0;
       break;
 
+    case V4DImode:
     case V2DImode:
     case V4DFmode:
     case V2DFmode:
@@ -19112,17 +19116,26 @@ ix86_expand_int_vcond (rtx operands[])
 
 	  switch (mode)
 	    {
+	    case V8SImode:
+	    case V4DImode:
 	    case V4SImode:
 	    case V2DImode:
 		{
 		  rtx t1, t2, mask;
 		  rtx (*gen_sub3) (rtx, rtx, rtx);
 
+		  switch (mode)
+		    {
+		    case V8SImode: gen_sub3 = gen_subv8si3; break;
+		    case V4DImode: gen_sub3 = gen_subv4di3; break;
+		    case V4SImode: gen_sub3 = gen_subv4si3; break;
+		    case V2DImode: gen_sub3 = gen_subv2di3; break;
+		    default:
+		      gcc_unreachable ();
+		    }
 		  /* Subtract (-(INT MAX) - 1) from both operands to make
 		     them signed.  */
 		  mask = ix86_build_signbit_mask (mode, true, false);
-		  gen_sub3 = (mode == V4SImode
-			      ? gen_subv4si3 : gen_subv2di3);
 		  t1 = gen_reg_rtx (mode);
 		  emit_insn (gen_sub3 (t1, cop0, mask));
 
@@ -19135,6 +19148,8 @@ ix86_expand_int_vcond (rtx operands[])
 		}
 	      break;
 
+	    case V32QImode:
+	    case V16HImode:
 	    case V16QImode:
 	    case V8HImode:
 	      /* Perform a parallel unsigned saturating subtraction.  */
@@ -25728,18 +25743,18 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2  , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
--- gcc/config/i386/sse.md.jj	2011-09-16 15:12:43.000000000 +0200
+++ gcc/config/i386/sse.md	2011-09-16 17:04:07.000000000 +0200
@@ -5806,7 +5806,7 @@ (define_expand "vec_shr_<mode>"
   operands[1] = gen_lowpart (V1TImode, operands[1]);
 })
 
-(define_expand "avx2_<code><mode>3"
+(define_expand "<code><mode>3"
   [(set (match_operand:VI124_256 0 "register_operand" "")
 	(umaxmin:VI124_256
 	  (match_operand:VI124_256 1 "nonimmediate_operand" "")
@@ -5853,7 +5853,7 @@ (define_insn "sse2_lshrv1ti3"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_expand "avx2_<code><mode>3"
+(define_expand "<code><mode>3"
   [(set (match_operand:VI124_256 0 "register_operand" "")
 	(smaxmin:VI124_256
 	  (match_operand:VI124_256 1 "nonimmediate_operand" "")
@@ -5904,47 +5904,22 @@ (define_insn "*<code>v8hi3"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_expand "smax<mode>3"
-  [(set (match_operand:VI14_128 0 "register_operand" "")
-	(smax:VI14_128 (match_operand:VI14_128 1 "register_operand" "")
-		       (match_operand:VI14_128 2 "register_operand" "")))]
-  "TARGET_SSE2"
-{
-  if (TARGET_SSE4_1)
-    ix86_fixup_binary_operands_no_copy (SMAX, <MODE>mode, operands);
-  else
-    {
-      rtx xops[6];
-      bool ok;
-
-      xops[0] = operands[0];
-      xops[1] = operands[1];
-      xops[2] = operands[2];
-      xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-      xops[4] = operands[1];
-      xops[5] = operands[2];
-      ok = ix86_expand_int_vcond (xops);
-      gcc_assert (ok);
-      DONE;
-    }
-})
-
-(define_expand "smin<mode>3"
+(define_expand "<code><mode>3"
   [(set (match_operand:VI14_128 0 "register_operand" "")
-	(smin:VI14_128 (match_operand:VI14_128 1 "register_operand" "")
-		       (match_operand:VI14_128 2 "register_operand" "")))]
+	(smaxmin:VI14_128 (match_operand:VI14_128 1 "register_operand" "")
+			  (match_operand:VI14_128 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1)
-    ix86_fixup_binary_operands_no_copy (SMIN, <MODE>mode, operands);
+    ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
   else
     {
       rtx xops[6];
       bool ok;
 
       xops[0] = operands[0];
-      xops[1] = operands[2];
-      xops[2] = operands[1];
+      xops[1] = operands[<CODE> == SMAX ? 1 : 2];
+      xops[2] = operands[<CODE> == SMAX ? 2 : 1];
       xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
       xops[4] = operands[1];
       xops[5] = operands[2];
@@ -5962,38 +5937,18 @@ (define_expand "<code>v8hi3"
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (<CODE>, V8HImode, operands);")
 
-(define_expand "smaxv2di3"
-  [(set (match_operand:V2DI 0 "register_operand" "")
-	(smax:V2DI (match_operand:V2DI 1 "register_operand" "")
-		   (match_operand:V2DI 2 "register_operand" "")))]
-  "TARGET_SSE4_2"
-{
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "sminv2di3"
-  [(set (match_operand:V2DI 0 "register_operand" "")
-	(smin:V2DI (match_operand:V2DI 1 "register_operand" "")
-		   (match_operand:V2DI 2 "register_operand" "")))]
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI8_AVX2 0 "register_operand" "")
+	(smaxmin:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand" "")
+			  (match_operand:VI8_AVX2 2 "register_operand" "")))]
   "TARGET_SSE4_2"
 {
   rtx xops[6];
   bool ok;
 
   xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
+  xops[1] = operands[<CODE> == SMAX ? 1 : 2];
+  xops[2] = operands[<CODE> == SMAX ? 2 : 1];
   xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
   xops[4] = operands[1];
   xops[5] = operands[2];
@@ -6110,38 +6065,18 @@ (define_expand "umin<mode>3"
     }
 })
 
-(define_expand "umaxv2di3"
-  [(set (match_operand:V2DI 0 "register_operand" "")
-	(umax:V2DI (match_operand:V2DI 1 "register_operand" "")
-		   (match_operand:V2DI 2 "register_operand" "")))]
-  "TARGET_SSE4_2"
-{
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "uminv2di3"
-  [(set (match_operand:V2DI 0 "register_operand" "")
-	(umin:V2DI (match_operand:V2DI 1 "register_operand" "")
-		   (match_operand:V2DI 2 "register_operand" "")))]
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI8_AVX2 0 "register_operand" "")
+	(umaxmin:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand" "")
+			  (match_operand:VI8_AVX2 2 "register_operand" "")))]
   "TARGET_SSE4_2"
 {
   rtx xops[6];
   bool ok;
 
   xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
+  xops[1] = operands[<CODE> == UMAX ? 1 : 2];
+  xops[2] = operands[<CODE> == UMAX ? 2 : 1];
   xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
   xops[4] = operands[1];
   xops[5] = operands[2];
@@ -6265,6 +6200,23 @@ (define_insn "sse2_gt<mode>3"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+(define_expand "vcond<V_256:mode><VI_256:mode>"
+  [(set (match_operand:V_256 0 "register_operand" "")
+	(if_then_else:V_256
+	  (match_operator 3 ""
+	    [(match_operand:VI_256 4 "nonimmediate_operand" "")
+	     (match_operand:VI_256 5 "nonimmediate_operand" "")])
+	  (match_operand:V_256 1 "general_operand" "")
+	  (match_operand:V_256 2 "general_operand" "")))]
+  "TARGET_AVX2
+   && (GET_MODE_NUNITS (<V_256:MODE>mode)
+       == GET_MODE_NUNITS (<VI_256:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vcond<V_128:mode><VI124_128:mode>"
   [(set (match_operand:V_128 0 "register_operand" "")
 	(if_then_else:V_128
@@ -6297,6 +6249,23 @@ (define_expand "vcond<VI8F_128:mode>v2di
   DONE;
 })
 
+(define_expand "vcondu<V_256:mode><VI_256:mode>"
+  [(set (match_operand:V_256 0 "register_operand" "")
+	(if_then_else:V_256
+	  (match_operator 3 ""
+	    [(match_operand:VI_256 4 "nonimmediate_operand" "")
+	     (match_operand:VI_256 5 "nonimmediate_operand" "")])
+	  (match_operand:V_256 1 "general_operand" "")
+	  (match_operand:V_256 2 "general_operand" "")))]
+  "TARGET_AVX2
+   && (GET_MODE_NUNITS (<V_256:MODE>mode)
+       == GET_MODE_NUNITS (<VI_256:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vcondu<V_128:mode><VI124_128:mode>"
   [(set (match_operand:V_128 0 "register_operand" "")
 	(if_then_else:V_128

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]