i386 vector min/max improvements

Richard Henderson rth@redhat.com
Wed Jun 29 17:38:00 GMT 2005


Implements smax, smin, umax, umin and unsigned vcond compares for
V4SImode along the same lines as what I came up with for V2SImode on
ia64.  Also copies the same unsigned saturation tricks for vcond and
umin.

One new trick for umax: umax (op1, op2) can be computed as
us_minus (op1, op2) + op2, without further bit manipulation.  There
are two cases:

  (1) op2 >= op1.  In this case the subtraction saturates to zero,
      giving 0 + op2 == op2.

  (2) op2 < op1.  In this case no saturation happens, giving
      (op1 - op2) + op2 == op1.


r~


        * config/i386/i386.c (ix86_expand_int_vcond): Remove unsignedp
        argument.  Simplify canonicalization of condition.  Use unsigned
        saturating subtraction for QI and HImode unsigned compares.  Use
        bit arithmetic tricks for SImode unsigned compares.
        * config/i386/i386-protos.h (ix86_expand_int_vcond): Update decl.
        * config/i386/sse.md (SSEMODE14): New.
        (umaxv8hi3): Use us_minus+plus to avoid vcond.
        (umaxv4si3): New.
        (smax<SSEMODE14>3): Rename from smaxv16qi3 and macroize.
        (smin<SSEMODE14>3): Similarly with sminv16qi3.
        (umin<SSEMODE24>3): Similarly with uminv8hi3.

        * lib/target-supports.exp (check_effective_target_vect_no_max):
        Remove i386 and x86_64.

Index: config/i386/i386-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386-protos.h,v
retrieving revision 1.142
diff -u -p -r1.142 i386-protos.h
--- config/i386/i386-protos.h	25 Jun 2005 01:21:07 -0000	1.142
+++ config/i386/i386-protos.h	29 Jun 2005 17:18:42 -0000
@@ -149,7 +149,7 @@ extern int ix86_expand_setcc (enum rtx_c
 extern int ix86_expand_int_movcc (rtx[]);
 extern int ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
-extern bool ix86_expand_int_vcond (rtx[], bool);
+extern bool ix86_expand_int_vcond (rtx[]);
 extern int ix86_expand_int_addcc (rtx[]);
 extern void ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int);
 extern void x86_initialize_trampoline (rtx, rtx, rtx);
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.837
diff -u -p -r1.837 i386.c
--- config/i386/i386.c	27 Jun 2005 08:03:20 -0000	1.837
+++ config/i386/i386.c	29 Jun 2005 17:18:43 -0000
@@ -10501,94 +10501,102 @@ ix86_expand_fp_vcond (rtx operands[])
 /* Expand a signed integral vector conditional move.  */
 
 bool
-ix86_expand_int_vcond (rtx operands[], bool unsignedp)
+ix86_expand_int_vcond (rtx operands[])
 {
   enum machine_mode mode = GET_MODE (operands[0]);
   enum rtx_code code = GET_CODE (operands[3]);
-  rtx cmp, x;
+  bool negate = false;
+  rtx x, cop0, cop1;
 
-  if (unsignedp)
-    code = signed_condition (code);
-  if (code == NE || code == LE || code == GE)
-    {
-      /* Inverse of a supported code.  */
-      x = operands[1];
-      operands[1] = operands[2];
-      operands[2] = x;
-      code = reverse_condition (code);
-    }
-  if (code == LT)
+  cop0 = operands[4];
+  cop1 = operands[5];
+
+  /* Canonicalize the comparison to EQ, GT, GTU.  */
+  switch (code)
     {
-      /* Swap of a supported code.  */
-      x = operands[4];
-      operands[4] = operands[5];
-      operands[5] = x;
-      code = swap_condition (code);
-    }
-  gcc_assert (code == EQ || code == GT);
+    case EQ:
+    case GT:
+    case GTU:
+      break;
+
+    case NE:
+    case LE:
+    case LEU:
+      code = reverse_condition (code);
+      negate = true;
+      break;
 
-  /* Unlike floating-point, we can rely on the optimizers to have already
-     converted to MIN/MAX expressions, so we don't have to handle that.  */
+    case GE:
+    case GEU:
+      code = reverse_condition (code);
+      negate = true;
+      /* FALLTHRU */
 
-  /* Unsigned GT is not directly supported.  We can zero-extend QI and
-     HImode elements to the next wider element size, use a signed compare,
-     then repack.  For three extra instructions, this is definitely a win.  */
-  if (code == GT && unsignedp)
-    {
-      rtx o0l, o0h, o1l, o1h, cl, ch, zero;
-      enum machine_mode wider;
-      rtx (*unpackl) (rtx, rtx, rtx);
-      rtx (*unpackh) (rtx, rtx, rtx);
-      rtx (*pack) (rtx, rtx, rtx);
+    case LT:
+    case LTU:
+      code = swap_condition (code);
+      x = cop0, cop0 = cop1, cop1 = x;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
 
+  /* Unsigned parallel compare is not supported by the hardware.  Play some
+     tricks to turn this into a signed comparison against 0.  */
+  if (code == GTU)
+    {
       switch (mode)
 	{
-	case V16QImode:
-	  wider = V8HImode;
-	  unpackl = gen_sse2_punpcklbw;
-	  unpackh = gen_sse2_punpckhbw;
-	  pack = gen_sse2_packsswb;
+	case V4SImode:
+	  {
+	    rtx t1, t2, mask;
+
+	    /* Perform a parallel modulo subtraction.  */
+	    t1 = gen_reg_rtx (mode);
+	    emit_insn (gen_subv4si3 (t1, cop0, cop1));
+
+	    /* Extract the original sign bit of op0.  */
+	    mask = GEN_INT (-0x80000000);
+	    mask = gen_rtx_CONST_VECTOR (mode,
+			gen_rtvec (4, mask, mask, mask, mask));
+	    mask = force_reg (mode, mask);
+	    t2 = gen_reg_rtx (mode);
+	    emit_insn (gen_andv4si3 (t2, cop0, mask));
+
+	    /* XOR it back into the result of the subtraction.  This results
+	       in the sign bit set iff we saw unsigned underflow.  */
+	    x = gen_reg_rtx (mode);
+	    emit_insn (gen_xorv4si3 (x, t1, t2));
+
+	    code = GT;
+	  }
 	  break;
+
+	case V16QImode:
 	case V8HImode:
-	  wider = V4SImode;
-	  unpackl = gen_sse2_punpcklwd;
-	  unpackh = gen_sse2_punpckhwd;
-	  pack = gen_sse2_packssdw;
+	  /* Perform a parallel unsigned saturating subtraction.  */
+	  x = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (VOIDmode, x,
+				  gen_rtx_US_MINUS (mode, cop0, cop1)));
+
+	  code = EQ;
+	  negate = !negate;
 	  break;
+
 	default:
 	  gcc_unreachable ();
 	}
 
-      operands[4] = force_reg (mode, operands[4]);
-      operands[5] = force_reg (mode, operands[5]);
-
-      o0l = gen_reg_rtx (wider);
-      o0h = gen_reg_rtx (wider);
-      o1l = gen_reg_rtx (wider);
-      o1h = gen_reg_rtx (wider);
-      cl = gen_reg_rtx (wider);
-      ch = gen_reg_rtx (wider);
-      cmp = gen_reg_rtx (mode);
-      zero = force_reg (mode, CONST0_RTX (mode));
-
-      emit_insn (unpackl (gen_lowpart (mode, o0l), operands[4], zero));
-      emit_insn (unpackh (gen_lowpart (mode, o0h), operands[4], zero));
-      emit_insn (unpackl (gen_lowpart (mode, o1l), operands[5], zero));
-      emit_insn (unpackh (gen_lowpart (mode, o1h), operands[5], zero));
-
-      x = gen_rtx_GT (wider, o0l, o1l);
-      emit_insn (gen_rtx_SET (VOIDmode, cl, x));
-
-      x = gen_rtx_GT (wider, o0h, o1h);
-      emit_insn (gen_rtx_SET (VOIDmode, ch, x));
-
-      emit_insn (pack (cmp, cl, ch));
+      cop0 = x;
+      cop1 = CONST0_RTX (mode);
     }
-  else
-    cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
-			       operands[1], operands[2]);
 
-  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+  x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
+			   operands[1+negate], operands[2-negate]);
+
+  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
+			 operands[2-negate]);
   return true;
 }
 
Index: config/i386/sse.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/sse.md,v
retrieving revision 1.20
diff -u -p -r1.20 sse.md
--- config/i386/sse.md	28 Jun 2005 09:00:42 -0000	1.20
+++ config/i386/sse.md	29 Jun 2005 17:18:44 -0000
@@ -30,6 +30,7 @@
 ;; Mix-n-match
 (define_mode_macro SSEMODE12 [V16QI V8HI])
 (define_mode_macro SSEMODE24 [V8HI V4SI])
+(define_mode_macro SSEMODE14 [V16QI V4SI])
 (define_mode_macro SSEMODE124 [V16QI V8HI V4SI])
 (define_mode_macro SSEMODE248 [V8HI V4SI V2DI])
 
@@ -2741,26 +2742,6 @@
   operands[1] = gen_lowpart (TImode, operands[1]);
 })
 
-(define_expand "smaxv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand" "")
-	(smax:V16QI (match_operand:V16QI 1 "register_operand" "")
-		    (match_operand:V16QI 2 "register_operand" "")))]
-  "TARGET_SSE2"
-{
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops, false);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "umaxv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "")
 	(umax:V16QI (match_operand:V16QI 1 "nonimmediate_operand" "")
@@ -2794,33 +2775,42 @@
    (set_attr "mode" "TI")])
 
 (define_expand "umaxv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "")
-	(umax:V8HI (match_operand:V8HI 1 "register_operand" "")
-		   (match_operand:V8HI 2 "register_operand" "")))]
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(us_minus:V8HI (match_operand:V8HI 1 "register_operand" "0")
+		       (match_operand:V8HI 2 "nonimmediate_operand" "xm")))
+   (set (match_dup 3)
+	(plus:V8HI (match_dup 0) (match_dup 2)))]
   "TARGET_SSE2"
 {
-  rtx xops[6], t1, t2;
-  bool ok;
+  operands[3] = operands[0];
+  if (rtx_equal_p (operands[0], operands[2]))
+    operands[0] = gen_reg_rtx (V8HImode);
+})
 
-  t1 = gen_reg_rtx (V8HImode);
-  emit_insn (gen_sse2_ussubv8hi3 (t1, operands[2], operands[1]));
-  t2 = force_reg (V8HImode, CONST0_RTX (V8HImode));
+(define_expand "smax<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "")
+	(smax:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "")
+			(match_operand:SSEMODE14 2 "register_operand" "")))]
+  "TARGET_SSE2"
+{
+  rtx xops[6];
+  bool ok;
 
   xops[0] = operands[0];
   xops[1] = operands[1];
   xops[2] = operands[2];
-  xops[3] = gen_rtx_EQ (VOIDmode, t1, t2);
-  xops[4] = t1;
-  xops[5] = t2;
-  ok = ix86_expand_int_vcond (xops, false);
+  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+  xops[4] = operands[1];
+  xops[5] = operands[2];
+  ok = ix86_expand_int_vcond (xops);
   gcc_assert (ok);
   DONE;
 })
 
-(define_expand "sminv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand" "")
-	(smin:V16QI (match_operand:V16QI 1 "register_operand" "")
-		    (match_operand:V16QI 2 "register_operand" "")))]
+(define_expand "umaxv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "")
+	(umax:V4SI (match_operand:V4SI 1 "register_operand" "")
+		   (match_operand:V4SI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
   rtx xops[6];
@@ -2829,10 +2819,10 @@
   xops[0] = operands[0];
   xops[1] = operands[1];
   xops[2] = operands[2];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[2];
-  xops[5] = operands[1];
-  ok = ix86_expand_int_vcond (xops, false);
+  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+  xops[4] = operands[1];
+  xops[5] = operands[2];
+  ok = ix86_expand_int_vcond (xops);
   gcc_assert (ok);
   DONE;
 })
@@ -2869,26 +2859,42 @@
   [(set_attr "type" "sseiadd")
    (set_attr "mode" "TI")])
 
-(define_expand "uminv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "")
-	(umin:V8HI (match_operand:V8HI 1 "register_operand" "")
-		   (match_operand:V8HI 2 "register_operand" "")))]
+(define_expand "smin<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "")
+	(smin:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "")
+			(match_operand:SSEMODE14 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6], t1, t2;
+  rtx xops[6];
   bool ok;
 
-  t1 = gen_reg_rtx (V8HImode);
-  emit_insn (gen_sse2_ussubv8hi3 (t1, operands[1], operands[2]));
-  t2 = force_reg (V8HImode, CONST0_RTX (V8HImode));
+  xops[0] = operands[0];
+  xops[1] = operands[2];
+  xops[2] = operands[1];
+  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+  xops[4] = operands[1];
+  xops[5] = operands[2];
+  ok = ix86_expand_int_vcond (xops);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "umin<mode>3"
+  [(set (match_operand:SSEMODE24 0 "register_operand" "")
+	(umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
+			(match_operand:SSEMODE24 2 "register_operand" "")))]
+  "TARGET_SSE2"
+{
+  rtx xops[6];
+  bool ok;
 
   xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_EQ (VOIDmode, t1, t2);
-  xops[4] = t1;
-  xops[5] = t2;
-  ok = ix86_expand_int_vcond (xops, false);
+  xops[1] = operands[2];
+  xops[2] = operands[1];
+  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+  xops[4] = operands[1];
+  xops[5] = operands[2];
+  ok = ix86_expand_int_vcond (xops);
   gcc_assert (ok);
   DONE;
 })
@@ -2929,7 +2935,7 @@
           (match_operand:SSEMODE124 2 "general_operand" "")))]
   "TARGET_SSE2"
 {
-  if (ix86_expand_int_vcond (operands, false))
+  if (ix86_expand_int_vcond (operands))
     DONE;
   else
     FAIL;
@@ -2945,7 +2951,7 @@
           (match_operand:SSEMODE12 2 "general_operand" "")))]
   "TARGET_SSE2"
 {
-  if (ix86_expand_int_vcond (operands, true))
+  if (ix86_expand_int_vcond (operands))
     DONE;
   else
     FAIL;
Index: testsuite/lib/target-supports.exp
===================================================================
RCS file: /cvs/gcc/gcc/gcc/testsuite/lib/target-supports.exp,v
retrieving revision 1.65
diff -u -p -r1.65 target-supports.exp
--- testsuite/lib/target-supports.exp	25 Jun 2005 01:45:24 -0000	1.65
+++ testsuite/lib/target-supports.exp	29 Jun 2005 17:18:44 -0000
@@ -973,9 +973,7 @@ proc check_effective_target_vect_no_max 
 	verbose "check_effective_target_vect_no_max: using cached result" 2
     } else {
 	set et_vect_no_max_saved 0
-	if { [istarget i?86-*-*]
-	     || [istarget x86_64-*-*]
-	     || [istarget sparc*-*-*]
+	if { [istarget sparc*-*-*]
 	     || [istarget alpha*-*-*] } {
 	    set et_vect_no_max_saved 1
 	}



More information about the Gcc-patches mailing list