This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
i386 vector min/max improvements
- From: Richard Henderson <rth at redhat dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Wed, 29 Jun 2005 10:38:28 -0700
- Subject: i386 vector min/max improvements
Implements vector min/max for V4SImode along the same lines as what I came
up with for V2SImode on ia64. Also copies the same unsigned saturation tricks
for vcond and umin.
One new trick for umax: the combination of us_minus and plus, i.e.
us_minus (op1, op2) + op2, results in umax without further bit manipulation.
There are two cases:
(1) op2 >= op1. In this case the subtraction saturates to zero, resulting in
0 + op2 == op2.
(2) op2 < op1. In this case no saturation happens, resulting in
(op1 - op2) + op2 == op1.
r~
* config/i386/i386.c (ix86_expand_int_vcond): Remove unsignedp
argument. Simplify canonicalization of condition. Use unsigned
saturating subtraction for QI and HImode unsigned compares. Use
bit arithmetic tricks for SImode unsigned compares.
* config/i386/i386-protos.h (ix86_expand_int_vcond): Update decl.
* config/i386/sse.md (SSEMODE14): New.
(umaxv8hi3): Use us_minus+plus to avoid vcond.
(umaxv4si3): New.
(smax<SSEMODE14>3): Rename from smaxv16qi3 and macroize.
(smin<SSEMODE14>3): Similarly with sminv16qi3.
(umin<SSEMODE24>3): Similarly with uminv8hi3.
* lib/target-supports.exp (check_effective_target_vect_no_max):
Remove i386 and x86_64.
Index: config/i386/i386-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386-protos.h,v
retrieving revision 1.142
diff -u -p -r1.142 i386-protos.h
--- config/i386/i386-protos.h 25 Jun 2005 01:21:07 -0000 1.142
+++ config/i386/i386-protos.h 29 Jun 2005 17:18:42 -0000
@@ -149,7 +149,7 @@ extern int ix86_expand_setcc (enum rtx_c
extern int ix86_expand_int_movcc (rtx[]);
extern int ix86_expand_fp_movcc (rtx[]);
extern bool ix86_expand_fp_vcond (rtx[]);
-extern bool ix86_expand_int_vcond (rtx[], bool);
+extern bool ix86_expand_int_vcond (rtx[]);
extern int ix86_expand_int_addcc (rtx[]);
extern void ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int);
extern void x86_initialize_trampoline (rtx, rtx, rtx);
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.837
diff -u -p -r1.837 i386.c
--- config/i386/i386.c 27 Jun 2005 08:03:20 -0000 1.837
+++ config/i386/i386.c 29 Jun 2005 17:18:43 -0000
@@ -10501,94 +10501,102 @@ ix86_expand_fp_vcond (rtx operands[])
/* Expand a signed integral vector conditional move. */
bool
-ix86_expand_int_vcond (rtx operands[], bool unsignedp)
+ix86_expand_int_vcond (rtx operands[])
{
enum machine_mode mode = GET_MODE (operands[0]);
enum rtx_code code = GET_CODE (operands[3]);
- rtx cmp, x;
+ bool negate = false;
+ rtx x, cop0, cop1;
- if (unsignedp)
- code = signed_condition (code);
- if (code == NE || code == LE || code == GE)
- {
- /* Inverse of a supported code. */
- x = operands[1];
- operands[1] = operands[2];
- operands[2] = x;
- code = reverse_condition (code);
- }
- if (code == LT)
+ cop0 = operands[4];
+ cop1 = operands[5];
+
+ /* Canonicalize the comparison to EQ, GT, GTU. */
+ switch (code)
{
- /* Swap of a supported code. */
- x = operands[4];
- operands[4] = operands[5];
- operands[5] = x;
- code = swap_condition (code);
- }
- gcc_assert (code == EQ || code == GT);
+ case EQ:
+ case GT:
+ case GTU:
+ break;
+
+ case NE:
+ case LE:
+ case LEU:
+ code = reverse_condition (code);
+ negate = true;
+ break;
- /* Unlike floating-point, we can rely on the optimizers to have already
- converted to MIN/MAX expressions, so we don't have to handle that. */
+ case GE:
+ case GEU:
+ code = reverse_condition (code);
+ negate = true;
+ /* FALLTHRU */
- /* Unsigned GT is not directly supported. We can zero-extend QI and
- HImode elements to the next wider element size, use a signed compare,
- then repack. For three extra instructions, this is definitely a win. */
- if (code == GT && unsignedp)
- {
- rtx o0l, o0h, o1l, o1h, cl, ch, zero;
- enum machine_mode wider;
- rtx (*unpackl) (rtx, rtx, rtx);
- rtx (*unpackh) (rtx, rtx, rtx);
- rtx (*pack) (rtx, rtx, rtx);
+ case LT:
+ case LTU:
+ code = swap_condition (code);
+ x = cop0, cop0 = cop1, cop1 = x;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ /* Unsigned parallel compare is not supported by the hardware. Play some
+ tricks to turn this into a signed comparison against 0. */
+ if (code == GTU)
+ {
switch (mode)
{
- case V16QImode:
- wider = V8HImode;
- unpackl = gen_sse2_punpcklbw;
- unpackh = gen_sse2_punpckhbw;
- pack = gen_sse2_packsswb;
+ case V4SImode:
+ {
+ rtx t1, t2, mask;
+
+ /* Perform a parallel modulo subtraction. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn (gen_subv4si3 (t1, cop0, cop1));
+
+ /* Extract the original sign bit of op0. */
+ mask = GEN_INT (-0x80000000);
+ mask = gen_rtx_CONST_VECTOR (mode,
+ gen_rtvec (4, mask, mask, mask, mask));
+ mask = force_reg (mode, mask);
+ t2 = gen_reg_rtx (mode);
+ emit_insn (gen_andv4si3 (t2, cop0, mask));
+
+ /* XOR it back into the result of the subtraction. This results
+ in the sign bit set iff we saw unsigned underflow. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_xorv4si3 (x, t1, t2));
+
+ code = GT;
+ }
break;
+
+ case V16QImode:
case V8HImode:
- wider = V4SImode;
- unpackl = gen_sse2_punpcklwd;
- unpackh = gen_sse2_punpckhwd;
- pack = gen_sse2_packssdw;
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (VOIDmode, x,
+ gen_rtx_US_MINUS (mode, cop0, cop1)));
+
+ code = EQ;
+ negate = !negate;
break;
+
default:
gcc_unreachable ();
}
- operands[4] = force_reg (mode, operands[4]);
- operands[5] = force_reg (mode, operands[5]);
-
- o0l = gen_reg_rtx (wider);
- o0h = gen_reg_rtx (wider);
- o1l = gen_reg_rtx (wider);
- o1h = gen_reg_rtx (wider);
- cl = gen_reg_rtx (wider);
- ch = gen_reg_rtx (wider);
- cmp = gen_reg_rtx (mode);
- zero = force_reg (mode, CONST0_RTX (mode));
-
- emit_insn (unpackl (gen_lowpart (mode, o0l), operands[4], zero));
- emit_insn (unpackh (gen_lowpart (mode, o0h), operands[4], zero));
- emit_insn (unpackl (gen_lowpart (mode, o1l), operands[5], zero));
- emit_insn (unpackh (gen_lowpart (mode, o1h), operands[5], zero));
-
- x = gen_rtx_GT (wider, o0l, o1l);
- emit_insn (gen_rtx_SET (VOIDmode, cl, x));
-
- x = gen_rtx_GT (wider, o0h, o1h);
- emit_insn (gen_rtx_SET (VOIDmode, ch, x));
-
- emit_insn (pack (cmp, cl, ch));
+ cop0 = x;
+ cop1 = CONST0_RTX (mode);
}
- else
- cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
- operands[1], operands[2]);
- ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+ x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
+ operands[1+negate], operands[2-negate]);
+
+ ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
+ operands[2-negate]);
return true;
}
Index: config/i386/sse.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/sse.md,v
retrieving revision 1.20
diff -u -p -r1.20 sse.md
--- config/i386/sse.md 28 Jun 2005 09:00:42 -0000 1.20
+++ config/i386/sse.md 29 Jun 2005 17:18:44 -0000
@@ -30,6 +30,7 @@
;; Mix-n-match
(define_mode_macro SSEMODE12 [V16QI V8HI])
(define_mode_macro SSEMODE24 [V8HI V4SI])
+(define_mode_macro SSEMODE14 [V16QI V4SI])
(define_mode_macro SSEMODE124 [V16QI V8HI V4SI])
(define_mode_macro SSEMODE248 [V8HI V4SI V2DI])
@@ -2741,26 +2742,6 @@
operands[1] = gen_lowpart (TImode, operands[1]);
})
-(define_expand "smaxv16qi3"
- [(set (match_operand:V16QI 0 "register_operand" "")
- (smax:V16QI (match_operand:V16QI 1 "register_operand" "")
- (match_operand:V16QI 2 "register_operand" "")))]
- "TARGET_SSE2"
-{
- rtx xops[6];
- bool ok;
-
- xops[0] = operands[0];
- xops[1] = operands[1];
- xops[2] = operands[2];
- xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
- xops[4] = operands[1];
- xops[5] = operands[2];
- ok = ix86_expand_int_vcond (xops, false);
- gcc_assert (ok);
- DONE;
-})
-
(define_expand "umaxv16qi3"
[(set (match_operand:V16QI 0 "register_operand" "")
(umax:V16QI (match_operand:V16QI 1 "nonimmediate_operand" "")
@@ -2794,33 +2775,42 @@
(set_attr "mode" "TI")])
(define_expand "umaxv8hi3"
- [(set (match_operand:V8HI 0 "register_operand" "")
- (umax:V8HI (match_operand:V8HI 1 "register_operand" "")
- (match_operand:V8HI 2 "register_operand" "")))]
+ [(set (match_operand:V8HI 0 "register_operand" "=x")
+ (us_minus:V8HI (match_operand:V8HI 1 "register_operand" "0")
+ (match_operand:V8HI 2 "nonimmediate_operand" "xm")))
+ (set (match_dup 3)
+ (plus:V8HI (match_dup 0) (match_dup 2)))]
"TARGET_SSE2"
{
- rtx xops[6], t1, t2;
- bool ok;
+ operands[3] = operands[0];
+ if (rtx_equal_p (operands[0], operands[2]))
+ operands[0] = gen_reg_rtx (V8HImode);
+})
- t1 = gen_reg_rtx (V8HImode);
- emit_insn (gen_sse2_ussubv8hi3 (t1, operands[2], operands[1]));
- t2 = force_reg (V8HImode, CONST0_RTX (V8HImode));
+(define_expand "smax<mode>3"
+ [(set (match_operand:SSEMODE14 0 "register_operand" "")
+ (smax:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "")
+ (match_operand:SSEMODE14 2 "register_operand" "")))]
+ "TARGET_SSE2"
+{
+ rtx xops[6];
+ bool ok;
xops[0] = operands[0];
xops[1] = operands[1];
xops[2] = operands[2];
- xops[3] = gen_rtx_EQ (VOIDmode, t1, t2);
- xops[4] = t1;
- xops[5] = t2;
- ok = ix86_expand_int_vcond (xops, false);
+ xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+ xops[4] = operands[1];
+ xops[5] = operands[2];
+ ok = ix86_expand_int_vcond (xops);
gcc_assert (ok);
DONE;
})
-(define_expand "sminv16qi3"
- [(set (match_operand:V16QI 0 "register_operand" "")
- (smin:V16QI (match_operand:V16QI 1 "register_operand" "")
- (match_operand:V16QI 2 "register_operand" "")))]
+(define_expand "umaxv4si3"
+ [(set (match_operand:V4SI 0 "register_operand" "")
+ (umax:V4SI (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")))]
"TARGET_SSE2"
{
rtx xops[6];
@@ -2829,10 +2819,10 @@
xops[0] = operands[0];
xops[1] = operands[1];
xops[2] = operands[2];
- xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
- xops[4] = operands[2];
- xops[5] = operands[1];
- ok = ix86_expand_int_vcond (xops, false);
+ xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+ xops[4] = operands[1];
+ xops[5] = operands[2];
+ ok = ix86_expand_int_vcond (xops);
gcc_assert (ok);
DONE;
})
@@ -2869,26 +2859,42 @@
[(set_attr "type" "sseiadd")
(set_attr "mode" "TI")])
-(define_expand "uminv8hi3"
- [(set (match_operand:V8HI 0 "register_operand" "")
- (umin:V8HI (match_operand:V8HI 1 "register_operand" "")
- (match_operand:V8HI 2 "register_operand" "")))]
+(define_expand "smin<mode>3"
+ [(set (match_operand:SSEMODE14 0 "register_operand" "")
+ (smin:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "")
+ (match_operand:SSEMODE14 2 "register_operand" "")))]
"TARGET_SSE2"
{
- rtx xops[6], t1, t2;
+ rtx xops[6];
bool ok;
- t1 = gen_reg_rtx (V8HImode);
- emit_insn (gen_sse2_ussubv8hi3 (t1, operands[1], operands[2]));
- t2 = force_reg (V8HImode, CONST0_RTX (V8HImode));
+ xops[0] = operands[0];
+ xops[1] = operands[2];
+ xops[2] = operands[1];
+ xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+ xops[4] = operands[1];
+ xops[5] = operands[2];
+ ok = ix86_expand_int_vcond (xops);
+ gcc_assert (ok);
+ DONE;
+})
+
+(define_expand "umin<mode>3"
+ [(set (match_operand:SSEMODE24 0 "register_operand" "")
+ (umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
+ (match_operand:SSEMODE24 2 "register_operand" "")))]
+ "TARGET_SSE2"
+{
+ rtx xops[6];
+ bool ok;
xops[0] = operands[0];
- xops[1] = operands[1];
- xops[2] = operands[2];
- xops[3] = gen_rtx_EQ (VOIDmode, t1, t2);
- xops[4] = t1;
- xops[5] = t2;
- ok = ix86_expand_int_vcond (xops, false);
+ xops[1] = operands[2];
+ xops[2] = operands[1];
+ xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+ xops[4] = operands[1];
+ xops[5] = operands[2];
+ ok = ix86_expand_int_vcond (xops);
gcc_assert (ok);
DONE;
})
@@ -2929,7 +2935,7 @@
(match_operand:SSEMODE124 2 "general_operand" "")))]
"TARGET_SSE2"
{
- if (ix86_expand_int_vcond (operands, false))
+ if (ix86_expand_int_vcond (operands))
DONE;
else
FAIL;
@@ -2945,7 +2951,7 @@
(match_operand:SSEMODE12 2 "general_operand" "")))]
"TARGET_SSE2"
{
- if (ix86_expand_int_vcond (operands, true))
+ if (ix86_expand_int_vcond (operands))
DONE;
else
FAIL;
Index: testsuite/lib/target-supports.exp
===================================================================
RCS file: /cvs/gcc/gcc/gcc/testsuite/lib/target-supports.exp,v
retrieving revision 1.65
diff -u -p -r1.65 target-supports.exp
--- testsuite/lib/target-supports.exp 25 Jun 2005 01:45:24 -0000 1.65
+++ testsuite/lib/target-supports.exp 29 Jun 2005 17:18:44 -0000
@@ -973,9 +973,7 @@ proc check_effective_target_vect_no_max
verbose "check_effective_target_vect_no_max: using cached result" 2
} else {
set et_vect_no_max_saved 0
- if { [istarget i?86-*-*]
- || [istarget x86_64-*-*]
- || [istarget sparc*-*-*]
+ if { [istarget sparc*-*-*]
|| [istarget alpha*-*-*] } {
set et_vect_no_max_saved 1
}