This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
improve ia64 vector min/max
- From: Richard Henderson <rth at redhat dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Tue, 28 Jun 2005 15:52:28 -0700
- Subject: improve ia64 vector min/max
For V8QI and V4HI, I'm now using the unsigned saturating subtraction idea
from Paulo, instead of playing games to widen to larger vectors. This
is significantly smaller and faster.
For V2SI (which has no saturating arithmetic), I'm playing some bit
manipulation games:
unsigned int x, y;
signed int t = (x - y) ^ (x & 0x80000000);
x > y -> t > 0
The result is the same number of instructions (and the same latency
as far as I can see) as the decomposition method I had been using,
but this way doesn't use predication. Since gcc can't do nested
predication, this should allow for improving things a bit in ifcvt.
r~
* config/ia64/ia64.c (ia64_expand_vecint_compare): Use unsigned
saturating subtraction for QI and HImode unsigned compares. Use
bit arithmetic tricks for SImode unsigned compares.
(ia64_expand_vcondu_v2si): Remove.
(ia64_expand_vecint_cmov): Don't call it.
Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.c,v
retrieving revision 1.383
diff -u -p -d -r1.383 ia64.c
--- config/ia64/ia64.c 28 Jun 2005 21:54:59 -0000 1.383
+++ config/ia64/ia64.c 28 Jun 2005 22:30:55 -0000
@@ -1526,7 +1526,8 @@ ia64_expand_compare (enum rtx_code code,
return gen_rtx_fmt_ee (code, mode, cmp, const0_rtx);
}
-/* Generate an integral vector comparison. */
+/* Generate an integral vector comparison. Return true if the condition has
+ been reversed, and so the sense of the comparison should be inverted. */
static bool
ia64_expand_vecint_compare (enum rtx_code code, enum machine_mode mode,
@@ -1535,95 +1536,80 @@ ia64_expand_vecint_compare (enum rtx_cod
bool negate = false;
rtx x;
+ /* Canonicalize the comparison to EQ, GT, GTU. */
switch (code)
{
case EQ:
case GT:
+ case GTU:
break;
case NE:
- code = EQ;
- negate = true;
- break;
-
case LE:
- code = GT;
+ case LEU:
+ code = reverse_condition (code);
negate = true;
break;
case GE:
+ case GEU:
+ code = reverse_condition (code);
negate = true;
/* FALLTHRU */
case LT:
- x = op0;
- op0 = op1;
- op1 = x;
- code = GT;
- break;
-
- case GTU:
- case GEU:
case LTU:
- case LEU:
- {
- rtx w0h, w0l, w1h, w1l, ch, cl;
- enum machine_mode wmode;
- rtx (*unpack_l) (rtx, rtx, rtx);
- rtx (*unpack_h) (rtx, rtx, rtx);
- rtx (*pack) (rtx, rtx, rtx);
+ code = swap_condition (code);
+ x = op0, op0 = op1, op1 = x;
+ break;
- /* We don't have native unsigned comparisons, but we can generate
- them better than generic code can. */
+ default:
+ gcc_unreachable ();
+ }
- gcc_assert (mode != V2SImode);
- switch (mode)
+ /* Unsigned parallel compare is not supported by the hardware. Play some
+ tricks to turn this into a GT comparison against 0. */
+ if (code == GTU)
+ {
+ switch (mode)
+ {
+ case V2SImode:
{
- case V8QImode:
- wmode = V4HImode;
- pack = gen_pack2_sss;
- unpack_l = gen_unpack1_l;
- unpack_h = gen_unpack1_h;
- break;
-
- case V4HImode:
- wmode = V2SImode;
- pack = gen_pack4_sss;
- unpack_l = gen_unpack2_l;
- unpack_h = gen_unpack2_h;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Unpack into wider vectors, zero extending the elements. */
+ rtx t1, t2, mask;
- w0l = gen_reg_rtx (wmode);
- w0h = gen_reg_rtx (wmode);
- w1l = gen_reg_rtx (wmode);
- w1h = gen_reg_rtx (wmode);
- emit_insn (unpack_l (gen_lowpart (mode, w0l), op0, CONST0_RTX (mode)));
- emit_insn (unpack_h (gen_lowpart (mode, w0h), op0, CONST0_RTX (mode)));
- emit_insn (unpack_l (gen_lowpart (mode, w1l), op1, CONST0_RTX (mode)));
- emit_insn (unpack_h (gen_lowpart (mode, w1h), op1, CONST0_RTX (mode)));
+ /* Perform a parallel modulo subtraction. */
+ t1 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_subv2si3 (t1, op0, op1));
- /* Compare in the wider mode. */
+ /* Extract the original sign bit of op0. */
+ mask = GEN_INT (-0x80000000);
+ mask = gen_rtx_CONST_VECTOR (V2SImode, gen_rtvec (2, mask, mask));
+ mask = force_reg (V2SImode, mask);
+ t2 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_andv2si3 (t2, op0, mask));
- cl = gen_reg_rtx (wmode);
- ch = gen_reg_rtx (wmode);
- code = signed_condition (code);
- ia64_expand_vecint_compare (code, wmode, cl, w0l, w1l);
- negate = ia64_expand_vecint_compare (code, wmode, ch, w0h, w1h);
+ /* XOR it back into the result of the subtraction. This results
+ in the sign bit set iff we saw unsigned underflow. */
+ x = gen_reg_rtx (V2SImode);
+ emit_insn (gen_xorv2si3 (x, t1, t2));
+ }
+ break;
- /* Repack into a single narrower vector. */
+ case V8QImode:
+ case V4HImode:
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (VOIDmode, x,
+ gen_rtx_US_MINUS (mode, op0, op1)));
+ break;
- emit_insn (pack (dest, cl, ch));
- }
- return negate;
+ default:
+ gcc_unreachable ();
+ }
- default:
- gcc_unreachable ();
+ code = GT;
+ op0 = x;
+ op1 = CONST0_RTX (mode);
}
x = gen_rtx_fmt_ee (code, mode, op0, op1);
@@ -1632,59 +1618,6 @@ ia64_expand_vecint_compare (enum rtx_cod
return negate;
}
-static void
-ia64_expand_vcondu_v2si (enum rtx_code code, rtx operands[])
-{
- rtx dl, dh, bl, bh, op1l, op1h, op2l, op2h, op4l, op4h, op5l, op5h, x;
-
- /* In this case, we extract the two SImode quantities and generate
- normal comparisons for each of them. */
-
- op1l = gen_lowpart (SImode, operands[1]);
- op2l = gen_lowpart (SImode, operands[2]);
- op4l = gen_lowpart (SImode, operands[4]);
- op5l = gen_lowpart (SImode, operands[5]);
-
- op1h = gen_reg_rtx (SImode);
- op2h = gen_reg_rtx (SImode);
- op4h = gen_reg_rtx (SImode);
- op5h = gen_reg_rtx (SImode);
-
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op1h),
- gen_lowpart (DImode, operands[1]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op2h),
- gen_lowpart (DImode, operands[2]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op4h),
- gen_lowpart (DImode, operands[4]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op5h),
- gen_lowpart (DImode, operands[5]), GEN_INT (32)));
-
- bl = gen_reg_rtx (BImode);
- x = gen_rtx_fmt_ee (code, BImode, op4l, op5l);
- emit_insn (gen_rtx_SET (VOIDmode, bl, x));
-
- bh = gen_reg_rtx (BImode);
- x = gen_rtx_fmt_ee (code, BImode, op4h, op5h);
- emit_insn (gen_rtx_SET (VOIDmode, bh, x));
-
- /* With the results of the comparisons, emit conditional moves. */
-
- dl = gen_reg_rtx (SImode);
- x = gen_rtx_NE (VOIDmode, bl, const0_rtx);
- x = gen_rtx_IF_THEN_ELSE (SImode, x, op1l, op2l);
- emit_insn (gen_rtx_SET (VOIDmode, dl, x));
-
- dh = gen_reg_rtx (SImode);
- x = gen_rtx_NE (VOIDmode, bh, const0_rtx);
- x = gen_rtx_IF_THEN_ELSE (SImode, x, op1h, op2h);
- emit_insn (gen_rtx_SET (VOIDmode, dh, x));
-
- /* Merge the two partial results back into a vector. */
-
- x = gen_rtx_VEC_CONCAT (V2SImode, dl, dh);
- emit_insn (gen_rtx_SET (VOIDmode, operands[0], x));
-}
-
/* Emit an integral vector conditional move. */
void
@@ -1695,15 +1628,6 @@ ia64_expand_vecint_cmov (rtx operands[])
bool negate;
rtx cmp, x, ot, of;
- /* Since we don't have unsigned V2SImode comparisons, it's more efficient
- to special-case them entirely. */
- if (mode == V2SImode
- && (code == GTU || code == GEU || code == LEU || code == LTU))
- {
- ia64_expand_vcondu_v2si (code, operands);
- return;
- }
-
cmp = gen_reg_rtx (mode);
negate = ia64_expand_vecint_compare (code, mode, cmp,
operands[4], operands[5]);