This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][4.3] Expand lfloor/lceil inline for x86_64/i?86 SSE math
- From: Richard Guenther <rguenther at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Wed, 18 Oct 2006 16:21:51 +0200 (CEST)
- Subject: [PATCH][4.3] Expand lfloor/lceil inline for x86_64/i?86 SSE math
This third patch in the series adds expanders for lfloor and lceil. It
changes the existing optabs from optabs that only take the float mode to
conversion optabs (because in principle we can expand lfloor to SImode
directly - but we don't do at the moment because we don't have a
builtin for that). It basically avoids hassle with the tricks we play
for the x87 lfloor/lceil expanders. And it's more correct, of course.
[ We expand lfloor, for example, to
cvttsd2siq %xmm0, %rax
cvtsi2sdq %rax, %xmm1
leaq -1(%rax), %rdx
ucomisd %xmm0, %xmm1
cmova %rdx, %rax
ret
]
Bootstrapped and tested on {x86_64,i686}-linux-gnu.
Ok for 4.3? (I'll pause sending the series after this three, four
more to come for expanding rint, floor/ceil, round and trunc)
Thanks,
Richard.
--
Richard Guenther <rguenther@suse.de>
Novell / SUSE Labs
2006-08-23 Richard Guenther <rguenther@suse.de>
* genopinit.c (optabs): Change lfloor_optab and lceil_optab
to conversion optabs.
* optabs.c (init_optabs): Initialize lfloor_optab and lceil_optab
as conversion optab.
* optabs.h (enum optab_index): Remove OTI_lfloor and OTI_lceil.
(enum convert_optab_index): Add COI_lfloor and COI_lceil.
(lfloor_optab, lceil_optab): Adjust defines.
* builtins.c (expand_builtin_int_roundingfn): Adjust for
lfloor and lceil optabs now being conversion optabs.
* config/i386/i386-protos.h (ix86_expand_lfloorceil): Declare.
* config/i386/i386.c (ix86_expand_sse_compare_and_jump):
New static helper function.
(ix86_expand_lfloorceil): New function to expand lfloor and
lceil inline.
* config/i386/i386.md (lfloor<mode>2): Split into ...
(lfloorxf<mode>2): ... x87 variant
(lfloor<mode>di2, lfloor<mode>si2): ... and SSE variants
using ix86_expand_lfloorceil.
(lceil<mode>2, lceilxf<mode>2, lceil<mode>di2, lceil<mode>si2):
Likewise.
* doc/md.texi (lfloorMN, lceilMN): Document.
* gcc.target/i386/math-torture/lfloor.c: New testcase.
* gcc.target/i386/math-torture/lceil.c: Likewise.
Index: gcc/builtins.c
===================================================================
--- gcc.orig/builtins.c
+++ gcc/builtins.c
@@ -2235,7 +2235,7 @@ expand_builtin_sincos (tree exp)
static rtx
expand_builtin_int_roundingfn (tree exp, rtx target, rtx subtarget)
{
- optab builtin_optab;
+ convert_optab builtin_optab;
rtx op0, insns, tmp;
tree fndecl = get_callee_fndecl (exp);
tree arglist = TREE_OPERAND (exp, 1);
@@ -2270,44 +2270,37 @@ expand_builtin_int_roundingfn (tree exp,
/* Make a suitable register to place result in. */
mode = TYPE_MODE (TREE_TYPE (exp));
- /* Before working hard, check whether the instruction is available. */
- if (builtin_optab->handlers[(int) mode].insn_code != CODE_FOR_nothing)
- {
- target = gen_reg_rtx (mode);
-
- /* Wrap the computation of the argument in a SAVE_EXPR, as we may
- need to expand the argument again. This way, we will not perform
- side-effects more the once. */
- narg = builtin_save_expr (arg);
- if (narg != arg)
- {
- arg = narg;
- arglist = build_tree_list (NULL_TREE, arg);
- exp = build_function_call_expr (fndecl, arglist);
- }
-
- op0 = expand_expr (arg, subtarget, VOIDmode, 0);
+ target = gen_reg_rtx (mode);
- start_sequence ();
+ /* Wrap the computation of the argument in a SAVE_EXPR, as we may
+ need to expand the argument again. This way, we will not perform
+ side-effects more the once. */
+ narg = builtin_save_expr (arg);
+ if (narg != arg)
+ {
+ arg = narg;
+ arglist = build_tree_list (NULL_TREE, arg);
+ exp = build_function_call_expr (fndecl, arglist);
+ }
- /* Compute into TARGET.
- Set TARGET to wherever the result comes back. */
- target = expand_unop (mode, builtin_optab, op0, target, 0);
+ op0 = expand_expr (arg, subtarget, VOIDmode, 0);
- if (target != 0)
- {
- /* Output the entire sequence. */
- insns = get_insns ();
- end_sequence ();
- emit_insn (insns);
- return target;
- }
+ start_sequence ();
- /* If we were unable to expand via the builtin, stop the sequence
- (without outputting the insns). */
+ /* Compute into TARGET. */
+ if (expand_sfix_optab (target, op0, builtin_optab))
+ {
+ /* Output the entire sequence. */
+ insns = get_insns ();
end_sequence ();
+ emit_insn (insns);
+ return target;
}
+ /* If we were unable to expand via the builtin, stop the sequence
+ (without outputting the insns). */
+ end_sequence ();
+
/* Fall back to floating point rounding optab. */
fallback_fndecl = mathfn_built_in (TREE_TYPE (arg), fallback_fn);
/* We shouldn't get here on targets without TARGET_C99_FUNCTIONS.
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc.orig/config/i386/i386-protos.h
+++ gcc/config/i386/i386-protos.h
@@ -158,6 +158,7 @@ extern void ix86_emit_i387_log1p (rtx, r
extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode);
extern void ix86_expand_lround (rtx, rtx);
+extern void ix86_expand_lfloorceil (rtx, rtx, bool);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc.orig/config/i386/i386.c
+++ gcc/config/i386/i386.c
@@ -18999,6 +18999,36 @@ ix86_sse_copysign_to_positive (rtx resul
gen_rtx_IOR (mode, abs_value, sgn)));
}
+/* Expands a comparison of OP0 with OP1 using comparison code CODE,
+ swapping the operands if SWAP_OPERANDS is true. The expanded
+ code is a forward jump to a newly created label in case the
+ comparison is true. The generated label rtx is returned. */
+static rtx
+ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
+ bool swap_operands)
+{
+ rtx label, tmp;
+
+ if (swap_operands)
+ {
+ tmp = op0;
+ op0 = op1;
+ op1 = tmp;
+ }
+
+ label = gen_label_rtx ();
+ tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
+ emit_insn (gen_rtx_SET (VOIDmode, tmp,
+ gen_rtx_COMPARE (CCFPUmode, op0, op1)));
+ tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
+ tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+ JUMP_LABEL (tmp) = label;
+
+ return label;
+}
+
/* Expand SSE sequence for computing lround from OP1 storing
into OP0. */
void
@@ -19022,4 +19052,37 @@ ix86_expand_lround (rtx op0, rtx op1)
expand_fix (op0, adj, 0);
}
+/* Expand SSE2 sequence for computing lround from OPERAND1 storing
+ into OPERAND0. */
+void
+ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
+{
+ /* C code for the stuff we're doing below (for do_floor):
+ xi = (long)op1;
+ xi -= (double)xi > op1 ? 1 : 0;
+ return xi;
+ */
+ enum machine_mode fmode = GET_MODE (op1);
+ enum machine_mode imode = GET_MODE (op0);
+ rtx ireg, freg, label;
+
+ /* reg = (long)op1 */
+ ireg = gen_reg_rtx (imode);
+ expand_fix (ireg, op1, 0);
+
+ /* freg = (double)reg */
+ freg = gen_reg_rtx (fmode);
+ expand_float (freg, ireg, 0);
+
+ /* ireg = (freg > op1) ? ireg - 1 : ireg */
+ label = ix86_expand_sse_compare_and_jump (UNLE,
+ freg, op1, !do_floor);
+ expand_simple_binop (imode, do_floor ? MINUS : PLUS,
+ ireg, const1_rtx, ireg, 0, OPTAB_DIRECT);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (op0, ireg);
+}
+
#include "gt-i386.h"
Index: gcc/config/i386/i386.md
===================================================================
--- gcc.orig/config/i386/i386.md
+++ gcc/config/i386/i386.md
@@ -17150,7 +17150,7 @@
(use (match_dup 3))])]
"")
-(define_expand "lfloor<mode>2"
+(define_expand "lfloorxf<mode>2"
[(parallel [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "")
(unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")]
UNSPEC_FIST_FLOOR))
@@ -17160,6 +17160,26 @@
&& flag_unsafe_math_optimizations"
"")
+(define_expand "lfloor<mode>di2"
+ [(match_operand:DI 0 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF 1 "register_operand" "")]
+ "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && TARGET_64BIT
+ && !flag_trapping_math"
+{
+ ix86_expand_lfloorceil (operand0, operand1, true);
+ DONE;
+})
+
+(define_expand "lfloor<mode>si2"
+ [(match_operand:SI 0 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF 1 "register_operand" "")]
+ "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+ && !flag_trapping_math"
+{
+ ix86_expand_lfloorceil (operand0, operand1, true);
+ DONE;
+})
+
;; Rounding mode control word calculation could clobber FLAGS_REG.
(define_insn_and_split "frndintxf2_ceil"
[(set (match_operand:XF 0 "register_operand" "=f")
@@ -17391,7 +17411,7 @@
(use (match_dup 3))])]
"")
-(define_expand "lceil<mode>2"
+(define_expand "lceilxf<mode>2"
[(parallel [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "")
(unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")]
UNSPEC_FIST_CEIL))
@@ -17401,6 +17421,26 @@
&& flag_unsafe_math_optimizations"
"")
+(define_expand "lceil<mode>di2"
+ [(match_operand:DI 0 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF 1 "register_operand" "")]
+ "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && TARGET_64BIT
+ && !flag_trapping_math"
+{
+ ix86_expand_lfloorceil (operand0, operand1, false);
+ DONE;
+})
+
+(define_expand "lceil<mode>si2"
+ [(match_operand:SI 0 "nonimmediate_operand" "")
+ (match_operand:SSEMODEF 1 "register_operand" "")]
+ "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+ && !flag_trapping_math"
+{
+ ix86_expand_lfloorceil (operand0, operand1, false);
+ DONE;
+})
+
;; Rounding mode control word calculation could clobber FLAGS_REG.
(define_insn_and_split "frndintxf2_trunc"
[(set (match_operand:XF 0 "register_operand" "=f")
Index: gcc/doc/md.texi
===================================================================
--- gcc.orig/doc/md.texi
+++ gcc/doc/md.texi
@@ -3613,6 +3613,18 @@ Convert operand 1 (valid for floating po
point mode @var{n} as a signed number rounding to nearest and away
from zero and store in operand 0 (which has mode @var{n}).
+@cindex @code{lfloor@var{m}@var{n}2}
+@item @samp{lfloor@var{m}2}
+Convert operand 1 (valid for floating point mode @var{m}) to fixed
+point mode @var{n} as a signed number rounding down and store in
+operand 0 (which has mode @var{n}).
+
+@cindex @code{lceil@var{m}@var{n}2}
+@item @samp{lceil@var{m}2}
+Convert operand 1 (valid for floating point mode @var{m}) to fixed
+point mode @var{n} as a signed number rounding up and store in
+operand 0 (which has mode @var{n}).
+
@cindex @code{copysign@var{m}3} instruction pattern
@item @samp{copysign@var{m}3}
Store a value with the magnitude of operand 1 and the sign of operand
Index: gcc/genopinit.c
===================================================================
--- gcc.orig/genopinit.c
+++ gcc/genopinit.c
@@ -120,9 +120,9 @@ static const char * const optabs[] =
"copysign_optab->handlers[$A].insn_code = CODE_FOR_$(copysign$F$a3$)",
"sqrt_optab->handlers[$A].insn_code = CODE_FOR_$(sqrt$a2$)",
"floor_optab->handlers[$A].insn_code = CODE_FOR_$(floor$a2$)",
- "lfloor_optab->handlers[$A].insn_code = CODE_FOR_$(lfloor$a2$)",
+ "lfloor_optab->handlers[$B][$A].insn_code = CODE_FOR_$(lfloor$F$a$I$b2$)",
"ceil_optab->handlers[$A].insn_code = CODE_FOR_$(ceil$a2$)",
- "lceil_optab->handlers[$A].insn_code = CODE_FOR_$(lceil$a2$)",
+ "lceil_optab->handlers[$B][$A].insn_code = CODE_FOR_$(lceil$F$a$I$b2$)",
"round_optab->handlers[$A].insn_code = CODE_FOR_$(round$a2$)",
"btrunc_optab->handlers[$A].insn_code = CODE_FOR_$(btrunc$a2$)",
"nearbyint_optab->handlers[$A].insn_code = CODE_FOR_$(nearbyint$a2$)",
Index: gcc/optabs.c
===================================================================
--- gcc.orig/optabs.c
+++ gcc/optabs.c
@@ -5299,9 +5299,7 @@ init_optabs (void)
parity_optab = init_optab (PARITY);
sqrt_optab = init_optab (SQRT);
floor_optab = init_optab (UNKNOWN);
- lfloor_optab = init_optab (UNKNOWN);
ceil_optab = init_optab (UNKNOWN);
- lceil_optab = init_optab (UNKNOWN);
round_optab = init_optab (UNKNOWN);
btrunc_optab = init_optab (UNKNOWN);
nearbyint_optab = init_optab (UNKNOWN);
@@ -5366,6 +5364,8 @@ init_optabs (void)
ufloat_optab = init_convert_optab (UNSIGNED_FLOAT);
lrint_optab = init_convert_optab (UNKNOWN);
lround_optab = init_convert_optab (UNKNOWN);
+ lfloor_optab = init_convert_optab (UNKNOWN);
+ lceil_optab = init_convert_optab (UNKNOWN);
for (i = 0; i < NUM_MACHINE_MODES; i++)
{
@@ -5489,6 +5489,10 @@ init_optabs (void)
MODE_INT, MODE_FLOAT);
init_interclass_conv_libfuncs (lround_optab, "lround",
MODE_INT, MODE_FLOAT);
+ init_interclass_conv_libfuncs (lfloor_optab, "lfloor",
+ MODE_INT, MODE_FLOAT);
+ init_interclass_conv_libfuncs (lceil_optab, "lceil",
+ MODE_INT, MODE_FLOAT);
/* sext_optab is also used for FLOAT_EXTEND. */
init_intraclass_conv_libfuncs (sext_optab, "extend", MODE_FLOAT, true);
Index: gcc/optabs.h
===================================================================
--- gcc.orig/optabs.h
+++ gcc/optabs.h
@@ -189,9 +189,7 @@ enum optab_index
OTI_log1p,
/* Rounding functions */
OTI_floor,
- OTI_lfloor,
OTI_ceil,
- OTI_lceil,
OTI_btrunc,
OTI_round,
OTI_nearbyint,
@@ -337,9 +335,7 @@ extern GTY(()) optab optab_table[OTI_MAX
#define log2_optab (optab_table[OTI_log2])
#define log1p_optab (optab_table[OTI_log1p])
#define floor_optab (optab_table[OTI_floor])
-#define lfloor_optab (optab_table[OTI_lfloor])
#define ceil_optab (optab_table[OTI_ceil])
-#define lceil_optab (optab_table[OTI_lceil])
#define btrunc_optab (optab_table[OTI_btrunc])
#define round_optab (optab_table[OTI_round])
#define nearbyint_optab (optab_table[OTI_nearbyint])
@@ -407,6 +403,8 @@ enum convert_optab_index
COI_lrint,
COI_lround,
+ COI_lfloor,
+ COI_lceil,
COI_MAX
};
@@ -424,6 +422,8 @@ extern GTY(()) convert_optab convert_opt
#define ufloat_optab (convert_optab_table[COI_ufloat])
#define lrint_optab (convert_optab_table[COI_lrint])
#define lround_optab (convert_optab_table[COI_lround])
+#define lfloor_optab (convert_optab_table[COI_lfloor])
+#define lceil_optab (convert_optab_table[COI_lceil])
/* These arrays record the insn_code of insns that may be needed to
perform input and output reloads of special objects. They provide a
Index: gcc/testsuite/gcc.target/i386/math-torture/lceil.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/lceil.c
@@ -0,0 +1,26 @@
+/* { dg-do assemble } */
+
+long testlf (float x)
+{
+ return __builtin_lceilf (x);
+}
+long testl (double x)
+{
+ return __builtin_lceil (x);
+}
+long testll (long double x)
+{
+ return __builtin_lceill (x);
+}
+long long testllf (float x)
+{
+ return __builtin_llceilf (x);
+}
+long long testll_ (double x)
+{
+ return __builtin_llceil (x);
+}
+long long testlll (long double x)
+{
+ return __builtin_llceill (x);
+}
Index: gcc/testsuite/gcc.target/i386/math-torture/lfloor.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/lfloor.c
@@ -0,0 +1,26 @@
+/* { dg-do assemble } */
+
+long testlf (float x)
+{
+ return __builtin_lfloorf (x);
+}
+long testl (double x)
+{
+ return __builtin_lfloor (x);
+}
+long testll (long double x)
+{
+ return __builtin_lfloorl (x);
+}
+long long testllf (float x)
+{
+ return __builtin_llfloorf (x);
+}
+long long testll_ (double x)
+{
+ return __builtin_llfloor (x);
+}
+long long testlll (long double x)
+{
+ return __builtin_llfloorl (x);
+}