This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Expand floor and ceil inline with SSE on x86_64 and i?86
- From: Richard Guenther <rguenther at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sun, 29 Oct 2006 15:22:17 +0100 (CET)
- Subject: [PATCH] Expand floor and ceil inline with SSE on x86_64 and i?86
This patch adds the ability to expand the C99 floor{f,} and ceil{f,}
rounding functions inline using SSE math. Optabs and expanders for
387 math already exist, so this merely adds i386 backend helper functions
and adjusts the machine description accordingly.
The basic idea is that if we can truncate and convert to an integer
of sufficient precision (which we can for both DFmode and SFmode on
x86_64 and for SFmode on i686), we can use C pseudocode like
double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
x2 = (double)(long)x;
/* Compensate. Floor: */
if (x2 > x)
x2 -= 1;
/* Compensate. Ceil: */
if (x2 < x)
x2 += 1;
return x2;
(where TWO52 is 2 raised to the number of mantissa bits of the FP mode,
2**52 for DFmode and 2**23 for SFmode). This results in assembly like
movapd %xmm0, %xmm1
movsd .LC7(%rip), %xmm0
movapd %xmm1, %xmm2
andpd %xmm0, %xmm2
ucomisd .LC2(%rip), %xmm2
jae .L40
jp .L40
cvttsd2siq %xmm1, %rax
movsd .LC9(%rip), %xmm0
cvtsi2sdq %rax, %xmm2
cmpnlesd %xmm2, %xmm1
andpd %xmm0, %xmm1
addsd %xmm2, %xmm1
.L40:
movapd %xmm1, %xmm0
ret
for ceil for example (.LC7 is a mask for fabs, .LC2 is 2**52,
.LC9 is 1.0).
The trick using truncation makes this expansion valid even for
rounding modes other than round-to-nearest, so the only constraint
is !flag_trapping_math.
The patch adds a 2nd variant for i686 DFmode where cvttsd2siq is
not available. There we expand to
double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
xa = xa + TWO52 - TWO52;
x2 = copysign (xa, x);
/* Compensate. Floor: */
if (x2 > x)
x2 -= 1;
/* Compensate. Ceil: */
if (x2 < x)
x2 += 1;
return x2;
which is similar, but does the rounding and compensation differently.
The resulting asm is slightly more involved:
ceil_libcall:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movsd .LC6, %xmm0
movsd 8(%ebp), %xmm1
movsd .LC7, %xmm3
movapd %xmm1, %xmm2
andpd %xmm3, %xmm2
ucomisd %xmm2, %xmm0
jbe .L82
addsd %xmm0, %xmm2
movsd .LC8, %xmm3
subsd %xmm0, %xmm2
movapd %xmm1, %xmm0
andpd %xmm3, %xmm0
orpd %xmm0, %xmm2
movapd %xmm2, %xmm0
subsd %xmm1, %xmm0
xorpd %xmm1, %xmm1
cmpnlesd %xmm0, %xmm1
movsd .LC10, %xmm0
andpd %xmm0, %xmm1
addsd %xmm2, %xmm1
.L82:
movsd %xmm1, -8(%ebp)
fldl -8(%ebp)
leave
ret
but it should still be a benefit for SSE math as it does not require
a costly SSE to x87 register move and back (it also benchmarks
faster than a library call with x87 math). We might want to disable
this expansion for -Os, though?
Bootstrapped and tested on {x86_64,i686,ppc}-linux-gnu.
Ok for mainline?
Thanks,
Richard.
2006-08-23 Richard Guenther <rguenther@suse.de>
* config/i386/i386-protos.h (ix86_expand_floorceil): Declare.
(ix86_expand_floorceildf_32): Likewise.
* config/i386/i386.c (ix86_expand_sse_compare_mask): New
static helper function.
(ix86_expand_floorceil): Expander for floor and ceil to SSE
math.
(ix86_expand_floorceildf_32): Same for DFmode on 32bit archs.
* config/i386/i386.md (floordf2): Adjust to enable floor
expansion via ix86_expand_floorceil if TARGET_SSE_MATH and
-fno-trapping-math is enabled.
(floorsf2, ceildf2, ceilsf2): Likewise.
* config/i386/sse.md (sse_maskcmpsf3): New insn.
(sse2_maskcmpdf3): Likewise.
* gcc.target/i386/math-torture/ceil.c: New testcase.
* gcc.target/i386/math-torture/floor.c: Likewise.
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc.orig/config/i386/i386-protos.h
+++ gcc/config/i386/i386-protos.h
@@ -160,6 +160,8 @@ extern enum rtx_code ix86_reverse_condit
extern void ix86_expand_lround (rtx, rtx);
extern void ix86_expand_lfloorceil (rtx, rtx, bool);
extern void ix86_expand_rint (rtx, rtx);
+extern void ix86_expand_floorceil (rtx, rtx, bool);
+extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc.orig/config/i386/i386.c
+++ gcc/config/i386/i386.c
@@ -19252,6 +19252,33 @@ ix86_expand_sse_compare_and_jump (enum r
return label;
}
+/* Emit a mask generating SSE comparison instruction for OP0 and OP1 under
+ comparison code CODE. If SWAP_OPERANDS is true the two operands trade
+ places before the comparison is built. Returns the freshly allocated
+ pseudo, of the mode of the incoming OP0, that receives the mask. */
+static rtx
+ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
+ bool swap_operands)
+{
+ /* The mode is read from the incoming OP0, i.e. before any swap. */
+ enum machine_mode mode = GET_MODE (op0);
+ rtx lhs = swap_operands ? op1 : op0;
+ rtx rhs = swap_operands ? op0 : op1;
+ rtx mask = gen_reg_rtx (mode);
+ rtx cmp = gen_rtx_fmt_ee (code, mode, lhs, rhs);
+ rtx insn;
+
+ /* Anything that is not DFmode goes through the SFmode pattern. */
+ if (mode == DFmode)
+ insn = gen_sse2_maskcmpdf3 (mask, lhs, rhs, cmp);
+ else
+ insn = gen_sse_maskcmpsf3 (mask, lhs, rhs, cmp);
+
+ emit_insn (insn);
+
+ return mask;
+}
+
/* Generate and return a rtx of mode MODE for 2**n where n is the number
of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
static rtx
@@ -19365,4 +19392,117 @@ ix86_expand_rint (rtx operand0, rtx oper
emit_move_insn (operand0, res);
}
+/* Expand SSE2 sequence for computing floor (if DO_FLOOR) or ceil (otherwise)
+ from OPERAND1 storing into OPERAND0, without converting to an integer. */
+void
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ xa = xa + TWO52 - TWO52;
+ x2 = copysign (xa, x);
+ Compensate. Floor: if (x2 > x) x2 -= 1;
+ Compensate. Ceil: if (x2 < x) x2 -= -1;
+ return x2;
+ */
+ enum machine_mode mode = GET_MODE (operand0);
+ rtx xa, TWO52, tmp, label, one, res, mask;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa = xa + TWO52 - TWO52; the target passed to expand_simple_binop is
+ only a suggestion, so pick up the rtx it returns. */
+ xa = expand_simple_binop (mode, PLUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+ /* xa = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+ /* generate 1.0 for floor, -1.0 for ceil */
+ one = force_reg (mode, const_double_from_real_value (do_floor ? dconst1
+ : dconstm1, mode));
+
+ /* Compensate: xa = xa - (mask ? one : 0.0), where the mask is xa > operand1
+ for floor and xa < operand1 for ceil. Always subtract: adding +0.0 to a
+ -0.0 in xa would yield +0.0, e.g. for ceil (-0.5). */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+ emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp)));
+ tmp = expand_simple_binop (mode, MINUS, xa, tmp, res, 0, OPTAB_DIRECT);
+ if (tmp != res)
+ emit_move_insn (res, tmp);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing floor (if DO_FLOOR) or ceil (otherwise)
+ from OPERAND1 storing into OPERAND0, going through an integer conversion. */
+void
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ x2 = (double)(long)x;
+ Compensate. Floor: if (x2 > x) x2 -= 1;
+ Compensate. Ceil: if (x2 < x) x2 += 1;
+ return copysign (x2, x);
+ */
+ enum machine_mode mode = GET_MODE (operand0);
+ rtx xa, xi, TWO52, tmp, label, one, res, mask;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa = (double)(long)x */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, res, 0);
+ expand_float (xa, xi, 0);
+
+ /* generate 1.0 */
+ one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+ /* Compensate: res = xa - (xa > operand1) or res = xa + (xa < operand1) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+ emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp)));
+ tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+ xa, tmp, res, 0, OPTAB_DIRECT);
+ if (tmp != res) /* the target is only a suggestion */
+ emit_move_insn (res, tmp);
+
+ /* A zero result has lost the sign of the input; put it back. */
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
#include "gt-i386.h"
Index: gcc/config/i386/i386.md
===================================================================
--- gcc.orig/config/i386/i386.md
+++ gcc/config/i386/i386.md
@@ -17440,10 +17440,22 @@
(define_expand "floordf2"
[(use (match_operand:DF 0 "register_operand" ""))
(use (match_operand:DF 1 "register_operand" ""))]
- "TARGET_USE_FANCY_MATH_387
- && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
- && flag_unsafe_math_optimizations"
+ "(TARGET_USE_FANCY_MATH_387
+ && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
+ && flag_unsafe_math_optimizations)
+ || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)"
{
+ if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)
+ {
+ if (TARGET_64BIT)
+ ix86_expand_floorceil (operand0, operand1, true);
+ else
+ ix86_expand_floorceildf_32 (operand0, operand1, true);
+ }
+ else
+ {
rtx op0 = gen_reg_rtx (XFmode);
rtx op1 = gen_reg_rtx (XFmode);
@@ -17451,16 +17463,24 @@
emit_insn (gen_frndintxf2_floor (op0, op1));
emit_insn (gen_truncxfdf2_i387_noop (operands[0], op0));
+ }
DONE;
})
(define_expand "floorsf2"
[(use (match_operand:SF 0 "register_operand" ""))
(use (match_operand:SF 1 "register_operand" ""))]
- "TARGET_USE_FANCY_MATH_387
- && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
- && flag_unsafe_math_optimizations"
-{
+ "(TARGET_USE_FANCY_MATH_387
+ && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
+ && flag_unsafe_math_optimizations)
+ || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)"
+{
+ if (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)
+ ix86_expand_floorceil (operand0, operand1, true);
+ else
+ {
rtx op0 = gen_reg_rtx (XFmode);
rtx op1 = gen_reg_rtx (XFmode);
@@ -17468,6 +17488,7 @@
emit_insn (gen_frndintxf2_floor (op0, op1));
emit_insn (gen_truncxfsf2_i387_noop (operands[0], op0));
+ }
DONE;
})
@@ -17701,10 +17722,22 @@
(define_expand "ceildf2"
[(use (match_operand:DF 0 "register_operand" ""))
(use (match_operand:DF 1 "register_operand" ""))]
- "TARGET_USE_FANCY_MATH_387
- && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
- && flag_unsafe_math_optimizations"
+ "(TARGET_USE_FANCY_MATH_387
+ && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
+ && flag_unsafe_math_optimizations)
+ || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)"
{
+ if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)
+ {
+ if (TARGET_64BIT)
+ ix86_expand_floorceil (operand0, operand1, false);
+ else
+ ix86_expand_floorceildf_32 (operand0, operand1, false);
+ }
+ else
+ {
rtx op0 = gen_reg_rtx (XFmode);
rtx op1 = gen_reg_rtx (XFmode);
@@ -17712,16 +17745,24 @@
emit_insn (gen_frndintxf2_ceil (op0, op1));
emit_insn (gen_truncxfdf2_i387_noop (operands[0], op0));
+ }
DONE;
})
(define_expand "ceilsf2"
[(use (match_operand:SF 0 "register_operand" ""))
(use (match_operand:SF 1 "register_operand" ""))]
- "TARGET_USE_FANCY_MATH_387
- && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
- && flag_unsafe_math_optimizations"
-{
+ "(TARGET_USE_FANCY_MATH_387
+ && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
+ && flag_unsafe_math_optimizations)
+ || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)"
+{
+ if (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math)
+ ix86_expand_floorceil (operand0, operand1, false);
+ else
+ {
rtx op0 = gen_reg_rtx (XFmode);
rtx op1 = gen_reg_rtx (XFmode);
@@ -17729,6 +17770,7 @@
emit_insn (gen_frndintxf2_ceil (op0, op1));
emit_insn (gen_truncxfsf2_i387_noop (operands[0], op0));
+ }
DONE;
})
Index: gcc/config/i386/sse.md
===================================================================
--- gcc.orig/config/i386/sse.md
+++ gcc/config/i386/sse.md
@@ -733,6 +733,16 @@
[(set_attr "type" "ssecmp")
(set_attr "mode" "V4SF")])
+(define_insn "sse_maskcmpsf3"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (match_operator:SF 3 "sse_comparison_operator"
+ [(match_operand:SF 1 "register_operand" "0")
+ (match_operand:SF 2 "nonimmediate_operand" "xm")]))]
+ "TARGET_SSE"
+ "cmp%D3ss\t{%2, %0|%0, %2}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "mode" "SF")])
+
(define_insn "sse_vmmaskcmpv4sf3"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(vec_merge:V4SF
@@ -1718,6 +1728,16 @@
[(set_attr "type" "ssecmp")
(set_attr "mode" "V2DF")])
+(define_insn "sse2_maskcmpdf3"
+ [(set (match_operand:DF 0 "register_operand" "=x")
+ (match_operator:DF 3 "sse_comparison_operator"
+ [(match_operand:DF 1 "register_operand" "0")
+ (match_operand:DF 2 "nonimmediate_operand" "xm")]))]
+ "TARGET_SSE2"
+ "cmp%D3sd\t{%2, %0|%0, %2}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "mode" "DF")])
+
(define_insn "sse2_vmmaskcmpv2df3"
[(set (match_operand:V2DF 0 "register_operand" "=x")
(vec_merge:V2DF
Index: gcc/testsuite/gcc.target/i386/math-torture/ceil.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/ceil.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+ return __builtin_ceilf (x);
+}
+double testl (double x)
+{
+ return __builtin_ceil (x);
+}
+long double testll (long double x)
+{
+ return __builtin_ceill (x);
+}
+
Index: gcc/testsuite/gcc.target/i386/math-torture/floor.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/floor.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+ return __builtin_floorf (x);
+}
+double testl (double x)
+{
+ return __builtin_floor (x);
+}
+long double testll (long double x)
+{
+ return __builtin_floorl (x);
+}
+