This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Expand round inline with SSE on x86_64 and i?86
- From: Richard Guenther <rguenther at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sun, 29 Oct 2006 15:34:31 +0100 (CET)
- Subject: [PATCH] Expand round inline with SSE on x86_64 and i?86
This patch adds inline expansion of the C99 round function
using SSE math. Like the previous one, it is a backend-local change.
We expand round via the following pseudocode:
double xa = fabs (x);
if (!isless (xa, TWO52))
return x;
xa = (double)(long)(xa + nextafter (0.5, 0.0));
return copysign (xa, x);
which is basically (double)llround(x). This is not possible on i686
for DFmode, because the truncating conversion to a 64-bit integer
(cvttsd2siq) is only available on 64-bit targets, so the following
alternative expansion is used instead:
double xa = fabs (x), xa2, dxa, x2;
if (!isless (xa, TWO52))
  return x;
/* Using the absolute value and copying back sign makes
   -0.0 -> -0.0 correct. */
xa2 = xa + TWO52 - TWO52;
/* Compensate. */
dxa = xa2 - xa;
if (dxa <= -0.5)
  xa2 += 1;
else if (dxa > 0.5)
  xa2 -= 1;
x2 = copysign (xa2, x);
return x2;
The assembly produced for round on x86_64 is:
round_libcall:
.LFB2:
movapd %xmm0, %xmm2
movsd .LC7(%rip), %xmm0
movapd %xmm2, %xmm1
andpd %xmm0, %xmm1
ucomisd .LC2(%rip), %xmm1
jae .L52
jp .L52
addsd .LC5(%rip), %xmm1
andnpd %xmm2, %xmm0
cvttsd2siq %xmm1, %rax
cvtsi2sdq %rax, %xmm1
movapd %xmm1, %xmm2
orpd %xmm0, %xmm2
.L52:
movapd %xmm2, %xmm0
ret
Bootstrapped and tested on {x86_64,i686}-linux.
Ok for mainline?
Thanks,
Richard.
2006-08-23 Richard Guenther <rguenther@suse.de>
* config/i386/i386-protos.h (ix86_expand_round): Declare.
(ix86_expand_rounddf_32): Likewise.
* config/i386/i386.c (ix86_expand_round): New function expanding
round inline for SSE math and -fno-trapping-math.
(ix86_expand_rounddf_32): Same for DFmode on 32bit archs.
* config/i386/i386.md (rounddf2, roundsf2): New pattern expanding
round via ix86_expand_round.
* gcc.target/i386/math-torture/round.c: New testcase.
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc.orig/config/i386/i386-protos.h
+++ gcc/config/i386/i386-protos.h
@@ -162,6 +162,8 @@ extern void ix86_expand_lfloorceil (rtx,
extern void ix86_expand_rint (rtx, rtx);
extern void ix86_expand_floorceil (rtx, rtx, bool);
extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round (rtx, rtx);
+extern void ix86_expand_rounddf_32 (rtx, rtx);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc.orig/config/i386/i386.c
+++ gcc/config/i386/i386.c
@@ -19505,4 +19505,129 @@ ix86_expand_floorceil (rtx operand0, rtx
emit_move_insn (operand0, res);
}
+/* Expand an SSE sequence computing round (OPERAND1) and store the result
+   into OPERAND0.  This sequence works without relying on DImode
+   truncation via cvttsd2siq, which is only available on 64bit targets.
+
+   OPERAND0 is the destination; its machine mode selects the mode of the
+   whole computation.  OPERAND1 is the value to be rounded.  */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we expand below.
+
+       double xa = fabs (x), xa2, dxa, x2;
+       if (!isless (xa, TWO52))
+         return x;
+
+     Using the absolute value and copying back sign makes
+     -0.0 -> -0.0 correct.
+
+       xa2 = xa + TWO52 - TWO52;
+
+     Compensate.
+
+       dxa = xa2 - xa;
+       if (dxa <= -0.5)
+         xa2 += 1;
+       else if (dxa > 0.5)
+         xa2 -= 1;
+       x2 = copysign (xa2, x);
+       return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1).  MASK is filled in here and reused by the
+     copysign at the end.  */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa2 = xa + TWO52 - TWO52;
+     The target passed to expand_simple_binop is only a suggestion, so
+     always continue with the rtx it returns.  XA itself must stay
+     intact because it is needed again for DXA.  */
+  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0,
+                             OPTAB_DIRECT);
+  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0,
+                             OPTAB_DIRECT);
+
+  /* dxa = xa2 - xa; */
+  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0,
+                             OPTAB_DIRECT);
+
+  /* generate 0.5, 1.0 and -0.5 */
+  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0,
+                             OPTAB_DIRECT);
+  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, 0,
+                               OPTAB_DIRECT);
+
+  /* Compensate.  Each comparison yields a mask that is ANDed with 1.0,
+     giving either 1.0 or 0.0 to subtract from / add to XA2.  */
+  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, xa2, 0,
+                             OPTAB_DIRECT);
+  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, xa2, 0,
+                             OPTAB_DIRECT);
+
+  /* res = copysign (xa2, operand1) */
+  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+  /* Target of the early exit above; RES still holds the unmodified
+     input when we arrive here via the jump.  */
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand an SSE sequence computing round (OPERAND1) and store the result
+   into OPERAND0.
+
+   OPERAND0 is the destination; its machine mode selects the mode of the
+   whole computation.  OPERAND1 is the value to be rounded.  The sequence
+   goes through a float -> integer -> float conversion, using DImode for
+   DFmode and SImode otherwise.  */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+
+       double xa = fabs (x);
+       if (!isless (xa, TWO52))
+         return x;
+       xa = (double)(long)(xa + nextafter (0.5, 0.0));
+       return copysign (xa, x);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, TWO52, xa, label, xi, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1); if (!isless (xa, TWO52)) goto label;
+     MASK is filled in here and reused by the copysign at the end.  */
+  TWO52 = ix86_gen_TWO52 (mode);
+  xa = ix86_expand_sse_fabs (res, &mask);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* Load nextafter (0.5, 0.0), computed as 0.5 - 2**(-p-1) where p is
+     the precision of MODE.  The predecessor of 0.5 is used instead of
+     0.5 itself so that the largest value below 0.5 is not pushed up
+     to 1.0 by the addition.  */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* xa = xa + nextafter (0.5, 0.0)
+     The target passed to expand_simple_binop is only a suggestion, so
+     continue with the rtx it returns.  */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0,
+                            OPTAB_DIRECT);
+
+  /* xa = (double)(int64_t)xa for DFmode, via SImode otherwise.  */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, xa, 0);
+  expand_float (xa, xi, 0);
+
+  /* res = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+  /* Target of the early exit above; RES still holds the unmodified
+     input when we arrive here via the jump.  */
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
#include "gt-i386.h"
Index: gcc/config/i386/i386.md
===================================================================
--- gcc.orig/config/i386/i386.md
+++ gcc/config/i386/i386.md
@@ -17222,6 +17222,29 @@
DONE;
})
+;; Expand round for SFmode inline by calling ix86_expand_round.
+;; Enabled only when SSE_FLOAT_MODE_P (SFmode) and TARGET_SSE_MATH hold
+;; and both flag_trapping_math and flag_rounding_math are off.
+(define_expand "roundsf2"
+ [(match_operand:SF 0 "register_operand" "")
+ (match_operand:SF 1 "nonimmediate_operand" "")]
+ "SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math && !flag_rounding_math"
+{
+ ix86_expand_round (operand0, operand1);
+ DONE;
+})
+
+;; Expand round for DFmode inline.  With TARGET_64BIT this calls
+;; ix86_expand_round; otherwise it calls ix86_expand_rounddf_32.
+;; Enabled only when SSE_FLOAT_MODE_P (DFmode) and TARGET_SSE_MATH hold
+;; and both flag_trapping_math and flag_rounding_math are off.
+(define_expand "rounddf2"
+ [(match_operand:DF 0 "register_operand" "")
+ (match_operand:DF 1 "nonimmediate_operand" "")]
+ "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+ && !flag_trapping_math && !flag_rounding_math"
+{
+ if (TARGET_64BIT)
+ ix86_expand_round (operand0, operand1);
+ else
+ ix86_expand_rounddf_32 (operand0, operand1);
+ DONE;
+})
+
(define_insn_and_split "*fistdi2_1"
[(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
(unspec:DI [(match_operand:XF 1 "register_operand" "f,f")]
Index: gcc/testsuite/gcc.target/i386/math-torture/round.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/round.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x) /* float variant: calls __builtin_roundf.  */
+{
+ return __builtin_roundf (x);
+}
+double testl (double x) /* double variant: calls __builtin_round.  */
+{
+ return __builtin_round (x);
+}
+long double testll (long double x) /* long double variant: calls __builtin_roundl.  */
+{
+ return __builtin_roundl (x);
+}
+