This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH] Expand round inline with SSE on x86_64 and i?86

From: Richard Guenther <rguenther at suse dot de>
To: gcc-patches at gcc dot gnu dot org
Date: Sun, 29 Oct 2006 15:34:31 +0100 (CET)
Subject: [PATCH] Expand round inline with SSE on x86_64 and i?86
This patch adds inline expansion of the C99 round function
using SSE math.  Like the previous one, it is a backend local change.
We expand round via the following pseudocode:

        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);

which is basically (double)llround(x).  This is not possible on i686
for DFmode, because we cannot truncate and convert to integer with
enough precision there, so the following alternative expansion is used instead:

        double xa = fabs (x), xa2, dxa, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
     Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;

The assembly produced for round on x86_64 is:

round_libcall:
.LFB2:
        movapd  %xmm0, %xmm2
        movsd   .LC7(%rip), %xmm0
        movapd  %xmm2, %xmm1
        andpd   %xmm0, %xmm1
        ucomisd .LC2(%rip), %xmm1
        jae     .L52
        jp      .L52
        addsd   .LC5(%rip), %xmm1
        andnpd  %xmm2, %xmm0
        cvttsd2siq      %xmm1, %rax
        cvtsi2sdq       %rax, %xmm1
        movapd  %xmm1, %xmm2
        orpd    %xmm0, %xmm2
.L52:
        movapd  %xmm2, %xmm0
        ret

Bootstrapped and tested on {x86_64,i686}-linux.

Ok for mainline?

Thanks,
Richard.

2006-08-23  Richard Guenther  <rguenther@suse.de>

	* config/i386/i386-protos.h (ix86_expand_round): Declare.
	(ix86_expand_rounddf_32): Likewise.
	* config/i386/i386.c (ix86_expand_round): New function expanding
	round inline for SSE math and -fno-trapping-math.
	(ix86_expand_rounddf_32): Same for DFmode on 32bit archs.
	* config/i386/i386.md (rounddf2, roundsf2): New patterns expanding
	round via ix86_expand_round and ix86_expand_rounddf_32.

	* gcc.target/i386/math-torture/round.c: New testcase.


Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc.orig/config/i386/i386-protos.h
+++ gcc/config/i386/i386-protos.h
@@ -162,6 +162,8 @@ extern void ix86_expand_lfloorceil (rtx,
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round (rtx, rtx);
+extern void ix86_expand_rounddf_32 (rtx, rtx);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc.orig/config/i386/i386.c
+++ gcc/config/i386/i386.c
@@ -19505,4 +19505,129 @@ ix86_expand_floorceil (rtx operand0, rtx
   emit_move_insn (operand0, res);
 }
 
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  This sequence works without relying on the DImode
+   truncation via cvttsd2siq, which is only available on 64bit targets.  */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we expand below.
+        double xa = fabs (x), xa2, dxa, x2;
+        if (!isless (xa, TWO52))
+          return x;
+     Using the absolute value and copying back sign makes
+     -0.0 -> -0.0 correct.
+        xa2 = xa + TWO52 - TWO52;
+     Compensate.
+        dxa = xa2 - xa;
+        if (dxa <= -0.5)
+          xa2 += 1;
+        else if (dxa > 0.5)
+          xa2 -= 1;
+        x2 = copysign (xa2, x);
+        return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa2 = xa + TWO52 - TWO52; */
+  xa2 = gen_reg_rtx (mode);
+  expand_simple_binop (mode, PLUS, xa, TWO52, xa2, 0, OPTAB_DIRECT);
+  expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+  /* dxa = xa2 - xa; */
+  dxa = gen_reg_rtx (mode);
+  expand_simple_binop (mode, MINUS, xa2, xa, dxa, 0, OPTAB_DIRECT);
+
+  /* generate 0.5, 1.0 (as 0.5 + 0.5) and -0.5 (as 0.5 - 1.0) */
+  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+  one = gen_reg_rtx (mode);
+  expand_simple_binop (mode, PLUS, half, half, one, 0, OPTAB_DIRECT);
+  mhalf = gen_reg_rtx (mode);
+  expand_simple_binop (mode, MINUS, half, one, mhalf, 0, OPTAB_DIRECT);
+
+  /* Compensate.  TMP is assigned from ix86_expand_sse_compare_mask
+     below, so it needs no pseudo of its own allocated here.  */
+  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  expand_simple_binop (mode, MINUS, xa2, tmp, xa2, 0, OPTAB_DIRECT);
+  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  expand_simple_binop (mode, PLUS, xa2, tmp, xa2, 0, OPTAB_DIRECT);
+
+  /* res = copysign (xa2, operand1) */
+  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+  /* Target of the early exit above; RES still holds the input there.  */
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0, using an add and a float->integer->float round trip.  */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+        double xa = fabs (x);
+        if (!isless (xa, TWO52))
+          return x;
+        xa = (double)(long)(xa + nextafter (0.5, 0.0));
+        return copysign (xa, x);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, TWO52, xa, label, xi, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+  /* xa = fabs (x); if (!isless (xa, TWO52)) goto label; */
+  TWO52 = ix86_gen_TWO52 (mode);
+  xa = ix86_expand_sse_fabs (res, &mask);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* pred_half = nextafter (0.5, 0.0), built as 0.5 - 2**(-fmt->p - 1) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* xa = xa + nextafter (0.5, 0.0) */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  expand_simple_binop (mode, PLUS, xa, half, xa, 0, OPTAB_DIRECT);
+
+  /* xa = (mode)(integer)xa, via DImode for DFmode and SImode otherwise */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, xa, 0);
+  expand_float (xa, xi, 0);
+
+  /* res = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
 #include "gt-i386.h"
Index: gcc/config/i386/i386.md
===================================================================
--- gcc.orig/config/i386/i386.md
+++ gcc/config/i386/i386.md
@@ -17222,6 +17222,29 @@
   DONE;
 })
 
+(define_expand "roundsf2"
+  [(match_operand:SF 0 "register_operand" "")
+   (match_operand:SF 1 "nonimmediate_operand" "")]
+  "SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+   && !flag_trapping_math && !flag_rounding_math"
+{
+  ix86_expand_round (operand0, operand1);
+  DONE;
+})
+
+(define_expand "rounddf2"
+  [(match_operand:DF 0 "register_operand" "")
+   (match_operand:DF 1 "nonimmediate_operand" "")]
+  "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+   && !flag_trapping_math && !flag_rounding_math"
+{
+  if (TARGET_64BIT)
+    ix86_expand_round (operand0, operand1);
+  else
+    ix86_expand_rounddf_32 (operand0, operand1);
+  DONE;
+})
+
 (define_insn_and_split "*fistdi2_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
 	(unspec:DI [(match_operand:XF 1 "register_operand" "f,f")]
Index: gcc/testsuite/gcc.target/i386/math-torture/round.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/round.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+  return __builtin_roundf (x);
+}
+double testl (double x)
+{
+  return __builtin_round (x);
+}
+long double testll (long double x)
+{
+  return __builtin_roundl (x);
+}
+
Follow-Ups:
- Re: [PATCH] Expand round inline with SSE on x86_64 and i?86
  - From: Roger Sayle
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]