This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH] Expand floor and ceil inline with SSE on x86_64 and i?86

From: Richard Guenther <rguenther at suse dot de>
To: gcc-patches at gcc dot gnu dot org
Date: Sun, 29 Oct 2006 15:22:17 +0100 (CET)
Subject: [PATCH] Expand floor and ceil inline with SSE on x86_64 and i?86
This patch adds the ability to expand the C99 floor{f,} and ceil{f,}
rounding functions inline using SSE math.  Optabs and expanders for
387 math already exist, so this merely adds i386 backend helper functions
and adjusts the machine description accordingly.

The basic idea is that if we can truncate and convert to an integer of
sufficient width (which we can for both DFmode and SFmode on x86_64,
and for SFmode on i686), we can use C pseudocode like

       double xa = fabs (x), x2;
       if (!isless (xa, TWO52))
         return x;
       x2 = (double)(long)x;
       /* Compensate.  Floor: */
       if (x2 > x)
         x2 -= 1;
       /* Compensate.  Ceil: */
       if (x2 < x)
         x2 += 1;
       return x2;

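(where TWO52 is 2**n, with n the number of mantissa bits of the FP mode:
2**52 for DFmode and 2**23 for SFmode; values of at least that magnitude
are already integral and are returned unchanged).  This results in
assembly like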

        movapd  %xmm0, %xmm1
        movsd   .LC7(%rip), %xmm0
        movapd  %xmm1, %xmm2
        andpd   %xmm0, %xmm2
        ucomisd .LC2(%rip), %xmm2
        jae     .L40
        jp      .L40
        cvttsd2siq      %xmm1, %rax
        movsd   .LC9(%rip), %xmm0
        cvtsi2sdq       %rax, %xmm2
        cmpnlesd        %xmm2, %xmm1
        andpd   %xmm0, %xmm1
        addsd   %xmm2, %xmm1
.L40:
        movapd  %xmm1, %xmm0
        ret

for ceil, for example (.LC7 is the mask for fabs, .LC2 is 2**52, and
.LC9 is 1.0).

The trick using truncation makes this expansion valid even for
rounding modes other than round-to-nearest, so the only constraint
is !flag_trapping_math.

The patch adds a second variant for DFmode on i686, where cvttsd2siq is
not available.  There we expand to

        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);
        /* Compensate.  Floor: */
        if (x2 > x)
          x2 -= 1;
        /* Compensate.  Ceil: */
        if (x2 < x)
          x2 += 1;
        return x2;

which is similar, but does the rounding and compensation differently.
The resulting asm is slightly more involved:

ceil_libcall:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        movsd   .LC6, %xmm0
        movsd   8(%ebp), %xmm1
        movsd   .LC7, %xmm3
        movapd  %xmm1, %xmm2
        andpd   %xmm3, %xmm2
        ucomisd %xmm2, %xmm0
        jbe     .L82
        addsd   %xmm0, %xmm2
        movsd   .LC8, %xmm3
        subsd   %xmm0, %xmm2
        movapd  %xmm1, %xmm0
        andpd   %xmm3, %xmm0
        orpd    %xmm0, %xmm2
        movapd  %xmm2, %xmm0
        subsd   %xmm1, %xmm0
        xorpd   %xmm1, %xmm1
        cmpnlesd        %xmm0, %xmm1
        movsd   .LC10, %xmm0
        andpd   %xmm0, %xmm1
        addsd   %xmm2, %xmm1
.L82:
        movsd   %xmm1, -8(%ebp)
        fldl    -8(%ebp)
        leave
        ret

but it should still be a benefit for SSE math, as it does not require
a costly SSE to x87 register move and back (it also benchmarks faster
than a library call with x87 math).  Should we perhaps disable this
expansion for -Os, though?

Bootstrapped and tested on {x86_64,i686,ppc}-linux-gnu.

Ok for mainline?

Thanks,
Richard.

2006-08-23  Richard Guenther  <rguenther@suse.de>

	* config/i386/i386-protos.h (ix86_expand_floorceil): Declare.
	(ix86_expand_floorceildf_32): Likewise.
	* config/i386/i386.c (ix86_expand_sse_compare_mask): New
	static helper function.
	(ix86_expand_floorceil): Expander for floor and ceil to SSE
	math.
	(ix86_expand_floorceildf_32): Same for DFmode on 32bit archs.
	* config/i386/i386.md (floordf2): Adjust to enable floor
	expansion via ix86_expand_floorceil if TARGET_SSE_MATH and
	-fno-trapping-math is enabled.
	(floorsf2, ceildf2, ceilsf2): Likewise.
	* config/i386/sse.md (sse_maskcmpsf3): New insn.
	(sse2_maskcmpdf3): Likewise.

	* gcc.target/i386/math-torture/ceil.c: New testcase.
	* gcc.target/i386/math-torture/floor.c: Likewise.


Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc.orig/config/i386/i386-protos.h
+++ gcc/config/i386/i386-protos.h
@@ -160,6 +160,8 @@ extern enum rtx_code ix86_reverse_condit
 extern void ix86_expand_lround (rtx, rtx);
 extern void ix86_expand_lfloorceil (rtx, rtx, bool);
 extern void ix86_expand_rint (rtx, rtx);
+extern void ix86_expand_floorceil (rtx, rtx, bool);
+extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc.orig/config/i386/i386.c
+++ gcc/config/i386/i386.c
@@ -19252,6 +19252,33 @@ ix86_expand_sse_compare_and_jump (enum r
   return label;
 }
 
+/* Emit a mask generating SSE compare of OP0 with OP1 using comparison code
+   CODE; the DFmode insn is used if OP0 is DFmode, else the SFmode one.  OP0
+   and OP1 are exchanged first if SWAP_OPERANDS.  Returns the mask register.  */
+static rtx
+ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
+			      bool swap_operands)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx mask = gen_reg_rtx (mode);
+
+  if (swap_operands)
+    {
+      rtx tmp = op0;
+      op0 = op1;
+      op1 = tmp;
+    }
+
+  if (mode == DFmode)
+    emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1,
+				    gen_rtx_fmt_ee (code, mode, op0, op1)));
+  else
+    emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1,
+				   gen_rtx_fmt_ee (code, mode, op0, op1)));
+
+  return mask;
+}
+
 /* Generate and return a rtx of mode MODE for 2**n where n is the number
    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
 static rtx
@@ -19365,4 +19392,117 @@ ix86_expand_rint (rtx operand0, rtx oper
   emit_move_insn (operand0, res);
 }
 
+/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil (DO_FLOOR
+   false) of OPERAND1 into OPERAND0 without converting to an integer mode.  */
+void
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+        double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+        xa = xa + TWO52 - TWO52;
+        x2 = copysign (xa, x);
+     Compensate.  Floor:
+        if (x2 > x)
+          x2 -= 1;
+     Compensate.  Ceil:
+        if (x2 < x)
+          x2 += 1;
+        return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, TWO52, tmp, label, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1); MASK is handed on to the copysign below.  */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = xa + TWO52 - TWO52; the target is only a hint, so keep the result.  */
+  xa = expand_simple_binop (mode, PLUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+  /* xa = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: floor subtracts 1.0 if xa > res, ceil adds 1.0 if res > xa.  */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp)));
+  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing floor (DO_FLOOR true) or ceil (DO_FLOOR
+   false) of OPERAND1 into OPERAND0 via truncation to an integer mode.  */
+void
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+	double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+	x2 = (double)(long)x;
+     Compensate.  Floor:
+	if (x2 > x)
+	  x2 -= 1;
+     Compensate.  Ceil:
+	if (x2 < x)
+	  x2 += 1;
+	return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xi, TWO52, tmp, label, one, res;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, NULL);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = (double)(long)x; NOTE(review): this loses the sign of a zero result.  */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, res, 0);
+  expand_float (xa, xi, 0);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: floor subtracts 1.0 if xa > res, ceil adds 1.0 if res > xa.  */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp)));
+  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
 #include "gt-i386.h"
Index: gcc/config/i386/i386.md
===================================================================
--- gcc.orig/config/i386/i386.md
+++ gcc/config/i386/i386.md
@@ -17440,10 +17440,22 @@
 (define_expand "floordf2"
   [(use (match_operand:DF 0 "register_operand" ""))
    (use (match_operand:DF 1 "register_operand" ""))]
-  "TARGET_USE_FANCY_MATH_387
-   && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
-   && flag_unsafe_math_optimizations"
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
 {
+  if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    {
+      if (TARGET_64BIT)
+	ix86_expand_floorceil (operand0, operand1, true);
+      else
+	ix86_expand_floorceildf_32 (operand0, operand1, true);
+    }
+  else
+    {
   rtx op0 = gen_reg_rtx (XFmode);
   rtx op1 = gen_reg_rtx (XFmode);
 
@@ -17451,16 +17463,24 @@
   emit_insn (gen_frndintxf2_floor (op0, op1));
 
   emit_insn (gen_truncxfdf2_i387_noop (operands[0], op0));
+    }
   DONE;
 })
 
 (define_expand "floorsf2"
   [(use (match_operand:SF 0 "register_operand" ""))
    (use (match_operand:SF 1 "register_operand" ""))]
-  "TARGET_USE_FANCY_MATH_387
-   && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
-   && flag_unsafe_math_optimizations"
-{
+  "(TARGET_USE_FANCY_MATH_387
+    && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
+{
+  if (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    ix86_expand_floorceil (operand0, operand1, true);
+  else
+    {
   rtx op0 = gen_reg_rtx (XFmode);
   rtx op1 = gen_reg_rtx (XFmode);
 
@@ -17468,6 +17488,7 @@
   emit_insn (gen_frndintxf2_floor (op0, op1));
 
   emit_insn (gen_truncxfsf2_i387_noop (operands[0], op0));
+    }
   DONE;
 })
 
@@ -17701,10 +17722,22 @@
 (define_expand "ceildf2"
   [(use (match_operand:DF 0 "register_operand" ""))
    (use (match_operand:DF 1 "register_operand" ""))]
-  "TARGET_USE_FANCY_MATH_387
-   && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
-   && flag_unsafe_math_optimizations"
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(TARGET_SSE2 && TARGET_SSE_MATH) || TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
 {
+  if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    {
+      if (TARGET_64BIT)
+	ix86_expand_floorceil (operand0, operand1, false);
+      else
+	ix86_expand_floorceildf_32 (operand0, operand1, false);
+    }
+  else
+    {
   rtx op0 = gen_reg_rtx (XFmode);
   rtx op1 = gen_reg_rtx (XFmode);
 
@@ -17712,16 +17745,24 @@
   emit_insn (gen_frndintxf2_ceil (op0, op1));
 
   emit_insn (gen_truncxfdf2_i387_noop (operands[0], op0));
+    }
   DONE;
 })
 
 (define_expand "ceilsf2"
   [(use (match_operand:SF 0 "register_operand" ""))
    (use (match_operand:SF 1 "register_operand" ""))]
-  "TARGET_USE_FANCY_MATH_387
-   && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
-   && flag_unsafe_math_optimizations"
-{
+  "(TARGET_USE_FANCY_MATH_387
+    && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
+{
+  if (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    ix86_expand_floorceil (operand0, operand1, false);
+  else
+    {
   rtx op0 = gen_reg_rtx (XFmode);
   rtx op1 = gen_reg_rtx (XFmode);
 
@@ -17729,6 +17770,7 @@
   emit_insn (gen_frndintxf2_ceil (op0, op1));
 
   emit_insn (gen_truncxfsf2_i387_noop (operands[0], op0));
+    }
   DONE;
 })
 
Index: gcc/config/i386/sse.md
===================================================================
--- gcc.orig/config/i386/sse.md
+++ gcc/config/i386/sse.md
@@ -733,6 +733,16 @@
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "V4SF")])
 
+(define_insn "sse_maskcmpsf3"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(match_operator:SF 3 "sse_comparison_operator"
+		[(match_operand:SF 1 "register_operand" "0")
+		 (match_operand:SF 2 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE"
+  "cmp%D3ss\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "SF")])
+
 (define_insn "sse_vmmaskcmpv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(vec_merge:V4SF
@@ -1718,6 +1728,16 @@
   [(set_attr "type" "ssecmp")
    (set_attr "mode" "V2DF")])
 
+(define_insn "sse2_maskcmpdf3"
+  [(set (match_operand:DF 0 "register_operand" "=x")
+	(match_operator:DF 3 "sse_comparison_operator"
+		[(match_operand:DF 1 "register_operand" "0")
+		 (match_operand:DF 2 "nonimmediate_operand" "xm")]))]
+  "TARGET_SSE2"
+  "cmp%D3sd\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "mode" "DF")])
+
 (define_insn "sse2_vmmaskcmpv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(vec_merge:V2DF
Index: gcc/testsuite/gcc.target/i386/math-torture/ceil.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/ceil.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+  return __builtin_ceilf (x);
+}
+double testl (double x)
+{
+  return __builtin_ceil (x);
+}
+long double testll (long double x)
+{
+  return __builtin_ceill (x);
+}
+
Index: gcc/testsuite/gcc.target/i386/math-torture/floor.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.target/i386/math-torture/floor.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+  return __builtin_floorf (x);
+}
+double testl (double x)
+{
+  return __builtin_floor (x);
+}
+long double testll (long double x)
+{
+  return __builtin_floorl (x);
+}
+
Follow-Ups:
- Re: [PATCH] Expand floor and ceil inline with SSE on x86_64 and i?86
  - From: Roger Sayle
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]