This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Inline round for IA64
- From: Canqun Yang <canqun at nudt dot edu dot cn>
- To: Richard Henderson <rth at redhat dot com>, gcc-patches at gcc dot gnu dot org,Jim Wilson <wilson at specifixinc dot com>,Toon Moene <toon at moene dot indiv dot nluug dot nl>
- Date: Mon, 2 Feb 2004 10:43:33 +0800 (HKT)
- Subject: Re: Inline round for IA64
- Reply-to: Canqun Yang <canqun at nudt dot edu dot cn>
Richard Henderson <rth@redhat.com>:
> As mentioned by Jim, you need to protect against
overflow.
>
I worked out another version that deals with overflow. I
have appended the patch to this mail, with a testcase.
round is inlined like this:
float
round (float a)
{
float y = a;
/* Use IA64 'fabs' instruction. */
if (abs(a) < (1UL << 63))
{
if (a < 0)
y = y - 0.5;
else
y = y + 0.5;
/* Truncate to integer value.
Use IA64 'fcvt.trunc' instruction. */
y = fix (y);
/* Convert to floating point value.
Use IA64 'fcvt.xf' instruction. */
y = float (y);
}
return y;
}
> In addition, one needs to protect against the
rounding mode being set
> to something other than round-to-nearest. This,
actually, is probably
> a non-starter when it comes to inlining this function.
>
> Are you sure you want to implement round, and not
rint or nearbyint?
>
To implement round.
> Finally, there's very little of this that's specific
to ia64. Indeed,
> none of it. If you were to generate this:
>
> float rint(float orig)
> {
> float two_exp_p = 1 << FLT_MANT_DIG;
> float x = fabs(orig);
> if (x < two_exp_p)
> {
> x += two_exp_p;
> x -= two_exp_p;
> x = copysignf (x, orig);
> }
> else
> x = orig;
> return x;
> }
>
> in rtl from a function in builtins.c, every target
would benefit.
>
>
The motivation for inlining round is to speed up the
SPEC2000 program 189.lucas on the IA64 platform.
As gfortran converts the Fortran 95 intrinsic dnint to
round, inlining round speeds up 189.lucas by
nearly 15%.
There may be three ways to achieve this:
1. Modify the implementation of the intrinsic dnint in
gfortran. But other languages will not benefit.
2. Expand round in builtins.c. We would still need to expand
the function call copysignf, as in the case of rint.
3. Expand round in *.md for a specific target.
So, which one is the best?
> r~
>
Canqun Yang
Common subdirectories: /home/ycq/ia64/CVS and ia64/CVS
diff -c /home/ycq/ia64/ia64.c ia64/ia64.c
*** /home/ycq/ia64/ia64.c Tue Jan 27 14:48:11 2004
--- ia64/ia64.c Sat Jan 31 17:09:33 2004
***************
*** 5308,5314 ****
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
case UNSPEC_SETF_EXP:
! case UNSPEC_ADDP4:
case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
--- 5308,5315 ----
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
case UNSPEC_SETF_EXP:
! case UNSPEC_SETF_SIG:
! case UNSPEC_ADDP4:
case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
diff -c /home/ycq/ia64/ia64.h ia64/ia64.h
*** /home/ycq/ia64/ia64.h Mon Jan 26 07:32:42 2004
--- ia64/ia64.h Sat Jan 31 16:59:10 2004
***************
*** 92,97 ****
--- 92,99 ----
#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
+ #define MASK_INLINE_ROUND 0x00008000 /* inline round. */
+
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
***************
*** 139,144 ****
--- 141,149 ----
#define TARGET_INLINE_SQRT \
(target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
+ #define TARGET_INLINE_ROUND \
+ (target_flags & MASK_INLINE_ROUND)
+
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
/* If the assembler supports thread-local storage, assume that the
***************
*** 211,216 ****
--- 216,223 ----
N_("Generate inline square root, optimize for latency") }, \
{ "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
N_("Generate inline square root, optimize for throughput") }, \
+ { "inline-round", MASK_INLINE_ROUND, \
+ N_("Generate inline round") }, \
{ "dwarf2-asm", MASK_DWARF2_ASM, \
N_("Enable Dwarf 2 line debug info via GNU as")}, \
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \
diff -c /home/ycq/ia64/ia64.md ia64/ia64.md
*** /home/ycq/ia64/ia64.md Tue Jan 27 09:42:59 2004
--- ia64/ia64.md Mon Feb 2 08:47:25 2004
***************
*** 76,81 ****
--- 76,85 ----
(UNSPEC_RET_ADDR 26)
(UNSPEC_SETF_EXP 27)
(UNSPEC_FR_SQRT_RECIP_APPROX 28)
+ (UNSPEC_ROUNDF 29)
+ (UNSPEC_ROUND 30)
+ (UNSPEC_ROUNDL 31)
+ (UNSPEC_SETF_SIG 32)
])
(define_constants
***************
*** 958,963 ****
--- 962,974 ----
"fcvt.fx.trunc %0 = %1"
[(set_attr "itanium_class" "fcvtfx")])
+ (define_insn "fix_truncxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (fix: XF (match_operand:XF 1 "fr_register_operand" "f")))]
+ ""
+ "fcvt.fx.trunc %0 = %1"
+ [(set_attr "itanium_class" "fcvtfx")])
+
(define_insn "fix_truncxfdi2_alts"
[(set (match_operand:DI 0 "fr_register_operand" "=f")
(fix:DI (match_operand:XF 1 "fr_register_operand" "f")))
***************
*** 966,971 ****
--- 977,989 ----
"fcvt.fx.trunc.s%2 %0 = %1"
[(set_attr "itanium_class" "fcvtfx")])
+ (define_insn "floatxfxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (float:XF (match_operand:XF 1 "fr_register_operand" "f")))]
+ ""
+ "fcvt.xf %0 = %1"
+ [(set_attr "itanium_class" "fcvtfx")])
+
;; Convert between unsigned integer types and floating point.
(define_insn "floatunsdisf2"
***************
*** 2887,2892 ****
--- 2905,3015 ----
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+ (define_insn "*setf_sig_xf"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (unspec:XF [(match_operand:DI 1 "register_operand" "r")]
+ UNSPEC_SETF_SIG))]
+ ""
+ "setf.sig %0 = %1"
+ [(set_attr "itanium_class" "frfr")])
+
+ ;; Inline round y = round (a)
+ ;;
+ ;; float
+ ;; round (float a)
+ ;; {
+ ;; float y = a;
+ ;; if (abs(a) < (1UL << 63))
+ ;; {
+ ;; if (a < 0)
+ ;; y = y - 0.5;
+ ;; else
+ ;; y = y + 0.5;
+ ;;
+ ;; /* Truncate to integer value. */
+ ;; y = fix (y)
+ ;; /* Convert to floating point value. */
+ ;; y = float (y)
+ ;; }
+ ;; return y;
+ ;; }
+
+ (define_expand "roundsf2"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (unspec:SF [(match_operand:SF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUNDF))]
+ "TARGET_INLINE_ROUND"
+ {
+ rtx insn;
+ insn = gen_roundsf2_internal (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+ })
+
+ (define_insn_and_split "roundsf2_internal"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (unspec: SF [(match_operand:SF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUNDF))
+ (clobber (match_scratch:DI 2 "=r"))
+ (clobber (match_scratch:DI 3 "=r"))
+ (clobber (match_scratch:DI 4 "=r"))
+ (clobber (match_scratch:XF 5 "=f"))
+ (clobber (match_scratch:XF 6 "=f"))
+ (clobber (match_scratch:XF 7 "=f"))
+ (clobber (match_scratch:BI 8 "=c"))
+ (clobber (match_scratch:BI 9 "=c"))]
+ "TARGET_INLINE_ROUND"
+ "#"
+ "reload_completed"
+ [;; f5 = -0.5
+ (set (match_dup 2) (const_int 196606))
+ (set (match_dup 5) (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+
+ ;; f6 = 0.5
+ (set (match_dup 3) (const_int 65534))
+ (set (match_dup 6) (unspec:XF [(match_dup 3)] UNSPEC_SETF_EXP))
+
+ ;; f7 = 2^63 (1UL << 63, i.e. LONG_MAX + 1; called LONG_MAX below)
+ (set (match_dup 4) (match_dup 13))
+ (set (match_dup 7) (unspec:XF [(match_dup 4)] UNSPEC_SETF_SIG))
+
+ ;; if (abs(a) < LONG_MAX) set p8
+ (set (match_dup 10) (abs:XF (match_dup 11)))
+ (set (match_dup 8)
+ (lt:BI (match_dup 10) (match_dup 7)))
+
+ ;; y = a
+ (set (match_dup 10) (match_dup 11))
+
+ ;; if (abs(a) < LONG_MAX && a >= 0) f5 = 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 9)
+ (ge:BI (match_dup 11) (match_dup 12))))
+ (cond_exec (ne (match_dup 9) (const_int 0))
+ (set (match_dup 5) (match_dup 6)))
+
+ ;; if (abs(a) < LONG_MAX) y = a +/- 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (plus:XF (match_dup 10) (match_dup 5))))
+
+ ;; if (abs(a) < LONG_MAX) y = fix(y) (Truncate to integer)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (fix:XF (match_dup 10))))
+
+ ;; if (abs(a) < LONG_MAX) y = float(y) (Convert to floating point value)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (float:XF (match_dup 10))))
+
+ ;; result = (float) y (narrow the XFmode value to SFmode)
+ (set (match_dup 0) (float_truncate:SF (match_dup 10)))]
+ {
+ operands[10] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[11] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ operands[12] = CONST0_RTX (XFmode);
+ operands[13] = GEN_INT (1UL << 63);
+ }
+ [(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
***************
*** 3387,3392 ****
--- 3510,3594 ----
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+ ;; Inline round y = round (a)
+ ;; Refer to "roundsf2"
+
+ (define_expand "rounddf2"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (unspec:DF [(match_operand:DF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUND))]
+ "TARGET_INLINE_ROUND"
+ {
+ rtx insn;
+ insn = gen_rounddf2_internal (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+ })
+
+ (define_insn_and_split "rounddf2_internal"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (unspec: DF [(match_operand:DF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUND))
+ (clobber (match_scratch:DI 2 "=r"))
+ (clobber (match_scratch:DI 3 "=r"))
+ (clobber (match_scratch:DI 4 "=r"))
+ (clobber (match_scratch:XF 5 "=f"))
+ (clobber (match_scratch:XF 6 "=f"))
+ (clobber (match_scratch:XF 7 "=f"))
+ (clobber (match_scratch:BI 8 "=c"))
+ (clobber (match_scratch:BI 9 "=c"))]
+ "TARGET_INLINE_ROUND"
+ "#"
+ "reload_completed"
+ [;; f5 = -0.5
+ (set (match_dup 2) (const_int 196606))
+ (set (match_dup 5) (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+
+ ;; f6 = 0.5
+ (set (match_dup 3) (const_int 65534))
+ (set (match_dup 6) (unspec:XF [(match_dup 3)] UNSPEC_SETF_EXP))
+
+ ;; f7 = 2^63 (1UL << 63, i.e. LONG_MAX + 1; called LONG_MAX below)
+ (set (match_dup 4) (match_dup 13))
+ (set (match_dup 7) (unspec:XF [(match_dup 4)] UNSPEC_SETF_SIG))
+
+ ;; if (abs(a) < LONG_MAX) set p8
+ (set (match_dup 10) (abs:XF (match_dup 11)))
+ (set (match_dup 8)
+ (lt:BI (match_dup 10) (match_dup 7)))
+
+ ;; y = a
+ (set (match_dup 10) (match_dup 11))
+
+ ;; if (abs(a) < LONG_MAX && a >= 0) f5 = 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 9)
+ (ge:BI (match_dup 11) (match_dup 12))))
+ (cond_exec (ne (match_dup 9) (const_int 0))
+ (set (match_dup 5) (match_dup 6)))
+
+ ;; if (abs(a) < LONG_MAX) y = a +/- 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (plus:XF (match_dup 10) (match_dup 5))))
+
+ ;; if (abs(a) < LONG_MAX) y = fix(y) (Truncate to integer)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (fix:XF (match_dup 10))))
+
+ ;; if (abs(a) < LONG_MAX) y = float(y) (Convert to floating point value)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (float:XF (match_dup 10))))
+
+ ;; result = (double) y (narrow the XFmode value to DFmode)
+ (set (match_dup 0) (float_truncate:DF (match_dup 10)))]
+ {
+ operands[10] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[11] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ operands[12] = CONST0_RTX (XFmode);
+ operands[13] = GEN_INT (1UL << 63);
+ }
+ [(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
***************
*** 4074,4079 ****
--- 4276,4357 ----
"frcpa.s%4 %0, %1 = %2, %3"
[(set_attr "itanium_class" "fmisc")
(set_attr "predicable" "no")])
+
+ ;; Inline round y = round (a)
+ ;; Refer to "roundsf2"
+
+ (define_expand "roundxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (unspec:XF [(match_operand:XF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUNDL))]
+ "TARGET_INLINE_ROUND"
+ {
+ rtx insn;
+ insn = gen_roundxf2_internal (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+ })
+
+ (define_insn_and_split "roundxf2_internal"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (unspec: XF [(match_operand:XF 1 "fr_register_operand" "f")]
+ UNSPEC_ROUNDL))
+ (clobber (match_scratch:DI 2 "=r"))
+ (clobber (match_scratch:DI 3 "=r"))
+ (clobber (match_scratch:DI 4 "=r"))
+ (clobber (match_scratch:XF 5 "=f"))
+ (clobber (match_scratch:XF 6 "=f"))
+ (clobber (match_scratch:XF 7 "=f"))
+ (clobber (match_scratch:BI 8 "=c"))
+ (clobber (match_scratch:BI 9 "=c"))]
+ "TARGET_INLINE_ROUND"
+ "#"
+ "reload_completed"
+ [;; f5 = -0.5
+ (set (match_dup 2) (const_int 196606))
+ (set (match_dup 5) (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+
+ ;; f6 = 0.5
+ (set (match_dup 3) (const_int 65534))
+ (set (match_dup 6) (unspec:XF [(match_dup 3)] UNSPEC_SETF_EXP))
+
+ ;; f7 = 2^63 (1UL << 63, i.e. LONG_MAX + 1; called LONG_MAX below)
+ (set (match_dup 4) (match_dup 13))
+ (set (match_dup 7) (unspec:XF [(match_dup 4)] UNSPEC_SETF_SIG))
+
+ ;; if (abs(a) < LONG_MAX) set p8
+ (set (match_dup 10) (abs:XF (match_dup 11)))
+ (set (match_dup 8)
+ (lt:BI (match_dup 10) (match_dup 7)))
+
+ ;; y = a
+ (set (match_dup 0) (match_dup 11))
+
+ ;; if (abs(a) < LONG_MAX && a >= 0) f5 = 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 9)
+ (ge:BI (match_dup 11) (match_dup 12))))
+ (cond_exec (ne (match_dup 9) (const_int 0))
+ (set (match_dup 5) (match_dup 6)))
+
+ ;; if (abs(a) < LONG_MAX) y = a +/- 0.5
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (plus:XF (match_dup 10) (match_dup 5))))
+
+ ;; if (abs(a) < LONG_MAX) y = fix(y) (Truncate to integer)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 10) (fix:XF (match_dup 10))))
+
+ ;; if (abs(a) < LONG_MAX) y = float(y) (Convert to floating point value)
+ (cond_exec (ne (match_dup 8) (const_int 0))
+ (set (match_dup 0) (float:XF (match_dup 10))))]
+ {
+ operands[10] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[11] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ operands[12] = CONST0_RTX (XFmode);
+ operands[13] = GEN_INT (1UL << 63);
+ }
+ [(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
/* Test inline round on IA64.
Use different command line options to compile round.c, run, and then
compare the results.
gcc round.c -lm
gcc round.c -O3 -minline-round */
#include <math.h>
#include <stdio.h>
float
f_round (float orig)
{
return roundf (orig);
}
double
d_round (double orig)
{
return round (orig);
}
long double
l_round (long double orig)
{
return roundl (orig);
}
/* Print the results of f_round, d_round and l_round, one value per
   line, so that the output of two differently compiled builds can be
   compared.  Each wrapper is given, for both signs, a value just above
   a .5 fraction, a value just below it, and a magnitude far above 2^63.

   Declared with a (void) prototype and an explicit return value so the
   exit status is well defined under every C standard.  */
int
main (void)
{
  /* Single precision.  */
  printf ("%f\n", f_round (-7.507));
  printf ("%f\n", f_round (-7.499));
  printf ("%e\n", f_round (-7.012345E37));
  printf ("%f\n", f_round (7.507));
  printf ("%f\n", f_round (7.499));
  printf ("%e\n", f_round (7.012345E37));

  /* Double precision.  */
  printf ("%f\n", d_round (-7.507));
  printf ("%f\n", d_round (-7.499));
  printf ("%e\n", d_round (-7.012345E307));
  printf ("%f\n", d_round (7.507));
  printf ("%f\n", d_round (7.499));
  printf ("%e\n", d_round (7.012345E307));

  /* Extended precision; results are narrowed to double for printing.  */
  printf ("%f\n", (double)l_round (-7.507));
  printf ("%f\n", (double)l_round (-7.499));
  printf ("%e\n", (double)l_round (-7.012345E307));
  printf ("%f\n", (double)l_round (7.507));
  printf ("%f\n", (double)l_round (7.499));
  printf ("%e\n", (double)l_round (7.012345E307));

  return 0;
}