This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Patch to change IA64 division code
- From: Steve Ellcey <sje at cup dot hp dot com>
- To: gcc-patches at gcc dot gnu dot org, wilson at specifixinc dot com
- Date: Wed, 21 Feb 2007 09:16:55 -0800 (PST)
- Subject: Patch to change IA64 division code
- Reply-to: sje at cup dot hp dot com
The current implementation of floating point division on IA64 is
sub-optimal in that it is done with a post-reload split that doesn't
allow for very good instruction scheduling. This patch allows for the
expansion of the division sequence earlier and thus allows for better
instruction scheduling. I did some testing with SPEC2000 fp and got
some improvements with this change. Ignoring art, which had a lot of
variation (both good and bad) when I ran it, I got a less than 1%
slowdown in 3 tests and a less than 1% speed up in 7 tests. 173.applu
sped up by a little over 1%, 200.sixtrack sped up by a little less than
4% and 301.apsi sped up by a little more than 4%. I got similar results
on HP-UX in LP64 mode and I got slightly better results on HP-UX in
ILP32 mode.
I did one SPEC2006 fp run on HP-UX in LP64 mode and had 3 tests with
less than 1% slowdown and 6 with less than 1% speed up. bwaves sped up
6%, lbm by 4%, and zeusmp, gromacs, wrf, and sphinx3 sped up by 1 to
2%.
This patch only converts the maximum throughput versions of single and
double precision floating point division. If it is approved, I will
commit to converting the minimum latency versions to this same setup as
well, but I would like to get this patch approved as an intermediate step
and to make sure the overall approach is acceptable.
Tested on IA64 HP-UX and Linux with no regressions.
OK to checkin?
Steve Ellcey
sje@cup.hp.com
2007-02-21 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
(HARD_REGNO_MODE_OK): Ditto.
(MODES_TIEABLE_P): Ditto.
(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
(CLASS_MAX_NREGS): Ditto.
* config/ia64/ia64.c (ia64_print_operand): Add R format.
(rtx_needs_barrier): Add UNSPEC_NOP_CONVERT case.
* config/ia64/ia64.md (UNSPEC_NOP_CONVERT): New.
(divsf3_internal_thr): Removed.
(divdf3_internal_thr): Removed.
* config/ia64/div.md: New file.
Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h (revision 122189)
+++ config/ia64/ia64.h (working copy)
@@ -1,5 +1,5 @@
/* Definitions of target machine GNU compiler. IA-64 version.
- Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
Free Software Foundation, Inc.
Contributed by James E. Wilson <wilson@cygnus.com> and
David Mosberger <davidm@hpl.hp.com>.
@@ -642,6 +642,7 @@ while (0)
: PR_REGNO_P (REGNO) && (MODE) == BImode ? 2 \
: PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1 \
+ : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
@@ -657,7 +658,7 @@ while (0)
: PR_REGNO_P (REGNO) ? \
(MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC \
: GR_REGNO_P (REGNO) ? \
- (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode \
+ (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode && (MODE) != RFmode \
: AR_REGNO_P (REGNO) ? (MODE) == DImode \
: BR_REGNO_P (REGNO) ? (MODE) == DImode \
: 0)
@@ -674,15 +675,15 @@ while (0)
we can't tie it with any other modes. */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2) \
- && ((((MODE1) == XFmode) || ((MODE1) == XCmode)) \
- == (((MODE2) == XFmode) || ((MODE2) == XCmode))) \
+ && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode)) \
+ == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
&& (((MODE1) == BImode) == ((MODE2) == BImode)))
/* Specify the modes required to caller save a given hard regno.
We need to ensure floating pt regs are not saved as DImode. */
#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
- ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? XFmode \
+ ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? RFmode \
: choose_hard_reg_mode ((REGNO), (NREGS), false))
/* Handling Leaf Functions */
@@ -896,6 +897,7 @@ enum reg_class
#define CLASS_MAX_NREGS(CLASS, MODE) \
((MODE) == BImode && (CLASS) == PR_REGS ? 2 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+ : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c (revision 122189)
+++ config/ia64/ia64.c (working copy)
@@ -1,5 +1,5 @@
/* Definitions of target machine for GNU compiler.
- Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
Free Software Foundation, Inc.
Contributed by James E. Wilson <wilson@cygnus.com> and
David Mosberger <davidm@hpl.hp.com>.
@@ -4508,6 +4508,7 @@ ia64_print_operand_address (FILE * strea
O Append .acq for volatile load.
P Postincrement of a MEM.
Q Append .rel for volatile store.
+ R Print .s .d or nothing for a single, double or no truncation.
S Shift amount for shladd instruction.
T Print an 8-bit sign extended number (K) as a 32-bit unsigned number
for Intel assembler.
@@ -4648,6 +4649,17 @@ ia64_print_operand (FILE * file, rtx x,
fputs(".rel", file);
return;
+ case 'R':
+ if (x == CONST0_RTX (GET_MODE (x)))
+ fputs(".s", file);
+ else if (x == CONST1_RTX (GET_MODE (x)))
+ fputs(".d", file);
+ else if (x == CONST2_RTX (GET_MODE (x)))
+ ;
+ else
+ output_operand_lossage ("invalid %%R value");
+ return;
+
case 'S':
fprintf (file, "%d", exact_log2 (INTVAL (x)));
return;
@@ -5793,6 +5805,7 @@ rtx_needs_barrier (rtx x, struct reg_fla
case UNSPEC_LDSA:
case UNSPEC_CHKACLR:
case UNSPEC_CHKS:
+ case UNSPEC_NOP_CONVERT:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md (revision 122189)
+++ config/ia64/ia64.md (working copy)
@@ -81,6 +81,7 @@ (define_constants
(UNSPEC_SHRP 29)
(UNSPEC_COPYSIGN 30)
(UNSPEC_VECT_EXTR 31)
+ (UNSPEC_NOP_CONVERT 32)
(UNSPEC_LDA 40)
(UNSPEC_LDS 41)
(UNSPEC_LDSA 42)
@@ -3108,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divsf3_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (div:SF (match_operand:SF 1 "fr_register_operand" "f")
- (match_operand:SF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:XF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:SF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 7)
- (mult:XF (match_dup 8) (match_dup 3))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 6))
- (match_dup 3)))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_insn "*sqrt_approx"
@@ -3614,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divdf3_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (div:DF (match_operand:DF 1 "fr_register_operand" "f")
- (match_operand:DF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:DF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:DF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:DF (match_dup 1)
- (mult:DF (match_dup 2) (match_dup 9))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (plus:DF (mult:DF (match_dup 4) (match_dup 0))
- (match_dup 9))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_expand "sqrtdf2"
@@ -6540,3 +6421,5 @@ (define_insn "ip_value"
(include "vect.md")
;; Atomic operations
(include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md (revision 0)
+++ config/ia64/div.md (revision 0)
@@ -0,0 +1,575 @@
+;; IA-64 machine description for inline division operations.
+;; Copyright (C) 2007
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+
+;; For the internal conditional math routines:
+
+;; _a versions are when we want output to be op 0 if predicate is false.
+;; _b versions are when we don't care about output if predicate is false.
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; _a versions then take one extra operand, tied to operand 0 (the old value).
+;; The last two operands are the floating point status field to use and the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond_[ab] - F0 = F2 + F3
+;; subrf3_cond_[ab] - F0 = F2 - F3
+;; mulrf3_cond_[ab] - F0 = F2 * F3
+;; nmulrf3_cond_[ab] - F0 = - (F2 * F3)
+;; m1addrf4_cond_[ab] - F0 = (F2 * F3) + F4
+;; m1subrf4_cond_[ab] - F0 = (F2 * F3) - F4
+;; m2addrf4_cond_[ab] - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond_[ab] - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "*addrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*addrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; neg-mult operations
+
+(define_insn "*nmulrf3_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 4 "fr_register_operand" "0")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*nmulrf3_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 4 "const_int_operand" ""))
+ (use (match_operand:SI 5 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R5.s%4 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "*m1addrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1addrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R6.s%5 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R6.s%5 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "*m2addrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2addrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R6.s%5 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_a"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (match_operand:RF 5 "fr_register_operand" "0")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_b"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+ (const_int 0)))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R6.s%5 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "*mov_trunc<mode>rf"
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+
+(define_insn "*mov_extendrf<mode>"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+(define_split
+ [(set (match_operand:RF 0 "fr_register_operand" "")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "*recip_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (div:RF (match_operand:RF 2 "fr_register_operand" "f")
+ (match_operand:RF 3 "fr_register_operand" "f")))
+ (set (match_operand:BI 1 "register_operand" "=c")
+ (unspec:BI [(match_dup 2) (match_dup 3)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "frcpa.s%4 %0, %1 = %2, %3"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Single precision floating point division (high throughput)
+;; The algorithm:
+;; y = 1 / b OP3 = 1 / OP5
+;; e = 1 - (b * y) OP6 = OP15 - (OP5 * OP3)
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+;; y2 = y + (y1 * e) OP8 = OP3 + (OP7 * OP6)
+;; q = single(a * y2) OP9 = single(OP4 * OP8)
+;; r = a - (q * b) OP10 = OP4 - (OP9 * OP5)
+;; Q = single (q + (r * y2)) OP3 = single (OP9 + (OP10 * OP8))
+
+(define_expand "divsf3_internal_thr"
+ [
+
+;; Empty conversions to put inputs into RFmode
+
+ (set (match_dup 4)
+ (unspec:RF [(match_operand:SF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+ (set (match_dup 5)
+ (unspec:RF [(match_operand:SF 2 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+
+;; y = 1 / b OP3 = 1 / OP5
+ (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+ (set (match_dup 11) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_dup 12))])
+
+;; e = 1 - (b * y) OP6 = OP15 - (OP5 * OP3)
+ (parallel [(set (match_dup 6)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (minus:RF (match_dup 15)
+ (mult:RF (match_dup 5) (match_dup 3)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+ (parallel [(set (match_dup 7)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 3) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; y2 = y + (y1 * e) OP8 = OP3 + (OP7 * OP6)
+ (parallel [(set (match_dup 8)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 7) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; q = single(a * y2) OP9 = single(OP4 * OP8)
+ (parallel [(set (match_dup 9)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (mult:RF (match_dup 4) (match_dup 8))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 12))])
+;; r = a - (q * b) OP10 = OP4 - (OP9 * OP5)
+ (parallel [(set (match_dup 10)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (minus:RF (match_dup 4)
+ (mult:RF (match_dup 9) (match_dup 5)))
+ (const_int 0)))
+ (use (match_dup 13)) (use (match_dup 14))])
+;; Q = single (q + (r * y2)) OP3 = single (OP9 + (OP10 * OP8))
+ (parallel [(set (match_dup 3)
+ (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+ (plus:RF (match_dup 9)
+ (mult:RF (match_dup 10) (match_dup 8)))
+ (match_dup 3)))
+ (use (match_dup 12)) (use (match_dup 12))])
+
+ (set (match_operand:SF 0 "fr_register_operand" "=f")
+ (unspec:SF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+ ]
+ ""
+{
+ operands[3] = gen_reg_rtx (RFmode);
+ operands[4] = gen_reg_rtx (RFmode);
+ operands[5] = gen_reg_rtx (RFmode);
+ operands[6] = gen_reg_rtx (RFmode);
+ operands[7] = gen_reg_rtx (RFmode);
+ operands[8] = gen_reg_rtx (RFmode);
+ operands[9] = gen_reg_rtx (RFmode);
+ operands[10] = gen_reg_rtx (RFmode);
+ operands[11] = gen_reg_rtx (BImode);
+ operands[12] = CONST0_RTX (SImode);
+ operands[13] = CONST1_RTX (SImode);
+ operands[14] = CONST2_RTX (SImode);
+ operands[15] = CONST1_RTX (RFmode);
+})
+
+
+;; Double precision floating point division (high throughput)
+;; The algorithm:
+;; y = 1 / b OP3 = 1 / OP5
+;; e = 1 - (b * y) OP6 = OP18 - (OP5 * OP3)
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+;; e1 = e * e OP8 = OP6 * OP6
+;; y2 = y1 + (y1 * e1) OP9 = OP7 + (OP7 * OP8)
+;; e2 = e1 * e1 OP10 = OP8 * OP8
+;; y3 = y2 + (y2 * e2) OP11 = OP9 + (OP9 * OP10)
+;; q = double (a * y3) OP12 = double (OP4 * OP11)
+;; r = a - (b * q) OP13 = OP4 - (OP5 * OP12)
+;; Q = double (q + (r * y3)) OP3 = double (OP12 + (OP13 * OP11))
+
+(define_expand "divdf3_internal_thr"
+ [
+
+;; Empty conversions to put inputs into RFmode
+
+ (set (match_dup 4)
+ (unspec:RF [(match_operand:DF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+ (set (match_dup 5)
+ (unspec:RF [(match_operand:DF 2 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))
+
+;; y = 1 / b OP3 = 1 / OP5
+ (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+ (set (match_dup 14) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_dup 15))])
+
+;; e = 1 - (b * y) OP6 = OP18 - (OP5 * OP3)
+ (parallel [(set (match_dup 6)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (minus:RF (match_dup 18)
+ (mult:RF (match_dup 5) (match_dup 3)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y1 = y + (y * e) OP7 = OP3 + (OP3 * OP6)
+ (parallel [(set (match_dup 7)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 3)
+ (mult:RF (match_dup 3) (match_dup 6)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; e1 = e * e OP8 = OP6 * OP6
+ (parallel [(set (match_dup 8)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 6) (match_dup 6))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y2 = y1 + (y1 * e1) OP9 = OP7 + (OP7 * OP8)
+ (parallel [(set (match_dup 9)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 7)
+ (mult:RF (match_dup 7) (match_dup 8)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; e2 = e1 * e1 OP10 = OP8 * OP8
+ (parallel [(set (match_dup 10)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 8) (match_dup 8))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; y3 = y2 + (y2 * e2) OP11 = OP9 + (OP9 * OP10)
+ (parallel [(set (match_dup 11)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 9)
+ (mult:RF (match_dup 9) (match_dup 10)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; q = double (a * y3) OP12 = double (OP4 * OP11)
+ (parallel [(set (match_dup 12)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (mult:RF (match_dup 4) (match_dup 11))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 16))])
+
+;; r = a - (b * q) OP13 = OP4 - (OP5 * OP12)
+ (parallel [(set (match_dup 13)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (minus:RF (match_dup 4)
+ (mult:RF (match_dup 5) (match_dup 12)))
+ (const_int 0)))
+ (use (match_dup 16)) (use (match_dup 17))])
+
+;; Q = double (q + (r * y3)) OP3 = double (OP12 + (OP13 * OP11))
+ (parallel [(set (match_dup 3)
+ (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+ (plus:RF (match_dup 12)
+ (mult:RF (match_dup 13) (match_dup 11)))
+ (match_dup 3)))
+ (use (match_dup 15)) (use (match_dup 16))])
+
+;; Do an 'empty' conversion back to DFmode
+
+ (set (match_operand:DF 0 "fr_register_operand" "=f")
+ (unspec:DF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+ ]
+""
+{
+ operands[3] = gen_reg_rtx (RFmode);
+ operands[4] = gen_reg_rtx (RFmode);
+ operands[5] = gen_reg_rtx (RFmode);
+ operands[6] = gen_reg_rtx (RFmode);
+ operands[7] = gen_reg_rtx (RFmode);
+ operands[8] = gen_reg_rtx (RFmode);
+ operands[9] = gen_reg_rtx (RFmode);
+ operands[10] = gen_reg_rtx (RFmode);
+ operands[11] = gen_reg_rtx (RFmode);
+ operands[12] = gen_reg_rtx (RFmode);
+ operands[13] = gen_reg_rtx (RFmode);
+ operands[14] = gen_reg_rtx (BImode);
+ operands[15] = CONST0_RTX (SImode);
+ operands[16] = CONST1_RTX (SImode);
+ operands[17] = CONST2_RTX (SImode);
+ operands[18] = CONST1_RTX (RFmode);
+})