This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Patch to change IA64 division code
- From: Steve Ellcey <sje at cup dot hp dot com>
- To: wilson at specifix dot com
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Tue, 27 Mar 2007 10:21:16 -0700 (PDT)
- Subject: Re: Patch to change IA64 division code
Jim,
Here is a new version of my division change. I got rid of the _a and _b
variants and changed the division sequences to use gen_* calls. This
makes div.md smaller and easier to read. The code sequence generated is
unchanged.
I am still a bit concerned about HARD_REGNO_CALLER_SAVE_MODE. I changed
it back to XFmode and I got no regressions but it still seems like
RFmode is the 'right' mode to use as it will save and restore a register
without losing any information. XFmode will lose two bits of precision.
This should only happen if we save and restore an FP register in the
middle of a division code sequence. A long time ago I thought I saw
that happen but I cannot reproduce it with the current compiler.
Retested with no regressions. OK to check in?
2007-02-27 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
(HARD_REGNO_MODE_OK): Ditto.
(MODES_TIEABLE_P): Ditto.
(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
(CLASS_MAX_NREGS): Ditto.
* config/ia64/ia64.c (ia64_print_operand): Add R format.
(rtx_needs_barrier): Add UNSPEC_NOP_CONVERT case.
* config/ia64/ia64.md (UNSPEC_NOP_CONVERT): New.
(divsf3_internal_thr): Removed.
(divdf3_internal_thr): Removed.
* config/ia64/div.md: New file.
Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h (revision 123090)
+++ config/ia64/ia64.h (working copy)
@@ -642,6 +642,7 @@ while (0)
: PR_REGNO_P (REGNO) && (MODE) == BImode ? 2 \
: PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1 \
+ : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1 \
: FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
@@ -657,7 +658,7 @@ while (0)
: PR_REGNO_P (REGNO) ? \
(MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC \
: GR_REGNO_P (REGNO) ? \
- (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode \
+ (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode && (MODE) != RFmode \
: AR_REGNO_P (REGNO) ? (MODE) == DImode \
: BR_REGNO_P (REGNO) ? (MODE) == DImode \
: 0)
@@ -674,8 +675,8 @@ while (0)
we can't tie it with any other modes. */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2) \
- && ((((MODE1) == XFmode) || ((MODE1) == XCmode)) \
- == (((MODE2) == XFmode) || ((MODE2) == XCmode))) \
+ && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode)) \
+ == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
&& (((MODE1) == BImode) == ((MODE2) == BImode)))
/* Specify the modes required to caller save a given hard regno.
@@ -896,6 +897,7 @@ enum reg_class
#define CLASS_MAX_NREGS(CLASS, MODE) \
((MODE) == BImode && (CLASS) == PR_REGS ? 2 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+ : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
: (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
: (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c (revision 123090)
+++ config/ia64/ia64.c (working copy)
@@ -4503,6 +4503,7 @@ ia64_print_operand_address (FILE * strea
O Append .acq for volatile load.
P Postincrement of a MEM.
Q Append .rel for volatile store.
+ R Print .s .d or nothing for a single, double or no truncation.
S Shift amount for shladd instruction.
T Print an 8-bit sign extended number (K) as a 32-bit unsigned number
for Intel assembler.
@@ -4643,6 +4644,17 @@ ia64_print_operand (FILE * file, rtx x,
fputs(".rel", file);
return;
+ case 'R':
+ if (x == CONST0_RTX (GET_MODE (x)))
+ fputs(".s", file);
+ else if (x == CONST1_RTX (GET_MODE (x)))
+ fputs(".d", file);
+ else if (x == CONST2_RTX (GET_MODE (x)))
+ ;
+ else
+ output_operand_lossage ("invalid %%R value");
+ return;
+
case 'S':
fprintf (file, "%d", exact_log2 (INTVAL (x)));
return;
@@ -5762,6 +5774,7 @@ rtx_needs_barrier (rtx x, struct reg_fla
case UNSPEC_BSP_VALUE:
case UNSPEC_FLUSHRS:
case UNSPEC_BUNDLE_SELECTOR:
+ case UNSPEC_NOP_CONVERT:
break;
case UNSPEC_GR_SPILL:
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md (revision 123090)
+++ config/ia64/ia64.md (working copy)
@@ -81,6 +81,7 @@ (define_constants
(UNSPEC_SHRP 29)
(UNSPEC_COPYSIGN 30)
(UNSPEC_VECT_EXTR 31)
+ (UNSPEC_NOP_CONVERT 32)
(UNSPEC_LDA 40)
(UNSPEC_LDS 41)
(UNSPEC_LDSA 42)
@@ -3108,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divsf3_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (div:SF (match_operand:SF 1 "fr_register_operand" "f")
- (match_operand:SF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:XF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:SF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 7)
- (mult:XF (match_dup 8) (match_dup 3))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 6))
- (match_dup 3)))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_insn "*sqrt_approx"
@@ -3614,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
}
[(set_attr "predicable" "no")])
-(define_insn_and_split "divdf3_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (div:DF (match_operand:DF 1 "fr_register_operand" "f")
- (match_operand:DF 2 "fr_register_operand" "f")))
- (clobber (match_scratch:XF 3 "=&f"))
- (clobber (match_scratch:DF 4 "=f"))
- (clobber (match_scratch:BI 5 "=c"))]
- "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
- (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
- UNSPEC_FR_RECIP_APPROX))
- (use (const_int 0))])
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 10)
- (mult:XF (match_dup 8) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 3)
- (mult:XF (match_dup 3) (match_dup 3)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 6)
- (plus:XF (mult:XF (match_dup 3) (match_dup 6))
- (match_dup 6)))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 9)
- (float_truncate:DF
- (mult:XF (match_dup 7) (match_dup 6))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:DF (match_dup 1)
- (mult:DF (match_dup 2) (match_dup 9))))
- (use (const_int 1))]))
- (cond_exec (ne (match_dup 5) (const_int 0))
- (set (match_dup 0)
- (plus:DF (mult:DF (match_dup 4) (match_dup 0))
- (match_dup 9))))
- ]
-{
- operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
- operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
- operands[10] = CONST1_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; Inline square root.
(define_expand "sqrtdf2"
@@ -6540,3 +6421,5 @@ (define_insn "ip_value"
(include "vect.md")
;; Atomic operations
(include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md (revision 0)
+++ config/ia64/div.md (revision 0)
@@ -0,0 +1,317 @@
+
+;; For the internal conditional math routines:
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; The next operand (4 or 5) is the result when the predicate is false; the one after it (5 or 6) is the floating point status register to use.
+;; The last operand (6 or 7) is the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond - F0 = F2 + F3
+;; subrf3_cond - F0 = F2 - F3
+;; mulrf3_cond - F0 = F2 * F3
+;; nmulrf3_cond - F0 = - (F2 * F3)
+;; m1addrf4_cond - F0 = (F2 * F3) + F4
+;; m1subrf4_cond - F0 = (F2 * F3) - F4
+;; m2addrf4_cond - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "addrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "subrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "mulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; neg-mult operation
+
+(define_insn "nmulrf3_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (neg:RF (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 4 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 5 "const_int_operand" ""))
+ (use (match_operand:SI 6 "const_int_operand" ""))]
+ ""
+ "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "m1addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m1subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (mult:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG"))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "m2addrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (plus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+(define_insn "m2subrf4_cond"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f,f")
+ (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand" "c,c")
+ (const_int 0))
+ (minus:RF
+ (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG,fG")
+ (mult:RF
+ (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG,fG")
+ (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG,fG")))
+ (match_operand:RF 5 "fr_reg_or_0_operand" "0,U")))
+ (use (match_operand:SI 6 "const_int_operand" ""))
+ (use (match_operand:SI 7 "const_int_operand" ""))]
+ ""
+ "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+ [(set_attr "itanium_class" "fmac")
+ (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+;; These conversions should not generate any code but make it possible
+;; for all the instructions used to implement floating point division
+;; to be written for RFmode only and to not have to handle multiple
+;; modes or to have to handle a register in more than one mode.
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "mov_extendrf<mode>"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:RF 0 "fr_register_operand" "")
+ (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+
+(define_insn "mov_trunc<mode>rf"
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "f")]
+ UNSPEC_NOP_CONVERT))]
+ ""
+ "#"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "yes")])
+
+(define_split
+ [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+ (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "")]
+ UNSPEC_NOP_CONVERT))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))]
+{
+ operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "recip_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (div:RF (match_operand:RF 1 "fr_register_operand" "f")
+ (match_operand:RF 2 "fr_register_operand" "f")))
+ (set (match_operand:BI 3 "register_operand" "=c")
+ (unspec:BI [(match_dup 1) (match_dup 2)] UNSPEC_FR_RECIP_APPROX))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "frcpa.s%4 %0, %3 = %1, %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Single precision floating point division (maximum throughput algorithm).
+
+(define_expand "divsf3_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "")
+ (div:SF (match_operand:SF 1 "fr_register_operand" "")
+ (match_operand:SF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_sgl = CONST0_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Empty conversions to put inputs into RFmode. */
+ emit_insn (gen_mov_extendrfsf (a, operands[1]));
+ emit_insn (gen_mov_extendrfsf (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* y2 = y + (y1 * e) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y, y1, e, zero, status1, trunc_off));
+ /* q = single(a * y2) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y2, zero, status1, trunc_sgl));
+ /* r = a - (q * b) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, q, b, zero, status1, trunc_off));
+ /* Q = single (q + (r * y2)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y2, y, status0, trunc_sgl));
+ /* Conversion back into SFmode. */
+ emit_insn (gen_mov_truncsfrf (operands[0], q_res));
+ DONE;
+})
+
+
+;; Double precision floating point division (maximum throughput algorithm).
+
+(define_expand "divdf3_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "")
+ (div:DF (match_operand:DF 1 "fr_register_operand" "")
+ (match_operand:DF 2 "fr_register_operand" "")))]
+ "TARGET_INLINE_FLOAT_DIV"
+{
+ rtx q_res = gen_reg_rtx (RFmode);
+ rtx a = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx y = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx e1 = gen_reg_rtx (RFmode);
+ rtx y2 = gen_reg_rtx (RFmode);
+ rtx e2 = gen_reg_rtx (RFmode);
+ rtx y3 = gen_reg_rtx (RFmode);
+ rtx q = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_dbl = CONST1_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+ /* Empty conversions to put inputs into RFmode */
+ emit_insn (gen_mov_extendrfdf (a, operands[1]));
+ emit_insn (gen_mov_extendrfdf (b, operands[2]));
+ /* y = 1 / b */
+ emit_insn (gen_recip_approx_rf (y, a, b, cond, status0));
+ /* e = 1 - (b * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, b, y, zero, status1, trunc_off));
+ /* y1 = y + (y * e) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, y, e, zero, status1, trunc_off));
+ /* e1 = e * e */
+ emit_insn (gen_mulrf3_cond (e1, cond, e, e, zero, status1, trunc_off));
+ /* y2 = y1 + (y1 * e1) */
+ emit_insn (gen_m2addrf4_cond (y2, cond, y1, y1, e1, zero, status1, trunc_off));
+ /* e2 = e1 * e1 */
+ emit_insn (gen_mulrf3_cond (e2, cond, e1, e1, zero, status1, trunc_off));
+ /* y3 = y2 + (y2 * e2) */
+ emit_insn (gen_m2addrf4_cond (y3, cond, y2, y2, e2, zero, status1, trunc_off));
+ /* q = double (a * y3) */
+ emit_insn (gen_mulrf3_cond (q, cond, a, y3, zero, status1, trunc_dbl));
+ /* r = a - (b * q) */
+ emit_insn (gen_m2subrf4_cond (r, cond, a, b, q, zero, status1, trunc_off));
+ /* Q = double (q + (r * y3)) */
+ emit_insn (gen_m2addrf4_cond (q_res, cond, q, r, y3, y, status0, trunc_dbl));
+ /* Conversion back into DFmode */
+ emit_insn (gen_mov_truncdfrf (operands[0], q_res));
+ DONE;
+})