Inline sqrt for ia64
- From: "Zack Weinberg" <zack at codesourcery dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sun, 26 Oct 2003 01:57:03 -0700
- Subject: Inline sqrt for ia64
This patch is a joint effort by Jeffrey Oldham, Mark Mitchell, and
me to implement inline sqrt for the ia64 back end, following the
algorithm recommended in the ia64 architecture manual.  (Jeffrey
and Mark did the lion's share of the work; I just tweaked things a bit.)
Currently it implements only the throughput-optimized version;
placeholders have been left for the latency-optimized version.
As with division, the choice of whether to perform the
transformation at all, and which algorithm to use, is left to the user.
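(For readers unfamiliar with the technique: frsqrta yields only a
rough initial approximation to 1/sqrt(a), which a short chain of
fused multiply-adds then refines.  A minimal hand-written C sketch
of one Newton-Raphson refinement step, illustrative only and not
code from this patch:

  /* One Newton-Raphson step on y ~= 1/sqrt(a).  Solving
     f(y) = 1/y^2 - a = 0 gives y' = y * (1.5 - 0.5 * a * y * y),
     which roughly doubles the number of correct bits.  */
  static double
  refine_rsqrt (double a, double y)
  {
    return y * (1.5 - 0.5 * a * y * y);
  }

The throughput-optimized sequences below follow the same idea, with
the fma chain arranged to use as few instructions as possible so
that several independent square roots can be in flight at once.)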
I cannot bootstrap on ia64-anything right now, due to a problem
with libstdc++ and another with dwarf2out.c, so I am not checking
this in yet.
zw
2003-10-25 Jeffrey D. Oldham <oldham@codesourcery.com>
Mark Mitchell <mark@codesourcery.com>
Zack Weinberg <zack@codesourcery.com>
* ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
(*sqrt_approx): New instruction pattern for reciprocal square root
approximations (frsqrta).
(*setf_exp_xf): New instruction pattern for loading an exponent
field into a floating-point register (setf.exp).
(*maddxf4_alts_truncsf): New instruction pattern for fused
multiply-add with truncation to SFmode.
(sqrtsf2_internal_thr): New define_insn_and_split implementing
throughput-optimized inline calculation of SFmode square root.
(sqrtdf2_internal_thr): Likewise for DFmode.
(sqrtxf2_internal_thr): Likewise for XFmode.
(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
latency- and throughput-optimized square root algorithms.
* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
New macros.
(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
-minline-sqrt-max-throughput.
* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
and -minline-sqrt-max-throughput are given, warn that the two
options cannot be used together.  If -minline-sqrt-min-latency is
given, warn that this mode is not yet implemented.
(rtx_needs_barrier): Reformat initial comment to obey
72-character width limit. Support UNSPEC_SETF_EXP and
UNSPEC_FR_SQRT_RECIP_APPROX.
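For context, with this patch a user requests the transformation
explicitly; for example (hypothetical invocation):

  gcc -O2 -minline-sqrt-max-throughput -c foo.c

Note the interaction in ia64_override_options below: giving both new
switches warns and clears the throughput flag, after which the
still-set latency flag triggers the "not yet implemented" warning
and is cleared as well, so the combination currently yields no
inline square root at all.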
===================================================================
Index: config/ia64/ia64.c
--- config/ia64/ia64.c 25 Oct 2003 02:03:39 -0000 1.256
+++ config/ia64/ia64.c 26 Oct 2003 08:46:37 -0000
@@ -4487,6 +4487,18 @@ ia64_override_options (void)
target_flags &= ~MASK_INLINE_INT_DIV_THR;
}
+ if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
+ {
+ warning ("cannot optimize square root for both latency and throughput");
+ target_flags &= ~MASK_INLINE_SQRT_THR;
+ }
+
+ if (TARGET_INLINE_SQRT_LAT)
+ {
+ warning ("not yet implemented: latency-optimized inline square root");
+ target_flags &= ~MASK_INLINE_SQRT_LAT;
+ }
+
if (ia64_fixed_range_string)
fix_range (ia64_fixed_range_string);
@@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg
return need_barrier;
}
-/* Handle an access to rtx X of type FLAGS using predicate register PRED.
- Return 1 is this access creates a dependency with an earlier instruction
- in the same group. */
+/* Handle an access to rtx X of type FLAGS using predicate register
+ PRED. Return 1 if this access creates a dependency with an earlier
+ instruction in the same group. */
static int
rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_fla
case UNSPEC_FR_SPILL:
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
+ case UNSPEC_SETF_EXP:
case UNSPEC_ADDP4:
+ case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;
===================================================================
Index: config/ia64/ia64.h
--- config/ia64/ia64.h 25 Oct 2003 02:03:39 -0000 1.159
+++ config/ia64/ia64.h 26 Oct 2003 08:46:37 -0000
@@ -87,6 +87,10 @@ extern int target_flags;
#define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput. */
+#define MASK_INLINE_SQRT_LAT 0x00002000 /* inline sqrt, min latency. */
+
+#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
+
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
@@ -127,6 +131,13 @@ extern int target_flags;
#define TARGET_INLINE_INT_DIV \
(target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
+#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
+
+#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
+
+#define TARGET_INLINE_SQRT \
+ (target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
+
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
extern int ia64_tls_size;
@@ -186,6 +197,10 @@ extern int ia64_tls_size;
N_("Generate inline integer division, optimize for latency") }, \
{ "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR, \
N_("Generate inline integer division, optimize for throughput") },\
+ { "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT, \
+ N_("Generate inline square root, optimize for latency") }, \
+ { "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
+ N_("Generate inline square root, optimize for throughput") }, \
{ "dwarf2-asm", MASK_DWARF2_ASM, \
N_("Enable Dwarf 2 line debug info via GNU as")}, \
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \
===================================================================
Index: config/ia64/ia64.md
--- config/ia64/ia64.md 25 Oct 2003 02:03:40 -0000 1.115
+++ config/ia64/ia64.md 26 Oct 2003 08:46:37 -0000
@@ -74,6 +74,8 @@
(UNSPEC_ADDP4 24)
(UNSPEC_PROLOGUE_USE 25)
(UNSPEC_RET_ADDR 26)
+ (UNSPEC_SETF_EXP 27)
+ (UNSPEC_FR_SQRT_RECIP_APPROX 28)
])
(define_constants
@@ -2757,6 +2759,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_insn "*sqrt_approx"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (div:XF (const_int 1)
+ (sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
+ (set (match_operand:BI 1 "register_operand" "=c")
+ (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (match_operand:SI 3 "const_int_operand" "")) ]
+ ""
+ "frsqrta.s%3 %0, %1 = %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+(define_insn "*setf_exp_xf"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (unspec:XF [(match_operand:DI 1 "register_operand" "r")]
+ UNSPEC_SETF_EXP))]
+ ""
+ "setf.exp %0 = %1"
+ [(set_attr "itanium_class" "frfr")])
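(The reason this pattern matters for sqrt is the constant it is fed:
setf.exp copies a general register into the 17-bit exponent field of
the register-file format, whose bias is 65535, and forces the
significand to 1.0, so loading 65534 produces 2^(65534 - 65535) =
+1/2, the constant every splitter below needs.  A stand-alone C
check of the same arithmetic, purely illustrative:

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    int e = 65534;                      /* value the splitters load into r2 */
    double v = ldexp (1.0, e - 65535);  /* 1.0 * 2^(e - bias) */
    printf ("%g\n", v);                 /* prints 0.5 */
    return 0;
  }
)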
+
+(define_expand "sqrtsf2"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtsf2_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; d' = d + 1/2 * d in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 5))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; e = d + d * d' in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 3))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; S1 = S0 + e * S0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 7))))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H1 = H0 + e * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; d1 = a - S1 * S1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; S = S1 + d1 * H1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
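(In the normal case frsqrta sets the predicate and the cond_execs
above run; for special operands such as zeros, infinities, and NaNs
the hardware already delivers the final IEEE result and clears the
predicate, so the refinement is skipped.  A rough scalar model of
the ten steps, using double in place of the 82-bit register format
and an exact 1/sqrt in place of frsqrta's roughly 8.8-bit
approximation; each step is a single fused multiply-add on the real
hardware, which plain C cannot express:

  #include <math.h>

  float
  sqrtf_thr_model (float a)
  {
    double half = 0.5;                    /* setf.exp of 65534 */
    double y0 = 1.0 / sqrt ((double) a);  /* Step 1: frsqrta */
    double H0 = half * y0;                /* Step 2 */
    double S0 = a * y0;                   /* Step 3 */
    double d  = half - S0 * H0;           /* Step 4 */
    double dp = d + half * d;             /* Step 5: d' */
    double e  = d + d * dp;               /* Step 6 */
    float  S1 = (float) (S0 + e * S0);    /* Step 7: fma.s.s truncates */
    double H1 = H0 + e * H0;              /* Step 8 */
    double d1 = (double) a - (double) S1 * S1;  /* Step 9 */
    return (float) (S1 + d1 * H1);        /* Step 10 */
  }
)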
;; ::::::::::::::::::::
;; ::
@@ -3102,6 +3253,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtdf2"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtdf2_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f10
+ (set (match_dup 5)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; G0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; r0 = 1/2 - G0 * H0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + r0 * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; G1 = G0 + r0 * G0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; r1 = 1/2 - G1 * H1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + r1 * H1 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; G2 = G1 + r1 * G1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = a - G2 * G2 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; G3 = G2 + d2 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; d3 = a - G3 * G3 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; S = G3 + d3 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:DF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
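(The DFmode sequence is the Goldschmidt-style coupled iteration from
the optimization guide: G converges to sqrt(a) while H converges to
1/(2*sqrt(a)), and each r = 1/2 - G*H measures the remaining error.
The same kind of scalar model, with long double standing in for the
82-bit format and fma fusion again ignored:

  #include <math.h>

  double
  sqrt_thr_model (double a)
  {
    long double half = 0.5L;
    long double y0 = 1.0L / sqrtl ((long double) a);  /* Step 1: frsqrta */
    long double H0 = half * y0;                       /* Step 2 */
    long double G0 = a * y0;                          /* Step 3 */
    long double r0 = half - G0 * H0;                  /* Step 4 */
    long double H1 = H0 + r0 * H0;                    /* Step 5 */
    long double G1 = G0 + r0 * G0;                    /* Step 6 */
    long double r1 = half - G1 * H1;                  /* Step 7 */
    long double H2 = H1 + r1 * H1;                    /* Step 8 */
    long double G2 = G1 + r1 * G1;                    /* Step 9 */
    long double d2 = a - G2 * G2;                     /* Step 10 */
    long double G3 = G2 + d2 * H2;                    /* Step 11 */
    long double d3 = a - G3 * G3;                     /* Step 12 */
    return (double) (G3 + d3 * H2);                   /* Step 13 */
  }
)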
;; ::::::::::::::::::::
;; ::
@@ -3292,6 +3592,17 @@
"fma.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
+(define_insn "*maddxf4_alts_truncsf"
+ [(set (match_operand:SF 0 "fr_register_operand" "=f")
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
+ (match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
+ (match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "fma.s.s%4 %0 = %F1, %F2, %F3"
+ [(set_attr "itanium_class" "fmac")])
+
(define_insn "*maddxf4_alts_truncdf"
[(set (match_operand:DF 0 "fr_register_operand" "=f")
(float_truncate:DF
@@ -3589,6 +3900,170 @@
(match_dup 3))))
]
"operands[6] = CONST1_RTX (XFmode);"
+ [(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtxf2_internal_thr"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register f11 in optimization guide
+ (clobber (match_scratch:XF 6 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 7 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8. The Intel manual mistakenly specifies f10.
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 8)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 9))))
+ (set (match_dup 7)
+ (unspec:BI [(match_dup 9)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 9) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d0 = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + d0 * H0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; S1 = S0 + d0 * S0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; d1 = 1/2 - S1 * H1 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + d1 * H1 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; S2 = S1 + d1 * S1 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = 1/2 - S2 * H2 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; e2 = a - S2 * S2 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; S3 = S2 + e2 * H2 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; H3 = H2 + d2 * H2 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 14
+ ;; e3 = a - S3 * S3 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 15
+ ;; S = S3 + e3 * H3 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[10] = CONST0_RTX (XFmode);
+}
[(set_attr "predicable" "no")])
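(The XFmode variant needs the most refinement, since the full 64-bit
significand must be correct: it runs the coupled S/H iteration
further and finishes with two a - S*S correction steps, and the
final fma writes the XFmode result directly, with no truncation.
Scalar model as before, illustrative only:

  #include <math.h>

  long double
  sqrtxf_thr_model (long double a)
  {
    long double half = 0.5L;
    long double y0 = 1.0L / sqrtl (a);   /* Step 1: frsqrta */
    long double H0 = half * y0;          /* Step 2 */
    long double S0 = a * y0;             /* Step 3 */
    long double d0 = half - S0 * H0;     /* Step 4 */
    long double H1 = H0 + d0 * H0;       /* Step 5 */
    long double S1 = S0 + d0 * S0;       /* Step 6 */
    long double d1 = half - S1 * H1;     /* Step 7 */
    long double H2 = H1 + d1 * H1;       /* Step 8 */
    long double S2 = S1 + d1 * S1;       /* Step 9 */
    long double d2 = half - S2 * H2;     /* Step 10 */
    long double e2 = a - S2 * S2;        /* Step 11 */
    long double S3 = S2 + e2 * H2;       /* Step 12 */
    long double H3 = H2 + d2 * H2;       /* Step 13 */
    long double e3 = a - S3 * S3;        /* Step 14 */
    return S3 + e3 * H3;                 /* Step 15 */
  }
)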
;; ??? frcpa works like cmp.foo.unc.