This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


sh4 soft fp


The normalization in muldf3 posted here:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg00459.html
doesn't work right when the top 32 bits are zero but the
lower 32 bits have to be distributed over two 32-bit words.
When the upper 32 bits are zero, the following strategy should work better:
copy the low word to the high word, shift the low word left by 16, then use a
signed shift count on the high word and that count plus 16 on the low word.
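In plain C, the idea looks roughly like this (just a sketch, not the actual
libgcc code; the shld helper mimics the SH dynamic shift instruction, and the
assumption that the required shift is between 16 and 47 is mine):

#include <stdint.h>

/* Emulate the SH shld dynamic shift: shift left for non-negative counts,
   logical shift right for negative counts.  */
static uint32_t shld (uint32_t x, int n)
{
  return n >= 0 ? x << n : x >> -n;
}

/* hi:lo is a 64-bit fraction whose high word is zero; shift it left by s
   (assumed 16 <= s <= 47), distributing the low word over both result words
   without ever needing a shift count of 32 or more.  */
static void normalize (uint32_t lo, int s, uint32_t *rhi, uint32_t *rlo)
{
  int c = s - 32;            /* signed count for the high word */
  uint32_t hi = lo;          /* copy low word to high word */
  lo <<= 16;                 /* shift low word left by 16 */
  *rhi = shld (hi, c);       /* signed shift count on high word */
  *rlo = shld (lo, c + 16);  /* same count + 16 on the pre-shifted low word */
}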

I've also found a number of smaller bugs and missing comments in the other
code, so here is an updated version of the patches to the old files:

Index: lib1funcs.asm
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/lib1funcs.asm,v
retrieving revision 1.36
diff -p -u -r1.36 lib1funcs.asm
--- lib1funcs.asm	12 Aug 2003 01:25:07 -0000	1.36
+++ lib1funcs.asm	30 Sep 2004 18:59:32 -0000
@@ -37,6 +37,8 @@ Boston, MA 02111-1307, USA.  */
    ELF local label prefixes by J"orn Rennecke
    amylaar@cygnus.com  */
 
+#include "insn-constants.h"
+
 #ifdef __ELF__
 #define LOCAL(X)	.L_##X
 #define FUNC(X)		.type X,@function
@@ -56,6 +58,34 @@ Boston, MA 02111-1307, USA.  */
 #define FMOVD_WORKS
 #endif
 
+#ifdef __sh1__
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+        in_slot, in_slot_arg2; branch dest
+#define SL_(branch, dest, in_slot) \
+        in_slot; branch dest
+#define SLC(branch, dest, in_slot, in_slot_arg2) \
+        branch dest; in_slot, in_slot_arg2
+#define SLI(in_slot, in_slot_arg2) in_slot, in_slot_arg2
+#define SLCMP(branch, cmp1, cmp1arg2, cmp2, cmp2arg2) \
+	branch .+6; bra .+6; cmp2, cmp2arg2; cmp1, cmp1arg2
+#else
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+        branch##/s dest; in_slot, in_slot_arg2
+#define SL_(branch, dest, in_slot) \
+        branch##/s dest; in_slot
+#define SLC(branch, dest, in_slot, in_slot_arg2) \
+        branch##/s dest; in_slot, in_slot_arg2
+#define SLI(in_slot, in_slot_arg)
+#define SLCMP(branch, cmp1, cmp1arg2, cmp2, cmp2arg2) \
+	branch##/s .+6; cmp1, cmp1arg2; cmp2, cmp2arg2
+#endif
+
+#if defined (__sh1__) || defined (__sh2__) || defined (__SH2E__)
+/* don't #define DYN_SHIFT */
+#else
+#define DYN_SHIFT 1
+#endif
+
 #if ! __SH5__
 #ifdef L_ashiftrt
 	.global	GLOBAL(ashiftrt_r4_0)
@@ -2873,3 +2903,1647 @@ GLOBAL(GCC_pop_shmedia_regs_nofpu):
 	ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
 #endif /* __SH5__ == 32 */
 #endif /* L_push_pop_shmedia_regs */
+
+/* Floating-point emulation.  We handle NANs, +-infinity, and +-zero.
+   However, we assume that for NANs, the topmost bit of the fraction is set.  */
+#ifdef L_nesf2
+/* -ffinite-math-only inline version, T := r4:SF == r5:SF
+	cmp/eq	r4,r5
+	mov	r4,r0
+	bt	0f
+	or	r5,r0
+	add	r0,r0
+	tst	r0,r0
+	0:			*/
+	.global GLOBAL(nesf2_)
+	FUNC(GLOBAL(nesf2_))
+GLOBAL(nesf2_):
+        /* If the raw values are unequal, the result is unequal, unless
+	   both values are +-zero.
+	   If the raw values are equal, the result is equal, unless
+	   the values are nan or infinity.  */
+	cmp/eq	r4,r5
+	mov.l   LOCAL(c_SF_NAN_MASK),r1
+	not	r4,r0
+	bt	LOCAL(check_nan)
+	mov	r4,r0
+	or	r5,r0
+	rts
+	add	r0,r0
+LOCAL(check_nan):
+	tst	r1,r0
+	rts
+	movt	r0
+	.balign 4
+LOCAL(c_SF_NAN_MASK):
+	.long SF_NAN_MASK
+	ENDFUNC(GLOBAL(nesf2_))
+#endif /* L_nesf2 */
+
+#ifdef __LITTLE_ENDIAN__
+#define DBL0L r4
+#define DBL0H r5
+#define DBL1L r6
+#define DBL1H r7
+#define DBLRL r0
+#define DBLRH r1
+#else
+#define DBL0L r5
+#define DBL0H r4
+#define DBL1L r7
+#define DBL1H r6
+#define DBLRL r1
+#define DBLRH r0
+#endif
+
+#ifdef L_nedf2
+/* -ffinite-math-only -mb inline version, T := r4:DF == r6:DF
+	cmp/eq	r5,r7
+	mov	r4,r0
+	bf	0f
+	cmp/eq	r4,r6
+	bt	0f
+	or	r6,r0
+	add	r0,r0
+	or	r5,r0
+	tst	r0,r0
+	0:			*/
+	.global GLOBAL(nedf2_)
+	FUNC(GLOBAL(nedf2_))
+GLOBAL(nedf2_):
+	cmp/eq	DBL0L,DBL1L
+	mov.l   LOCAL(c_DF_NAN_MASK),r1
+	bf LOCAL(ne)
+	cmp/eq	DBL0H,DBL1H
+	not	DBL0H,r0
+	bt	LOCAL(check_nan)
+	mov	DBL0H,r0
+	or	DBL1H,r0
+	add	r0,r0
+	rts
+	or	DBL0L,r0
+LOCAL(check_nan):
+	tst	r1,r0
+	rts
+	movt	r0
+LOCAL(ne):
+	rts
+	mov #1,r0
+	.balign 4
+LOCAL(c_DF_NAN_MASK):
+	.long DF_NAN_MASK
+	ENDFUNC(GLOBAL(nedf2_))
+#endif /* L_nedf2 */
+
+#ifdef L_unordsf2
+	.global GLOBAL(unordsf2_)
+	FUNC(GLOBAL(unordsf2_))
+GLOBAL(unordsf2_):
+	mov.l	LOCAL(c_SF_NAN_MASK),r1
+	not	r4,r0
+	tst	r1,r0
+	not	r5,r0
+	bt	LOCAL(unord)
+	tst	r1,r0
+LOCAL(unord):
+	rts
+	movt	r0
+	.balign	4
+LOCAL(c_SF_NAN_MASK):
+	.long SF_NAN_MASK
+	ENDFUNC(GLOBAL(unordsf2_))
+#endif /* L_unordsf2 */
+
+#ifdef L_unorddf2
+	.global GLOBAL(unorddf2_)
+	FUNC(GLOBAL(unorddf2_))
+GLOBAL(unorddf2_):
+	mov.l	LOCAL(c_DF_NAN_MASK),r1
+	not	r4,r0
+	tst	r1,r0
+	not	r6,r0
+	bt	LOCAL(unord)
+	tst	r1,r0
+LOCAL(unord):
+	rts
+	movt	r0
+	.balign	4
+LOCAL(c_DF_NAN_MASK):
+	.long DF_NAN_MASK
+	ENDFUNC(GLOBAL(unorddf2_))
+#endif /* L_unorddf2 */
+
+#if defined(L_gtsf2t) || defined(L_gtsf2t_trap)
+/* -ffinite-math-only inline version, T := r4:SF > r5:SF ? 0 : 1
+	cmp/pz	r4
+	mov	r4,r0
+	bf/s	0f
+	 cmp/hs	r5,r4
+	cmp/ge	r4,r5
+	or	r5,r0
+	bt	0f
+	add	r0,r0
+	tst	r0,r0
+	0:			*/
+#ifdef L_gtsf2t
+#define fun_label GLOBAL(gtsf2t)
+#else
+#define fun_label GLOBAL(gtsf2t_trap)
+#endif
+	.global fun_label
+	FUNC(fun_label)
+fun_label:
+	/* If the raw values compare greater, the result is true, unless
+	   either of them is a NaN (but infinity is fine), or both values are
+	   +- zero.  Otherwise, the result is false.  */
+	mov.l	LOCAL(c_SF_NAN_MASK),r1
+	cmp/pz	r4
+	not	r5,r0
+	SLC(bf,	LOCAL(neg),
+	 tst	r1,r0)
+	mov	r4,r0
+	bt	LOCAL(nan)
+	cmp/gt	r5,r4
+	SLC(bf,	LOCAL(check_nan),
+	 cmp/gt	r4,r1)
+	bf	LOCAL(nan)
+	or	r5,r0
+	rts
+	add	r0,r0
+LOCAL(neg):
+	SLI(tst	r1,r0)
+	bt	LOCAL(nan)
+	not	r4,r0
+	tst	r1,r0
+	bt	LOCAL(nan)
+	cmp/hi	r4,r5
+#if defined(L_gtsf2t) && defined(DELAYED_BRANCHES)
+LOCAL(check_nan):
+#endif /* DELAYED_BRANCHES */
+	rts
+	movt	r0
+#ifdef L_gtsf2t
+LOCAL(check_nan):
+LOCAL(nan):
+	rts
+	mov	#0,r0
+#else /* ! L_gtsf2t */
+LOCAL(check_nan):
+	SLI(cmp/gt	r4,r1)
+	bf	LOCAL(nan)
+	rts
+	movt	r0
+LOCAL(nan):
+	mov	#0,r0
+	trapa	#0
+#endif /* ! L_gtsf2t */
+	.balign	4
+LOCAL(c_SF_NAN_MASK):
+	.long SF_NAN_MASK
+	ENDFUNC(fun_label)
+#endif /* defined(L_gtsf2t) || defined(L_gtsf2t_trap) */
+
+#if defined(L_gtdf2t) || defined(L_gtdf2t_trap)
+#ifdef L_gtdf2t
+#define fun_label GLOBAL(gtdf2t)
+#else
+#define fun_label GLOBAL(gtdf2t_trap)
+#endif
+	.global fun_label
+	FUNC(fun_label)
+fun_label:
+	/* If the raw values compare greater, the result is true, unless
+	   either of them is a NaN (but infinity is fine), or both values are
+	   +- zero.  Otherwise, the result is false.  */
+	mov.l	LOCAL(c_DF_NAN_MASK),r1
+	cmp/pz	DBL0H
+	not	DBL1H,r0
+	SLC(bf,	LOCAL(neg),
+	 tst	r1,r0)
+	mov	DBL0H,r0
+	bt	LOCAL(nan) /* return zero if DBL1 is NAN.  */
+	cmp/eq	DBL1H,DBL0H
+	bt	LOCAL(cmp_low)
+	cmp/gt	DBL1H,DBL0H
+	or	DBL1H,r0
+	SLC(bf,	LOCAL(check_nan),
+	 cmp/gt	DBL0H,r1)
+	add	r0,r0
+	bf	LOCAL(nan) /* return zero if DBL0 is NAN.  */
+	or	DBL0L,r0
+	rts
+	or	DBL1L,r0 /* non-zero unless both DBL0 and DBL1 are +-zero.  */
+LOCAL(cmp_low):
+	cmp/hi	DBL1L,DBL0L
+	rts
+	movt	r0
+LOCAL(neg):
+	SLI(tst	r1,r0)
+	bt	LOCAL(nan) /* return zero if DBL1 is NAN.  */
+	cmp/eq	DBL1H,DBL0H
+	SLC(bt,	LOCAL(neg_cmp_low),
+	 cmp/hi	DBL0L,DBL1L)
+	not	r4,r0
+	tst	r1,r0
+	bt	LOCAL(nan) /* return zero if DBL0 is NAN.  */
+	cmp/hi	DBL0H,DBL1H
+	SLI(rts	!,)
+	SLI(movt r0 !,)
+LOCAL(neg_cmp_low):
+	SLI(cmp/hi	DBL0L,DBL1L)
+	rts
+	movt	r0
+LOCAL(check_nan):
+#ifdef L_gtdf2t
+LOCAL(nan):
+	rts
+	mov	#0,r0
+#else
+	SLI(cmp/gt DBL0H,r1)
+	bf	LOCAL(nan) /* return zero if DBL0 is NAN.  */
+	rts
+	mov	#0,r0
+LOCAL(nan):
+	mov	#0,r0
+	trapa	#0
+#endif
+	.balign	4
+LOCAL(c_DF_NAN_MASK):
+	.long DF_NAN_MASK
+	ENDFUNC(fun_label)
+#endif /* defined(L_gtdf2t) || defined(L_gtdf2t_trap) */
+
+#if defined(L_gesf2f) || defined(L_gesf2f_trap)
+/* -ffinite-math-only inline version, T := r4:SF >= r5:SF
+	cmp/pz	r5
+	mov	r4,r0
+	bf/s	0f
+	 cmp/hs	r4,r5
+	cmp/ge	r5,r4
+	or	r5,r0
+	bt	0f
+	add	r0,r0
+	tst	r0,r0
+	0:			*/
+#ifdef L_gesf2f
+#define fun_label GLOBAL(gesf2f)
+#else
+#define fun_label GLOBAL(gesf2f_trap)
+#endif
+	.global fun_label
+	FUNC(fun_label)
+fun_label:
+	/* If the raw values compare greater or equal, the result is
+	   true, unless any of them is a nan.  If both are -+zero, the
+	   result is true; otherwise, it is false.
+	   We use 0 as true and nonzero as false for this function.  */
+	mov.l	LOCAL(c_SF_NAN_MASK),r1
+	cmp/pz	r5
+	not	r4,r0
+	SLC(bf,	LOCAL(neg),
+	 tst	r1,r0)
+	mov	r4,r0
+	bt	LOCAL(nan)
+	cmp/gt	r4,r5
+	SLC(bf,	LOCAL(check_nan),
+	 cmp/ge	r1,r5)
+	bt	LOCAL(nan)
+	or	r5,r0
+	rts
+	add	r0,r0
+LOCAL(neg):
+	SLI(tst	r1,r0)
+	bt	LOCAL(nan)
+	not	r5,r0
+	tst	r1,r0
+	bt	LOCAL(nan)
+	cmp/hi	r5,r4
+#if defined(L_gesf2f) && defined(DELAYED_BRANCHES)
+LOCAL(nan): LOCAL(check_nan):
+#endif
+	rts
+	movt	r0
+#if defined(L_gesf2f) && ! defined(DELAYED_BRANCHES)
+LOCAL(check_nan):
+	cmp/ge	r1,r5
+LOCAL(nan):
+	rts
+	movt	r0
+#endif /* ! DELAYED_BRANCHES */
+#ifdef L_gesf2f_trap
+LOCAL(check_nan):
+	SLI(cmp/ge	r1,r5)
+	bt	LOCAL(nan)
+	rts
+LOCAL(nan):
+	movt	r0
+	trapa	#0
+#endif /* L_gesf2f_trap */
+	.balign	4
+LOCAL(c_SF_NAN_MASK):
+	.long SF_NAN_MASK
+	ENDFUNC(fun_label)
+#endif /* defined(L_gesf2f) || defined(L_gesf2f_trap) */
+
+#ifdef L_gedf2f
+	.global GLOBAL(gedf2f)
+	FUNC(GLOBAL(gedf2f))
+GLOBAL(gedf2f):
+	/* If the raw values compare greater or equal, the result is
+	   true, unless any of them is a nan, or both are the
+	   same infinity.  If both are -+zero, the result is true;
+	   otherwise, it is false.
+	   We use 0 as true and nonzero as false for this function.  */
+	mov.l	LOCAL(c_DF_NAN_MASK),r1
+	cmp/pz	DBL1H
+	not	DBL0H,r0
+	SLC(bf,	LOCAL(neg),
+	 tst	r1,r0)
+	mov	DBL0H,r0
+	bt	LOCAL(nan)
+	cmp/eq	DBL0H,DBL1H
+	bt	LOCAL(cmp_low)
+	cmp/gt	DBL0H,DBL1H
+	or	DBL1H,r0
+	SLC(bf,	LOCAL(check_nan),
+	 cmp/ge	r1,DBL1H)
+	add	r0,r0
+	bt	LOCAL(nan)
+	or	DBL0L,r0
+	rts
+	or	DBL1L,r0
+LOCAL(cmp_low):
+	cmp/hi	DBL0L,DBL1L
+#if defined(L_gedf2f) && defined(DELAYED_BRANCHES)
+LOCAL(nan): LOCAL(check_nan):
+#endif
+	rts
+	movt	r0
+#if defined(L_gedf2f) && ! defined(DELAYED_BRANCHES)
+LOCAL(check_nan):
+	SLI(cmp/ge	r1,DBL1H)
+LOCAL(nan):
+	rts
+	movt	r0
+#elif defined(L_gedf2f_trap)
+LOCAL(check_nan):
+	SLI(cmp/ge	r1,DBL1H)
+	bt	LOCAL(nan)
+	rts
+LOCAL(nan):
+	movt	r0
+	trapa	#0
+#endif /* L_gedf2f_trap */
+LOCAL(neg):
+	SLI(tst	r1,r0)
+	bt	LOCAL(nan)
+	cmp/eq	DBL0H,DBL1H
+	not	DBL1H,r0
+	SLC(bt,	LOCAL(neg_cmp_low),
+	 cmp/hi	DBL1L,DBL0L)
+	tst	r1,r0
+	bt	LOCAL(nan)
+	cmp/hi	DBL1H,DBL0H
+	SLI(rts !,)
+	SLI(movt	r0 !,)
+LOCAL(neg_cmp_low):
+	SLI(cmp/hi	DBL1L,DBL0L)
+	rts
+	movt	r0
+	.balign	4
+LOCAL(c_DF_NAN_MASK):
+	.long DF_NAN_MASK
+	ENDFUNC(GLOBAL(gedf2f))
+#endif /* L_gedf2f */
+
+#ifndef DYN_SHIFT /* Basic conversions for SH1 / SH2  */
+#ifdef L_extendsfdf2
+	.global GLOBAL(extendsfdf2_)
+	FUNC(GLOBAL(extendsfdf2_))
+GLOBAL(extendsfdf2_):
+	mov.l	LOCAL(x7f800000),r3
+	mov	r4,DBLRL
+	tst	r3,r4
+	bt	LOCAL(zero_denorm)
+	mov.l	LOCAL(xe0000000),r2
+	rotr	DBLRL
+	rotr	DBLRL
+	rotr	DBLRL
+	and	r2,DBLRL
+	mov	r4,DBLRH
+	not	r4,r2
+	shll	DBLRH
+	shlr2	DBLRH
+	shlr2	DBLRH
+	add	DBLRH,DBLRH
+	rotcr	DBLRH
+	tst	r3,r2
+	bt	LOCAL(inf_nan)
+	mov.l	LOCAL(x38000000),r2
+	rts
+	add	r2,DBLRH
+LOCAL(inf_nan):
+	mov.l	LOCAL(x70000000),r2
+	rts
+	add	r2,DBLRH
+LOCAL(zero_denorm):
+	mov.l	r4,@-r15
+	add	r4,r4
+	tst	r4,r4
+	bt	LOCAL(zero)
+	add	r3,r3	/* 0xff000000 */
+	mov.l	LOCAL(xb8000009),r2
+LOCAL(shift_byte):
+	tst	r3,r4
+	shll8	r4
+	SL(bt,	LOCAL(shift_byte),
+	 add	#-8,r2)
+LOCAL(shift_bit):
+	shll	r4
+	SL(bf,	LOCAL(shift_bit),
+	 add	#-1,r2)
+	mov	r4,DBLRH
+	mov.l	@r15+,r4
+	shlr8	DBLRH
+	shlr2	DBLRH
+	shlr	DBLRH
+	rotcr	DBLRL
+	cmp/pz	r4
+	rotcr	DBLRH
+	rotcr	DBLRL
+	rts
+	add	r2,DBLRH
+LOCAL(zero):
+	mov.l	@r15+,DBLRH
+	rts
+	mov	#0,DBLRL
+	.balign	4
+LOCAL(x7f800000):
+	.long	0x7f800000
+LOCAL(x38000000):
+	.long	0x38000000
+LOCAL(xe0000000):
+	.long	0xe0000000
+LOCAL(x70000000):
+	.long	0x70000000
+LOCAL(xb8000009):
+	/* Flip sign back, do exponent adjustment, and compensate for -8 / -1
+	   adjustments in first shift loop iterations.  */
+	.long 0x80000000 + 0x38000000 + 9
+	ENDFUNC(GLOBAL(extendsfdf2_))
+#endif /* L_extendsfdf2 */
+
+#ifdef L_truncdfsf2
+	.global GLOBAL(truncdfsf2_)
+	FUNC(GLOBAL(truncdfsf2_))
+GLOBAL(truncdfsf2_):
+	mov.l	LOCAL(x38000000),r3	! exponent adjustment DF -> SF
+	mov	DBL0H,r1
+	mov.l	LOCAL(x70000000),r2	! mask for out-of-range exponent bits
+	mov	DBL0H,r0
+	mov.l	DBL0L,@-r15
+	sub	r3,r1
+	tst	r2,r1
+	shll8	r0			!
+	shll2	r0			! Isolate highpart fraction.
+	shll2	r0			!
+	bf	LOCAL(ill_exp)
+	shll2	r1
+	mov.l	LOCAL(x2fffffff),r2 /* Fraction lsb | lower guard bits.  */
+	shll2	r1
+	mov.l	LOCAL(xff000000),r3
+	shlr8	r0
+	tst	r2,DBL0L /* Check if msb guard bit wants rounding up.  */
+	shlr16	DBL0L
+	shlr8	DBL0L
+	shlr2	DBL0L
+	SL_(bt,	LOCAL(add_frac),
+	 shlr2	DBL0L)
+	add	#1,DBL0L
+LOCAL(add_frac):
+	add	DBL0L,r0
+	mov.l	LOCAL(x01000000),r2
+	and	r3,r1
+	mov.l	@r15+,DBL0L
+	add	r1,r0
+	tst	r3,r0
+	bt	LOCAL(inf_denorm0)
+	cmp/hs	r3,r0
+LOCAL(denorm_noup_sh1):
+	bt	LOCAL(inf)
+	div0s	DBL0H,r2	/* copy orig. sign into T.  */
+	rts
+	rotcr	r0
+LOCAL(inf_denorm0):	!  We might need to undo previous rounding.
+	mov.l	LOCAL(x2fffffff),r3 /* Old fraction lsb | lower guard bits.  */
+	tst	r1,r1
+	bf	LOCAL(inf)
+	add	#-1,r0
+	tst	r3,DBL0L /* Check if msb guard bit was rounded up.  */
+	mov.l	LOCAL(x5fffffff),r3 /* Fraction lsb | lower guard bits.  */
+	addc	r2,r0
+	shlr	r0
+	tst	r3,DBL0L /* Check if msb guard bit wants rounding up.  */
+#ifdef DELAYED_BRANCHES
+	bt/s	LOCAL(denorm_noup)
+#else
+	bt	LOCAL(denorm_noup_sh1)
+#endif
+	div0s	DBL0H,r2	/* copy orig. sign into T.  */
+	add	#1,r0
+LOCAL(denorm_noup):
+	rts
+	rotcr	r0
+LOCAL(ill_exp):
+	div0s	DBL0H,r1
+	mov.l	LOCAL(x7ff80000),r2
+	add	r1,r1
+	bf	LOCAL(inf_nan)
+	mov.w	LOCAL(m32),r3 /* Handle denormal or zero.  */
+	shlr16	r1
+	exts.w	r1,r1
+	shll2	r1
+	add	r1,r1
+	shlr8	r1
+	exts.w	r1,r1
+	add	#-8,r1	/* Go from 9 to 1 guard bit in MSW.  */
+	cmp/gt	r3,r1
+	mov.l	@r15+,r3 /* DBL0L */
+	bf	LOCAL(zero)
+	mov.l	DBL0L, @-r15
+	shll8	DBL0L
+	rotcr	r0	/* Insert leading 1.  */
+	shlr16	r3
+	shll2	r3
+	add	r3,r3
+	shlr8	r3
+	cmp/pl	DBL0L	/* Check lower 23 guard bits if guard bit 23 is 0.  */
+	addc	r3,r0	/* Assemble fraction with compressed guard bits.  */
+	mov.l	@r15+,DBL0L
+	mov	#0,r2
+	neg	r1,r1
+LOCAL(denorm_loop):
+	shlr	r0
+	rotcl	r2
+	dt	r1
+	bf	LOCAL(denorm_loop)
+	tst	#2,r0
+	rotcl	r0
+	tst	r2,r2
+	rotcl	r0
+	xor	#3,r0
+	add	#3,r0	/* Even overflow gives the correct result.  */
+	shlr2	r0
+	div0s	r0,DBL0H
+	rts
+	rotcr	r0
+LOCAL(zero):
+	mov	#0,r0
+	div0s	r0,DBL0H
+	rts
+	rotcr	r0
+LOCAL(inf_nan):
+	not	DBL0H,r0
+	tst	r2,r0
+	mov.l	@r15+,DBL0L
+	bf	LOCAL(inf)
+	rts
+	mov	#-1,r0	/* NAN */
+LOCAL(inf):	/* r2 must be positive here.  */
+	mov.l	LOCAL(xffe00000),r0
+	div0s	r2,DBL0H
+	rts
+	rotcr	r0
+LOCAL(m32):
+	.word	-32
+	.balign	4
+LOCAL(x38000000):
+	.long	0x38000000
+LOCAL(x70000000):
+	.long	0x70000000
+LOCAL(x2fffffff):
+	.long	0x2fffffff
+LOCAL(x01000000):
+	.long	0x01000000
+LOCAL(xff000000):
+	.long	0xff000000
+LOCAL(x5fffffff):
+	.long	0x5fffffff
+LOCAL(x7ff80000):
+	.long	0x7ff80000
+LOCAL(xffe00000):
+	.long	0xffe00000
+	ENDFUNC(GLOBAL(truncdfsf2_))
+#endif /*  L_truncdfsf2 */
+#endif /* ! DYN_SHIFT */
+
+/* The actual arithmetic uses dynamic shift.  Supporting SH1 / SH2 here would
+   make this code too hard to maintain, so if you want to add SH1 / SH2
+   support, do it in a separate copy.  */
+#ifdef DYN_SHIFT
+#ifdef L_extendsfdf2
+	.global GLOBAL(extendsfdf2_)
+	FUNC(GLOBAL(extendsfdf2_))
+GLOBAL(extendsfdf2_):
+	mov.l	LOCAL(x7f800000),r2
+	mov	#29,r3
+	mov	r4,DBLRL
+	not	r4,DBLRH
+	tst	r2,r4
+	shld	r3,DBLRL
+	bt	LOCAL(zero_denorm)
+	mov	#-3,r3
+	tst	r2,DBLRH
+	mov	r4,DBLRH
+	bt/s	LOCAL(inf_nan)
+	 shll	DBLRH
+	shld	r3,DBLRH
+	mov.l	LOCAL(x38000000),r2
+	rotcr	DBLRH
+	rts
+	add	r2,DBLRH
+	.balign	4
+LOCAL(inf_nan):
+	shld	r2,DBLRH
+	mov.l	LOCAL(x70000000),r2
+	rotcr	DBLRH
+	rts
+	add	r2,DBLRH
+LOCAL(zero_denorm):
+	mov.l	r4,@-r15
+	add	r4,r4
+	tst	r4,r4
+	extu.w	r4,r2
+	bt	LOCAL(zero)
+	cmp/eq	r4,r2
+	extu.b	r4,r1
+	bf/s	LOCAL(three_bytes)
+	 mov.l	LOCAL(c__clz_tab),r0
+	cmp/eq	r4,r1
+	mov	#22,DBLRH
+	bt	LOCAL(one_byte)
+	shlr8	r2
+	mov	#14,DBLRH
+LOCAL(one_byte):
+#ifdef __pic__
+	add	r0,r2
+	mova  LOCAL(c__clz_tab),r0
+#endif
+	mov.b	@(r0,r2),r2
+	mov	#21,r3
+	mov.w	LOCAL(x0),DBLRL
+	sub	r2,DBLRH
+LOCAL(norm_shift):
+	shld	DBLRH,r4
+	mov.l	@r15+,r2
+	shld	r3,DBLRH
+	mov.l	LOCAL(xb7ffffff),r3
+	add	r4,DBLRH
+	cmp/pz	r2
+	mov	r2,r4
+	rotcr	DBLRH
+	rts
+	add	r3,DBLRH
+LOCAL(three_bytes):
+	mov	r4,r2
+	shlr16	r2
+#ifdef __pic__
+	add	r0,r2
+	mova  LOCAL(c__clz_tab),r0
+#endif
+	mov.b	@(r0,r2),r2
+	mov	#21,r3
+	mov	#6-32,DBLRH
+	sub	r2,DBLRH
+	mov	r4,DBLRL
+	shld	DBLRH,DBLRL
+	bra	LOCAL(norm_shift)
+	add	#32,DBLRH
+LOCAL(zero):
+	rts	/* DBLRL has already been zeroed above.  */
+	mov.l @r15+,DBLRH
+LOCAL(x0):
+	.word 0
+	.balign	4
+LOCAL(x7f800000):
+	.long	0x7f800000
+LOCAL(x38000000):
+	.long	0x38000000
+LOCAL(x70000000):
+	.long	0x70000000
+LOCAL(xb7ffffff):
+	/* Flip sign back, do exponent adjustment, and remove leading one.  */
+	.long 0x80000000 + 0x38000000 - 1
+LOCAL(c__clz_tab):
+#ifdef __pic__
+	.long	GLOBAL(clz_tab) - .
+#else
+	.long	GLOBAL(clz_tab)
+#endif
+	ENDFUNC(GLOBAL(extendsfdf2_))
+#endif /* L_extendsfdf2 */
+
+#ifdef L_truncdfsf2
+	.global GLOBAL(truncdfsf2_)
+	FUNC(GLOBAL(truncdfsf2_))
+GLOBAL(truncdfsf2_):
+	mov.l	LOCAL(x38000000),r3
+	mov	DBL0H,r1
+	mov.l	LOCAL(x70000000),r2
+	mov	DBL0H,r0
+	sub	r3,r1
+	mov.l	DBL0L,@-r15
+	tst	r2,r1
+	mov	#12,r3
+	shld	r3,r0			! Isolate highpart fraction.
+	bf	LOCAL(ill_exp)
+	shll2	r1
+	mov.l	LOCAL(x2fffffff),r2 /* Fraction lsb | lower guard bits.  */
+	shll2	r1
+	mov.l	LOCAL(xff000000),r3
+	shlr8	r0
+	tst	r2,DBL0L /* Check if msb guard bit wants rounding up.  */
+	mov	#-28,r2
+	bt/s	LOCAL(add_frac)
+	 shld	r2,DBL0L
+	add	#1,DBL0L
+LOCAL(add_frac):
+	add	DBL0L,r0
+	mov.l	LOCAL(x01000000),r2
+	and	r3,r1
+	mov.l	@r15+,DBL0L
+	add	r1,r0
+	tst	r3,r0
+	bt	LOCAL(inf_denorm0)
+#if 0	// No point checking overflow -> infinity if we don't raise a signal.
+	cmp/hs	r3,r0
+	bt	LOCAL(inf)
+#endif
+	div0s	DBL0H,r2	/* copy orig. sign into T.  */
+	rts
+	rotcr	r0
+LOCAL(inf_denorm0):	! We might need to undo previous rounding.
+	mov.l	LOCAL(x2fffffff),r3 /* Old fraction lsb | lower guard bits.  */
+	tst	r1,r1
+	bf	LOCAL(inf)
+	add	#-1,r0
+	tst	r3,DBL0L /* Check if msb guard bit was rounded up.  */
+	mov.l	LOCAL(x5fffffff),r3 /* Fraction lsb | lower guard bits.  */
+	addc	r2,r0
+	shlr	r0
+	tst	r3,DBL0L /* Check if msb guard bit wants rounding up.  */
+	bt/s	LOCAL(denorm_noup)
+	 div0s	DBL0H,r2	/* copy orig. sign into T.  */
+	add	#1,r0
+LOCAL(denorm_noup):
+	rts
+	rotcr	r0
+LOCAL(ill_exp):
+	div0s	DBL0H,r1
+	mov.l	LOCAL(x7ff80000),r2
+	add	r1,r1
+	bf	LOCAL(inf_nan)
+	mov.w	LOCAL(m32),r3 /* Handle denormal or zero.  */
+	mov	#-21,r2
+	shad	r2,r1
+	add	#-8,r1	/* Go from 9 to 1 guard bit in MSW.  */
+	cmp/gt	r3,r1
+	mov.l	@r15+,r3 /* DBL0L */
+	bf	LOCAL(zero)
+	mov.l	DBL0L, @-r15
+	shll8	DBL0L
+	rotcr	r0	/* Insert leading 1.  */
+	shld	r2,r3
+	cmp/pl	DBL0L	/* Check lower 23 guard bits if guard bit 23 is 0.  */
+	addc	r3,r0	/* Assemble fraction with compressed guard bits.  */
+	mov	r0,r2
+	shld	r1,r0
+	mov.l	@r15+,DBL0L
+	add	#32,r1
+	shld	r1,r2
+	tst	#2,r0
+	rotcl	r0
+	tst	r2,r2
+	rotcl	r0
+	xor	#3,r0
+	add	#3,r0	/* Even overflow gives the correct result.  */
+	shlr2	r0
+	div0s	r0,DBL0H
+	rts
+	rotcr	r0
+LOCAL(zero):
+	mov	#0,r0
+	div0s	r0,DBL0H
+	rts
+	rotcr	r0
+LOCAL(inf_nan):
+	not	DBL0H,r0
+	tst	r2,r0
+	mov.l	@r15+,DBL0L
+	bf	LOCAL(inf)
+	rts
+	mov	#-1,r0	/* NAN */
+LOCAL(inf):	/* r2 must be positive here.  */
+	mov.l	LOCAL(xffe00000),r0
+	div0s	r2,DBL0H
+	rts
+	rotcr	r0
+LOCAL(m32):
+	.word	-32
+	.balign	4
+LOCAL(x38000000):
+	.long	0x38000000
+LOCAL(x70000000):
+	.long	0x70000000
+LOCAL(x2fffffff):
+	.long	0x2fffffff
+LOCAL(x01000000):
+	.long	0x01000000
+LOCAL(xff000000):
+	.long	0xff000000
+LOCAL(x5fffffff):
+	.long	0x5fffffff
+LOCAL(x7ff80000):
+	.long	0x7ff80000
+LOCAL(xffe00000):
+	.long	0xffe00000
+	ENDFUNC(GLOBAL(truncdfsf2_))
+#endif /* L_truncdfsf2 */
+
+#ifdef L_add_sub_sf3
+	.global GLOBAL(subsf3_)
+	FUNC(GLOBAL(subsf3_))
+	.global GLOBAL(addsf3_)
+	FUNC(GLOBAL(addsf3_))
+GLOBAL(subsf3_):
+	cmp/pz	r5
+	add	r5,r5
+	rotcr	r5
+GLOBAL(addsf3_):
+	mov.l	LOCAL(x7f800000),r3
+	mov	r4,r6
+	add	r6,r6
+	mov	r5,r7
+	add	r7,r7
+	mov	r4,r0
+	or	r3,r0
+	cmp/hi	r6,r7
+	mov	r5,r1
+	bf/s	LOCAL(r4_hs)
+	 or	r3,r1
+	cmp/eq	r5,r1
+	bt	LOCAL(ret_r5) /* sole Inf or NaN, return unchanged.  */
+	shll8	r0
+	tst	r6,r6
+	shll8	r1
+	mov	#-24,r2
+	bt	LOCAL(denorm_r4)
+LOCAL(denorm_r4_done):
+	mov	r6,r3
+	shld	r2,r3
+	mov	r7,r6
+	shld	r2,r6
+	sub	r6,r3
+	mov	r0,r7
+	shld	r3,r0	/* Get 31 upper bits.  */
+	mov.l	LOCAL(xff000000),r2
+	add	#31,r3
+	mov.l	r5,@-r15 ! push result sign.
+	cmp/pl	r3
+	shld	r3,r7
+	bf	LOCAL(ret_stack)
+	div0s	r4,r5
+	bf/s	LOCAL(add)
+	 cmp/pl	r7	/* Is LSB in r0 clear, but any lower guards bit set?  */
+	subc	r0,r1
+	mov.l	LOCAL(c__clz_tab),r7
+	tst	r2,r1
+	mov	#-24,r3
+	bf/s LOCAL(norm_r0)
+	 mov	r1,r0
+	extu.w	r1,r1
+	bra	LOCAL(norm_check2)
+	 cmp/eq	r0,r1
+LOCAL(ret_r5):
+	rts
+	mov	r5,r0
+LOCAL(ret_stack):
+	rts
+	mov.l	@r15+,r0
+
+/* We leave the numbers denormalized, but we change the bit position to be
+   consistent with normalized numbers.  This also removes the spurious
+   leading one that was inserted before.  */
+LOCAL(denorm_r4):
+	tst	r7,r7
+	add	r0,r0
+	bf	LOCAL(denorm_r4_done)
+	bra	LOCAL(denorm_r4_done)
+	add	r1,r1
+LOCAL(denorm_r5):
+	tst	r6,r6
+	add	r1,r1
+	bf	LOCAL(denorm_r5_done)
+	clrt
+	bra	LOCAL(denorm_r5_done)
+	add	r0,r0
+
+/* If the exponent differs by two or more, normalization is minimal, and
+   few guard bits are needed for an exact final result, so sticky guard
+   bit compression before subtraction (or addition) works fine.
+   If the exponent differs by one, only one extra guard bit is generated,
+   and effectively no guard bit compression takes place.  */
+
+LOCAL(r4_hs):
+	cmp/eq	r4,r0
+	shll8	r0
+	bt	LOCAL(inf_nan_arg0)
+	shll8	r1
+	mov	#-24,r2
+	tst	r7,r7
+	shld	r2,r7
+	bt	LOCAL(denorm_r5)
+LOCAL(denorm_r5_done):
+	mov	r1,r3
+	shld	r2,r6
+	subc	r6,r7
+	mov.l	LOCAL(xff000000),r2
+	bf	LOCAL(same_exp)
+	shld	r7,r1	/* Get 31 upper bits.  */
+	add	#31,r7
+	mov.l	r4,@-r15 ! push result sign.
+	cmp/pl	r7
+	shld	r7,r3
+	bf	LOCAL(ret_stack)
+	div0s	r4,r5
+	bf/s	LOCAL(add)
+	 cmp/pl	r3	/* Is LSB in r1 clear, but any lower guard bit set?  */
+	subc	r1,r0
+	mov.l	LOCAL(c__clz_tab),r7
+LOCAL(norm_check):
+	tst	r2,r0
+	mov	#-24,r3
+	bf LOCAL(norm_r0)
+	extu.w	r0,r1
+	cmp/eq	r0,r1
+LOCAL(norm_check2):
+	mov	#-8,r3
+	bt LOCAL(norm_r0)
+	mov	#-16,r3
+LOCAL(norm_r0):
+	mov	r0,r1
+	shld	r3,r0
+#ifdef __pic__
+	add	r0,r7
+	mova  LOCAL(c__clz_tab),r0
+#endif
+	mov.b	@(r0,r7),r7
+	add	#25,r3
+	add	#-9+1,r6
+	mov	r1,r0
+	sub	r7,r3
+	mov.l	LOCAL(xbfffffff),r7
+	sub	r3,r6	/* generate exp-1  */
+	mov.w	LOCAL(d24),r2
+	cmp/pz	r6	/* check exp > 0  */
+	shld	r3,r0	/* Leading 1 becomes +1 exp adjustment.  */
+	bf	LOCAL(zero_denorm)
+LOCAL(denorm_done):
+	add	#30,r3
+	shld	r3,r1
+	mov.w   LOCAL(m1),r3
+	tst	r7,r1	! clear T if rounding up
+	shld	r2,r6
+	subc	r3,r0	! round - overflow will boost exp adjustment to 2.
+	mov.l	@r15+,r2
+	add	r6,r0	! overflow will generate inf
+	cmp/ge	r2,r3	! get sign into T
+	rts
+	rotcr	r0
+LOCAL(ret_r4):
+	rts
+	mov	r4,r0
+
+/* At worst, we are shifting the number back in place where an incoming
+   denormal was.  Thus, the shifts won't get out of range.  They still
+   might generate a zero fraction, but that's OK, that makes it 0.  */
+LOCAL(zero_denorm):
+	add	r6,r3
+	mov	r1,r0
+	mov	#0,r6	/* leading one will become free (except for rounding) */
+	bra	LOCAL(denorm_done)
+	shld	r3,r0
+
+/* Handle abs(r4) >= abs(r5) with equal exponents specially, so we don't
+   need to check for a zero fraction in the main path.  */
+LOCAL(same_exp):
+	div0s	r4,r5
+	mov.l	r4,@-r15
+	bf	LOCAL(add)
+	cmp/eq	r1,r0
+	mov.l	LOCAL(c__clz_tab),r7
+	bf/s	LOCAL(norm_check)
+	 sub	r1,r0
+	rts	! zero difference -> return +zero
+	mov.l	@r15+,r1
+
+/* r2: 0xff000000 */
+LOCAL(add):
+	addc	r1,r0
+	mov.w	LOCAL(x2ff),r7
+	shll8	r6
+	bf/s	LOCAL(no_carry)
+	shll16	r6
+	tst	r7,r0
+	shlr8	r0
+	mov.l	@r15+,r3	! discard saved sign
+	subc	r2,r0
+	sett
+	addc	r6,r0
+	cmp/hs	r2,r0
+	bt/s	LOCAL(inf)
+	 div0s	r7,r4 /* Copy sign.  */
+	rts
+	rotcr	r0
+LOCAL(inf):
+	mov	r6,r0
+	rts
+	rotcr	r0
+LOCAL(no_carry):
+	mov.w	LOCAL(m1),r3
+	shll	r0
+	bf	LOCAL(denorm_add)
+	tst	r7,r0
+	shlr8	r0
+	mov.l	@r15+,r1	! discard saved sign
+	subc	r3,r0	! round ; overflow -> exp++
+	cmp/ge	r4,r3	/* Copy sign.  */
+	add	r6,r0	! overflow -> inf
+	rts
+	rotcr	r0
+
+LOCAL(denorm_add):
+	shlr	r0
+	cmp/ge	r4,r3	/* Copy sign.  */
+	shlr8	r0
+	mov.l	@r15+,r1	! discard saved sign
+	rts
+	rotcr	r0
+
+LOCAL(inf_nan_arg0):
+	cmp/eq	r5,r1
+	bf	LOCAL(ret_r4)
+	div0s	r4,r5		/* Both are inf or NaN, check signs.  */
+	bt	LOCAL(ret_nan)	/* inf - inf, or NaN.  */
+	mov	r4,r0		! same sign; return NaN if either is NaN.
+	rts
+	or	r5,r0
+LOCAL(ret_nan):
+	rts
+	mov	#-1,r0
+
+LOCAL(d24):
+	.word	24
+LOCAL(x2ff):
+	.word	0x2ff
+LOCAL(m1):
+	.word	-1
+	.balign	4
+LOCAL(x7f800000):
+	.long	0x7f800000
+LOCAL(xbfffffff):
+	.long	0xbfffffff
+LOCAL(xff000000):
+	.long	0xff000000
+LOCAL(xfe000000):
+	.long	0xfe000000
+LOCAL(c__clz_tab):
+#ifdef __pic__
+	.long	GLOBAL(clz_tab) - .
+#else
+	.long	GLOBAL(clz_tab)
+#endif
+
+	ENDFUNC(GLOBAL(addsf3_))
+	ENDFUNC(GLOBAL(subsf3_))
+#endif /* L_add_sub_sf3 */
+
+#ifdef L_mulsf3
+	.global GLOBAL(mulsf3_)
+	FUNC(GLOBAL(mulsf3_))
+GLOBAL(mulsf3_):
+	mov.l	LOCAL(x7f800000),r1
+	not	r4,r2
+	mov	r4,r3
+	not	r5,r0
+	tst	r1,r2
+	or	r1,r3
+	bt/s	LOCAL(inf_nan_arg0)
+	 tst	r1,r0
+	bt	LOCAL(inf_nan_arg1)
+	tst	r1,r5
+	mov	r1,r2
+	shll8	r3
+	or	r5,r1
+	bt/s	LOCAL(zero_denorm_arg1)
+	 shll8	r1
+	tst	r2,r4
+	bt	LOCAL(zero_denorm_arg0)
+	dmulu.l	r3,r1
+	mov	r4,r0
+	and	r2,r0
+LOCAL(arg_norm):
+	and	r5,r2
+	mov.l	LOCAL(x3f800000),r3
+	sts	mach,r1
+	sub	r3,r0
+	sts	macl,r3
+	add	r2,r0
+	cmp/pz	r1
+	mov.w	LOCAL(x100),r2
+	bf/s	LOCAL(norm_frac)
+	 tst	r3,r3
+	shll2	r1	/* Shift one up, replace leading 1 with 0.  */
+	shlr	r1
+	tst	r3,r3
+LOCAL(norm_frac):
+	mov.w	LOCAL(mx80),r3
+	bf	LOCAL(round_frac)
+	tst	r2,r1
+LOCAL(round_frac):
+	mov.l	LOCAL(xff000000),r2
+	subc	r3,r1	/* Even overflow gives right result: exp++, frac=0.  */
+	shlr8	r1
+	add	r1,r0
+	shll	r0
+	bt	LOCAL(ill_exp)
+	tst	r2,r0
+	bt	LOCAL(denorm0)
+	cmp/hs	r2,r0
+	bt	LOCAL(inf)
+LOCAL(insert_sign):
+	div0s	r4,r5
+	rts
+	rotcr	r0
+LOCAL(denorm0):
+	sub	r2,r0
+	bra	LOCAL(insert_sign)
+	 shlr	r0
+LOCAL(zero_denorm_arg1):
+	mov.l	LOCAL(x60000000),r2	/* Check exp0 >= -64	*/
+	add	r1,r1
+	tst	r1,r1	/* arg1 == 0 ? */
+	mov	#0,r0
+	bt	LOCAL(insert_sign) /* argument 1 is zero ==> return 0  */
+	tst	r4,r2
+	bt	LOCAL(insert_sign) /* exp0 < -64  ==> return 0 */
+	mov.l	LOCAL(c__clz_tab),r0
+	mov	r3,r2
+	mov	r1,r3
+	bra	LOCAL(arg_normalize)
+	mov	r2,r1
+LOCAL(zero_denorm_arg0):
+	mov.l	LOCAL(x60000000),r2	/* Check exp1 >= -64	*/
+	add	r3,r3
+	tst	r3,r3	/* arg0 == 0 ? */
+	mov	#0,r0
+	bt	LOCAL(insert_sign) /* argument 0 is zero ==> return 0  */
+	tst	r5,r2
+	bt	LOCAL(insert_sign) /* exp1 < -64  ==> return 0 */
+	mov.l	LOCAL(c__clz_tab),r0
+LOCAL(arg_normalize):
+	mov.l	r7,@-r15
+	extu.w	r3,r7
+	cmp/eq	r3,r7
+	mov.l	LOCAL(xff000000),r7
+	mov	#-8,r2
+	bt	0f
+	tst	r7,r3
+	mov	#-16,r2
+	bt	0f
+	mov	#-24,r2
+0:
+	mov	r3,r7
+	shld	r2,r7
+#ifdef __pic__
+	add	r0,r7
+	mova  LOCAL(c__clz_tab),r0
+#endif
+	mov.b	@(r0,r7),r0
+	add	#32,r2
+	mov	r2,r7
+	mov	#23,r2
+	sub	r0,r7
+	mov.l	LOCAL(x7f800000),r0
+	shld	r7,r3
+	shld	r2,r7
+	mov	r0,r2
+	and	r4,r0
+	sub	r7,r0
+	mov.l	@r15+,r7
+	bra	LOCAL(arg_norm)
+	 dmulu.l	r3,r1
+#if 0 /* This is slightly slower, but could be used if table lookup causes
+         cache thrashing.  */
+	bt	LOCAL(insert_sign) /* exp1 < -64  ==> return 0 */
+	mov.l	LOCAL(xff000000),r2
+	mov	r4,r0
+LOCAL(arg_normalize):
+	tst	r2,r3
+	bf	LOCAL(arg_bit_norm)
+LOCAL(arg_byte_loop):
+	tst	r2,r3
+	add	r2,r0
+	shll8	r3
+	bt	LOCAL(arg_byte_loop)
+	add	r4,r0
+LOCAL(arg_bit_norm):
+	mov.l	LOCAL(x7f800000),r2
+	rotl	r3
+LOCAL(arg_bit_loop):
+	add	r2,r0
+	bf/s	LOCAL(arg_bit_loop)
+	 rotl	r3
+	rotr	r3
+	rotr	r3
+	sub	r2,r0
+	bra	LOCAL(arg_norm)
+	 dmulu.l	r3,r1
+#endif /* 0 */
+LOCAL(inf):
+	bra	LOCAL(insert_sign)
+	 mov	r2,r0
+LOCAL(inf_nan_arg0):
+	bt	LOCAL(inf_nan_both)
+	add	r0,r0
+	cmp/eq	#-1,r0	/* arg1 zero? -> NAN */
+	bt	LOCAL(insert_sign)
+	mov	r4,r0
+LOCAL(inf_insert_sign):
+	bra	LOCAL(insert_sign)
+	 add	r0,r0
+LOCAL(inf_nan_both):
+	mov	r4,r0
+	bra	LOCAL(inf_insert_sign)
+	 or	r5,r0
+LOCAL(inf_nan_arg1):
+	mov	r2,r0
+	add	r0,r0
+	cmp/eq	#-1,r0	/* arg0 zero? */
+	bt	LOCAL(insert_sign)
+	bra	LOCAL(inf_insert_sign)
+	 mov	r5,r0
+LOCAL(ill_exp):
+	cmp/pz	r0
+	mov	#-24,r3
+	bt	LOCAL(inf)
+	add	r1,r1
+	mov	r0,r2
+	sub	r1,r2	! remove fraction to get back pre-rounding exponent.
+	sts	mach,r0
+	sts	macl,r1
+	shad	r3,r2
+	mov	r0,r3
+	shld	r2,r0
+	add	#32,r2
+	cmp/pz	r2
+	shld	r2,r3
+	bf	LOCAL(zero)
+	or	r1,r3
+	mov	#-1,r1
+	tst	r3,r3
+	mov.w	LOCAL(x100),r3
+	bf/s	LOCAL(denorm_round_up)
+	mov	#-0x80,r1
+	tst	r3,r0
+LOCAL(denorm_round_up):
+	mov	#-7,r3
+	subc	r1,r0
+	bra	LOCAL(insert_sign)
+	 shld	r3,r0
+LOCAL(zero):
+	bra	LOCAL(insert_sign)
+	 mov #0,r0
+LOCAL(x100):
+	.word	0x100
+LOCAL(mx80):
+	.word	-0x80
+	.balign	4
+LOCAL(x7f800000):
+	.long 0x7f800000
+LOCAL(x3f800000):
+	.long 0x3f800000
+LOCAL(xff000000):
+	.long	0xff000000
+LOCAL(x60000000):
+	.long	0x60000000
+LOCAL(c__clz_tab):
+#ifdef __pic__
+	.long	GLOBAL(clz_tab) - .
+#else
+	.long	GLOBAL(clz_tab)
+#endif
+	ENDFUNC(GLOBAL(mulsf3_))
+#endif /* L_mulsf3 */
+
+#ifdef L_hypotf
+	.global GLOBAL(hypotf)
+	FUNC(GLOBAL(hypotf))
+GLOBAL(hypotf):
+/* This integer implementation takes 71 to 72 cycles in the main path.
+   This is a bit slower than the SH4 manages with its double precision
+   hardware floating point - 57 cycles, or 69 with mode switches.  */
+ /* First, calculate x (r4) as the sum of the squares of the fractions -
+    the exponent is calculated separately in r3.
+    Then, calculate sqrt(x) for the fraction by reciproot iteration.
+    We get a 7.5 bit initial value using linear approximation with two slopes
+    that are powers of two.
+    x (- [1. .. 2.)  y0 := 1.25 - x/4 - tab(x)   y (- (0.8 .. 1.0)
+    x (- [2. .. 4.)  y0 := 1.   - x/8 - tab(x)   y (- (0.5 .. 0.8)
+ x is represented with two bits before the point,
+ y with 0 bits before the binary point.
+ Thus, to calculate y0 := 1. - x/8 - tab(x), all you have to do is to shift x
+ right by 1, negate it, and subtract tab(x).  */
+
+ /* y1 := 1.5*y0 - 0.5 * (x * y0) * (y0 * y0)
+    z0 := x * y1
+    z1 := z0 + 0.5 * (y1 - (y1*y1) * z0) */
+
+	mov.l	LOCAL(xff000000),r1
+	add	r4,r4
+	mov	r4,r0
+	add	r5,r5
+	cmp/hs	r5,r4
+	sub	r5,r0
+	mov	#-24,r2
+	bf/s	LOCAL(r5_large)
+	shad	r2,r0
+	mov	r4,r3
+	shll8	r4
+	rotcr	r4
+	tst	#0xe0,r0
+	neg	r0,r0
+	bt	LOCAL(ret_abs_r3)
+	tst	r1,r5
+	shll8	r5
+	bt/s	LOCAL(denorm_r5)
+	cmp/hi	r3,r1
+	dmulu.l	r4,r4
+	bf	LOCAL(inf_nan)
+	rotcr	r5
+	shld	r0,r5
+LOCAL(denorm_r5_done):
+	sts	mach,r4
+	dmulu.l	r5,r5
+	mov.l	r6,@-r15
+	mov	#20,r6
+
+	sts	mach,r5
+LOCAL(add_frac):
+	mova	LOCAL(tab)-32,r0
+	mov.l	r7,@-r15
+	mov.w	LOCAL(x1380),r7
+	and	r1,r3
+	addc	r5,r4
+	mov.w	LOCAL(m25),r2	! -25
+	bf	LOCAL(frac_ok)
+	sub	r1,r3
+	rotcr	r4
+	cmp/eq	r1,r3	! did we generate infinity ?
+	bt	LOCAL(inf_nan)
+	shlr	r4
+	mov	r4,r1
+	shld	r2,r1
+	mov.b	@(r0,r1),r0
+	mov	r4,r1
+	shld	r6,r1
+	bra	LOCAL(frac_low2)
+	sub	r1,r7
+
+LOCAL(frac_ok):
+	mov	r4,r1
+	shld	r2,r1
+	mov.b	@(r0,r1),r1
+	cmp/pz	r4
+	mov	r4,r0
+	bt/s	LOCAL(frac_low)
+	shld	r6,r0
+	mov.w	LOCAL(xf80),r7
+	shlr	r0
+LOCAL(frac_low):
+	sub	r0,r7
+LOCAL(frac_low2):
+	mov.l	LOCAL(x40000080),r0 ! avoid denorm results near 1. << r3
+	sub	r1,r7	! {0.12}
+	mov.l	LOCAL(xfffe0000),r5 ! avoid rounding overflow near 4. << r3
+	swap.w	r7,r1	! {0.28}
+	dmulu.l	r1,r4 /* two issue cycles */
+	mulu.w	r7,r7  /* two issue cycles */
+	sts	mach,r2	! {0.26}
+	mov	r1,r7
+	shlr	r1
+	sts	macl,r6	! {0.24}
+	cmp/hi	r0,r4
+	shlr2	r2
+	bf	LOCAL(near_one)
+	shlr	r2	! {0.23} systemic error of linear approximation keeps y1 < 1
+	dmulu.l	r2,r6
+	cmp/hs	r5,r4
+	add	r7,r1	! {1.28}
+	bt	LOCAL(near_four)
+	shlr2	r1	! {1.26}
+	sts	mach,r0	! {0.15} x*y0^3 == {0.16} 0.5*x*y0^3
+	shlr2	r1	! {1.24}
+	shlr8	r1	! {1.16}
+	sett		! compensate for truncation of subtrahend, keep y1 < 1
+	subc	r0,r1   ! {0.16} y1;  max error about 3.5 ulp
+	swap.w	r1,r0
+	dmulu.l	r0,r4	! { 1.30 }
+	mulu.w	r1,r1
+	sts	mach,r2
+	shlr2	r0
+	sts	macl,r1
+	add	r2,r0
+	mov.l	LOCAL(xff000000),r6
+	add	r2,r0
+	dmulu.l	r1,r2
+	add	#127,r0
+	add	r6,r3	! precompensation for adding leading 1
+	sts	mach,r1
+	shlr	r3
+	mov.l	@r15+,r7
+	sub	r1,r0	! {0.31} max error about 50 ulp (+127)
+	mov.l	@r15+,r6
+	shlr8	r0	! {0.23} max error about 0.7 ulp
+	rts
+	add	r3,r0
+	
+LOCAL(r5_large):
+	mov	r5,r3
+	mov	#-31,r2
+	cmp/ge	r2,r0
+	shll8	r5
+	bf	LOCAL(ret_abs_r3)
+	rotcr	r5
+	tst	r1,r4
+	shll8	r4
+	bt/s	LOCAL(denorm_r4)
+	cmp/hi	r3,r1
+	dmulu.l	r5,r5
+	bf	LOCAL(inf_nan)
+	rotcr	r4
+LOCAL(denorm_r4_done):
+	shld	r0,r4
+	sts	mach,r5
+	dmulu.l	r4,r4
+	mov.l	r6,@-r15
+	mov	#20,r6
+	bra	LOCAL(add_frac)
+	sts	mach,r4
+
+LOCAL(near_one):
+	bra	LOCAL(assemble_sqrt)
+	mov	#0,r0
+LOCAL(near_four):
+	! exact round-to-nearest would add 255.  We add 256 for speed & compactness.
+	mov	r4,r0
+	shlr8	r0
+	add	#1,r0
+	tst	r0,r0
+	addc	r0,r3	! might generate infinity.
+LOCAL(assemble_sqrt):
+	mov.l	@r15+,r7
+	shlr	r3
+	mov.l	@r15+,r6
+	rts
+	add	r3,r0
+LOCAL(inf_nan):
+LOCAL(ret_abs_r3):
+	mov	r3,r0
+	rts
+	shlr	r0
+LOCAL(denorm_r5):
+	bf	LOCAL(inf_nan)
+	tst	r1,r4
+	bt	LOCAL(denorm_both)
+	dmulu.l	r4,r4
+	bra	LOCAL(denorm_r5_done)
+	shld	r0,r5
+LOCAL(denorm_r4):
+	bf	LOCAL(inf_nan)
+	tst	r1,r5
+	dmulu.l	r5,r5
+	bf	LOCAL(denorm_r4_done)
+LOCAL(denorm_both):	! normalize according to r3.
+	extu.w	r3,r2
+	mov.l	LOCAL(c__clz_tab),r0
+	cmp/eq	r3,r2
+	mov	#-8,r2
+	bt	0f
+	tst	r1,r3
+	mov	#-16,r2
+	bt	0f
+	mov	#-24,r2
+0:
+	shld	r2,r3
+	mov.l	r7,@-r15
+#ifdef __pic__
+	add	r0,r3
+	mova	 LOCAL(c__clz_tab),r0
+#endif
+	mov.b	@(r0,r3),r0
+	add	#32,r2
+	sub	r0,r2
+	shld	r2,r4
+	mov	r2,r7
+	dmulu.l	r4,r4
+	sts.l	pr,@-r15
+	mov	#1,r3
+	bsr	LOCAL(denorm_r5_done)
+	shld	r2,r5
+	mov.l	LOCAL(x01000000),r1
+	neg	r7,r2
+	lds.l	@r15+,pr
+	tst	r1,r0
+	mov.l	@r15+,r7
+	bt	0f
+	add	#1,r2
+	sub	r1,r0
+0:
+	rts
+	shld	r2,r0
+
+LOCAL(m25):
+	.word	-25
+LOCAL(x1380):
+	.word	0x1380
+LOCAL(xf80):
+	.word	0xf80
+	.balign	4
+LOCAL(xff000000):
+	.long	0xff000000
+LOCAL(x40000080):
+	.long	0x40000080
+LOCAL(xfffe0000):
+	.long	0xfffe0000
+LOCAL(x01000000):
+	.long	0x01000000
+LOCAL(c__clz_tab):
+#ifdef __pic__
+	.long	GLOBAL(clz_tab) - .
+#else
+	.long	GLOBAL(clz_tab)
+#endif
+
+/*
+double err(double x)
+{
+  return (x < 2. ? 1.25 - x/4. : 1. - x/8.) - 1./sqrt(x);
+}
+
+int
+main ()
+{
+  int i = 0;
+  double x, s, v;
+  double lx, hx;
+
+  s = 1./32.;
+  for (x = 1.; x < 4; x += s, i++)
+    {
+      lx = x;
+      hx = x + s - 1. / (1 << 30);
+      v = 0.5 * (err (lx) + err (hx));
+      printf ("%s% 4d%c",
+              (i & 7) == 0 ? "\t.byte\t" : "",
+              (int)(v * 4096 + 0.5) - 128,
+              (i & 7) == 7 ? '\n' : ',');
+    }
+  return 0;
+} */
+
+	.balign	4
+LOCAL(tab):
+	.byte	-113, -84, -57, -33, -11,   8,  26,  41
+	.byte	  55,  67,  78,  87,  94, 101, 106, 110
+	.byte	 113, 115, 115, 115, 114, 112, 109, 106
+	.byte	 101,  96,  91,  84,  77,  69,  61,  52
+	.byte	  51,  57,  63,  68,  72,  77,  80,  84
+	.byte	  87,  89,  91,  93,  95,  96,  97,  97
+	.byte	  97,  97,  97,  96,  95,  94,  93,  91
+	.byte	  89,  87,  84,  82,  79,  76,  72,  69
+	.byte	  65,  61,  57,  53,  49,  44,  39,  34
+	.byte	  29,  24,  19,  13,   8,   2,  -4, -10
+	.byte	 -17, -23, -29, -36, -43, -50, -57, -64
+	.byte	 -71, -78, -85, -93,-101,-108,-116,-124
+	ENDFUNC(GLOBAL(hypotf))
+#endif /* L_hypotf */
+
+#ifdef L_add_sub_df3
+#include "IEEE-754/adddf3.S"
+#endif /* L_add_sub_df3 */
+
+#ifdef L_muldf3
+#include "IEEE-754/muldf3.S"
+#endif /* L_muldf3 */
+#endif /* DYN_SHIFT */
Index: sh-modes.def
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh-modes.def,v
retrieving revision 1.1
diff -p -u -r1.1 sh-modes.def
--- sh-modes.def	13 Oct 2003 21:16:32 -0000	1.1
+++ sh-modes.def	30 Sep 2004 18:59:32 -0000
@@ -1,5 +1,5 @@
-/* Alpha extra machine modes. 
-   Copyright (C) 2003 Free Software Foundation, Inc.
+/* SH extra machine modes. 
+   Copyright (C) 2004 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -21,3 +21,7 @@ Boston, MA 02111-1307, USA.  */
 /* The SH uses a partial integer mode to represent the FPSCR register.  */
 PARTIAL_INT_MODE (SI);
 
+/* For software floating point comparisons.  */
+CC_MODE (CC_FP_NE);
+CC_MODE (CC_FP_GT);
+CC_MODE (CC_FP_UNLT);
Index: sh-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh-protos.h,v
retrieving revision 1.55
diff -p -u -r1.55 sh-protos.h
--- sh-protos.h	10 May 2004 23:25:13 -0000	1.55
+++ sh-protos.h	30 Sep 2004 18:59:32 -0000
@@ -93,6 +93,10 @@ extern void expand_sf_binop (rtx (*)(rtx
 extern void expand_df_unop (rtx (*)(rtx, rtx, rtx), rtx *);
 extern void expand_df_binop (rtx (*)(rtx, rtx, rtx, rtx), rtx *);
 extern void expand_fp_branch (rtx (*)(void), rtx (*)(void));
+extern void expand_sfunc_unop (enum machine_mode, rtx (*) (rtx, rtx),
+			       const char *, enum rtx_code code, rtx *);
+extern void expand_sfunc_binop (enum machine_mode, rtx (*) (rtx, rtx),
+				const char *, enum rtx_code code, rtx *);
 extern int sh_insn_length_adjustment (rtx);
 extern int sh_can_redirect_branch (rtx, rtx);
 extern void sh_expand_unop_v2sf (enum rtx_code, rtx, rtx);
Index: sh.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.c,v
retrieving revision 1.270.2.3
diff -p -u -r1.270.2.3 sh.c
--- sh.c	29 Jun 2004 17:33:57 -0000	1.270.2.3
+++ sh.c	30 Sep 2004 18:59:37 -0000
@@ -1061,6 +1061,68 @@ prepare_scc_operands (enum rtx_code code
   return t_reg;
 }
 
+static rtx
+sh_soft_fp_cmp (int code, enum machine_mode op_mode)
+{
+  const char *name;
+  rtx (*fun) (rtx, rtx), addr, tmp, first, last, equiv;
+  int df = op_mode == DFmode;
+  enum machine_mode mode;
+
+  if (flag_finite_math_only && ! df)
+    switch (code)
+      {
+      case EQ:
+	return gen_cmpeqsf_i1_finite (sh_compare_op0, sh_compare_op1);
+      case LE:
+      case UNLE:
+	return gen_cmplesf_i1_finite (sh_compare_op0, sh_compare_op1);
+      case GE:
+      case UNGE:
+	return gen_cmplesf_i1_finite (sh_compare_op1, sh_compare_op0);
+      default:
+	break;
+      }
+  if (flag_finite_math_only && df && code == EQ)
+    return gen_cmpeqdf_i1_finite (sh_compare_op0, sh_compare_op1);
+
+  switch (code)
+    {
+    case EQ:
+      name = df ? "__nedf2_" : "__nesf2_";
+      fun = df ? gen_cmpnedf_i1 : gen_cmpnesf_i1;
+      mode = CC_FP_NEmode;
+      break;
+    case UNLE:
+      name = df ? "__gtdf2t" : "__gtsf2t";
+      fun = df ? gen_cmpgtdf_i1 : gen_cmpgtsf_i1;
+      mode = CC_FP_GTmode;
+      break;
+    case GE:
+      name = df ? "__gedf2f" : "__gesf2f";
+      fun = df ? gen_cmpunltdf_i1 : gen_cmpunltsf_i1;
+      mode = CC_FP_UNLTmode;
+      break;
+    default: abort ();
+    }
+  tmp = gen_reg_rtx (mode);
+  addr = force_reg (Pmode, function_symbol (name));
+  first = emit_move_insn (gen_rtx_REG (op_mode, R4_REG), sh_compare_op0);
+  emit_move_insn (gen_rtx_REG (op_mode, R5_REG + df), sh_compare_op1);
+  last = emit_insn (fun (tmp, addr));
+  equiv = gen_rtx_fmt_ee (COMPARE, mode, sh_compare_op0, sh_compare_op1);
+  REG_NOTES (last) = gen_rtx_EXPR_LIST (REG_EQUAL, equiv, REG_NOTES (last));
+  /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
+     invariant code motion can move it.  */
+  REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
+  REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+  /* Use fpcmp_i1 rather than cmpeqsi_t, so that the optimizers can grok
+     the computation.  */
+  return gen_rtx_SET (VOIDmode,
+		      gen_rtx_REG (SImode, T_REG),
+		      gen_rtx_fmt_ee (code, SImode, tmp, CONST0_RTX (mode)));
+}
+
 /* Called from the md file, set up the operands of a compare instruction.  */
 
 void
@@ -1081,11 +1143,16 @@ from_compare (rtx *operands, int code)
 	  || (TARGET_SH2E && GET_MODE_CLASS (mode) == MODE_FLOAT))
 	sh_compare_op1 = force_reg (mode, sh_compare_op1);
     }
-  if (TARGET_SH2E && GET_MODE_CLASS (mode) == MODE_FLOAT && code == GE)
+  if (GET_MODE_CLASS (mode) == MODE_FLOAT && TARGET_SH2E && code == GE
+      && (mode == SFmode || TARGET_SH4))
     {
       from_compare (operands, GT);
       insn = gen_ieee_ccmpeqsf_t (sh_compare_op0, sh_compare_op1);
     }
+  else if (GET_MODE_CLASS (mode) == MODE_FLOAT
+	   && ! TARGET_SH4 && TARGET_SH1
+	   && (mode == DFmode || ! TARGET_SH2E))
+    insn = sh_soft_fp_cmp (code, mode);
   else
     insn = gen_rtx_SET (VOIDmode,
 			gen_rtx_REG (SImode, T_REG),
@@ -7582,7 +7649,7 @@ equality_comparison_operator (rtx op, en
 int
 greater_comparison_operator (rtx op, enum machine_mode mode)
 {
-  if (mode != VOIDmode && GET_MODE (op) == mode)
+  if (mode != VOIDmode && GET_MODE (op) != mode)
     return 0;
   switch (GET_CODE (op))
     {
@@ -7599,7 +7666,7 @@ greater_comparison_operator (rtx op, enu
 int
 less_comparison_operator (rtx op, enum machine_mode mode)
 {
-  if (mode != VOIDmode && GET_MODE (op) == mode)
+  if (mode != VOIDmode && GET_MODE (op) != mode)
     return 0;
   switch (GET_CODE (op))
     {
@@ -7613,6 +7680,37 @@ less_comparison_operator (rtx op, enum m
     }
 }
 
+int
+soft_fp_comparison_operator (rtx op, enum machine_mode mode)
+{
+  if (mode != VOIDmode && GET_MODE (op) != mode)
+    return 0;
+  switch (GET_CODE (op))
+    {
+    default:
+      return 0;
+    case EQ:	mode = CC_FP_NEmode;	break;
+    case UNLE:	mode = CC_FP_GTmode;	break;
+    case GE:	mode = CC_FP_UNLTmode;	break;
+    }
+  return register_operand (XEXP (op, 0), mode);
+}
+
+int
+soft_fp_comparison_operand (rtx op, enum machine_mode mode)
+{
+  switch (GET_MODE (op))
+    {
+      default:
+	return 0;
+      case CC_FP_NEmode: case CC_FP_GTmode: case CC_FP_UNLTmode:
+	break;
+    }
+  if (mode == SFmode && TARGET_SH2E)
+    return 0;
+  return register_operand (op, mode);
+}
+
 /* Accept pseudos and branch target registers.  */
 int
 target_reg_operand (rtx op, enum machine_mode mode)
@@ -7946,6 +8044,54 @@ expand_df_binop (rtx (*fun) (rtx, rtx, r
   emit_df_insn ((*fun) (operands[0], operands[1], operands[2],
 			get_fpscr_rtx ()));
 }
+
+/* Expand an sfunc operation taking NARGS MODE arguments, using generator
+   function FUN, which needs symbol NAME loaded int a register first.
+   Add a REG_EQUAL note using EQUIV.  */
+static void
+expand_sfunc_op (int nargs, enum machine_mode mode, rtx (*fun) (rtx, rtx),
+		    const char *name, rtx equiv, rtx *operands)
+{
+  int next_reg = FIRST_PARM_REG, i;
+  rtx addr, first = NULL_RTX, last, insn;
+
+  addr = force_reg (Pmode, function_symbol (name));
+  for ( i = 1; i <= nargs; i++)
+    {
+      insn = emit_move_insn (gen_rtx_REG (mode, next_reg), operands[i]);
+      if (!first)
+	first = insn;
+      next_reg += GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+    }
+  last = emit_insn ((*fun) (operands[0], addr));
+  REG_NOTES (last) = gen_rtx_EXPR_LIST (REG_EQUAL, equiv, REG_NOTES (last));
+  /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
+     invariant code motion can move it.  */
+  REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
+  REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+}
+
+/* Expand an sfunc unary operation taking one MODE argument, using generator
+   function FUN, which needs symbol NAME loaded into a register first.
+   Add a REG_EQUAL note using CODE.  */
+void
+expand_sfunc_unop (enum machine_mode mode, rtx (*fun) (rtx, rtx),
+		   const char *name, enum rtx_code code, rtx *operands)
+{
+  rtx equiv = gen_rtx_fmt_e (code, GET_MODE (operands[0]), operands[1]);
+  expand_sfunc_op (1, mode, fun, name, equiv, operands);
+}
+
+/* Expand an sfunc binary operation in MODE, using generator function FUN,
+   which needs symbol NAME loaded int a register first.
+   Add a REG_EQUAL note using CODE.  */
+void
+expand_sfunc_binop (enum machine_mode mode, rtx (*fun) (rtx, rtx),
+		    const char *name, enum rtx_code code, rtx *operands)
+{
+  rtx equiv = gen_rtx_fmt_ee (code, mode, operands[1], operands[2]);
+  expand_sfunc_op (2, mode, fun, name, equiv, operands);
+}
 
 /* ??? gcc does flow analysis strictly after common subexpression
    elimination.  As a result, common subexpression elimination fails
Index: sh.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.h,v
retrieving revision 1.240.2.4
diff -p -u -r1.240.2.4 sh.h
--- sh.h	29 Jun 2004 17:33:57 -0000	1.240.2.4
+++ sh.h	30 Sep 2004 18:59:37 -0000
@@ -3295,6 +3295,8 @@ extern int rtx_equal_function_value_matt
   {"noncommutative_float_operator", {MINUS, DIV}},			\
   {"shmedia_6bit_operand", {SUBREG, REG, CONST_INT}},			\
   {"sh_register_operand", {REG, SUBREG, CONST_INT}},			\
+  {"soft_fp_comparison_operand", {SUBREG, REG}},			\
+  {"soft_fp_comparison_operator", {EQ, UNLE, GE}},			\
   {"target_reg_operand", {SUBREG, REG}},				\
   {"target_operand", {SUBREG, REG, LABEL_REF, SYMBOL_REF, CONST, UNSPEC}},\
   {"trunc_hi_operand", {SUBREG, REG, TRUNCATE}},			\
@@ -3308,6 +3310,7 @@ extern int rtx_equal_function_value_matt
 #define SPECIAL_MODE_PREDICATES \
   "any_register_operand", \
   "int_gpr_dest", \
+  "soft_fp_comparison_operand", \
   "trunc_hi_operand", \
   /* This line intentionally left blank.  */
 
Index: sh.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.md,v
retrieving revision 1.172.2.1
diff -p -u -r1.172.2.1 sh.md
--- sh.md	8 Jun 2004 16:55:33 -0000	1.172.2.1
+++ sh.md	30 Sep 2004 18:59:42 -0000
@@ -152,6 +152,16 @@
   (UNSPECV_CONST8	6)
   (UNSPECV_WINDOW_END	10)
   (UNSPECV_CONST_END	11)
+
+  ;; NaN handling for software floating point:
+  ;; We require one precision-specific bit to be set in all NaNs,
+  ;; so that we can test them with a not / tst sequence.
+  ;; ??? Ironically, this is the quiet bit for now, because that is the
+  ;; only bit set by __builtin_nan ("").
+  ;; ??? Should really use one bit lower and force it set by using
+  ;; a custom encoding function.
+  (SF_NAN_MASK		0x7fc00000)
+  (DF_NAN_MASK		0x7ff80000)
 ])
 
 ;; -------------------------------------------------------------------------
@@ -660,6 +670,14 @@
 	cmp/eq	%1,%0"
    [(set_attr "type" "mt_group")])
 
+(define_insn "fpcmp_i1"
+  [(set (reg:SI T_REG)
+	(match_operator:SI 1 "soft_fp_comparison_operator"
+	  [(match_operand 0 "soft_fp_comparison_operand" "r") (const_int 0)]))]
+  "TARGET_SH1 && !TARGET_SH4"
+  "tst	%0,%0"
+   [(set_attr "type" "mt_group")])
+
 (define_insn "cmpgtsi_t"
   [(set (reg:SI T_REG)
 	(gt:SI (match_operand:SI 0 "arith_reg_operand" "r,r")
@@ -5272,6 +5290,14 @@
       DONE;
     }
 
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+      && TARGET_SH1 && !TARGET_SH4
+      && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+    {
+      from_compare (operands, UNLE);
+      emit_jump_insn (gen_branch_false (operands[0]));
+      DONE;
+    }
   from_compare (operands, GT);
 }")
 
@@ -5308,10 +5334,15 @@
       rtx tmp = sh_compare_op0;
       sh_compare_op0 = sh_compare_op1;
       sh_compare_op1 = tmp;
-      emit_insn (gen_bgt (operands[0]));
-      DONE;
+      if (TARGET_SH4 || (TARGET_SH2E && GET_MODE (sh_compare_op0) == SFmode))
+	{
+	  emit_insn (gen_bgt (operands[0]));
+	  DONE;
+	}
+      from_compare (operands, UNLE);
     }
-  from_compare (operands, GE);
+  else
+    from_compare (operands, GE);
 }")
 
 (define_expand "ble"
@@ -5342,9 +5373,9 @@
       DONE;
     }
 
-  if (TARGET_SH2E
-      && TARGET_IEEE
-      && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+      && (!TARGET_SH2E || TARGET_IEEE
+	  || (!TARGET_SH4 && GET_MODE (sh_compare_op0) == DFmode)))
     {
       rtx tmp = sh_compare_op0;
       sh_compare_op0 = sh_compare_op1;
@@ -5383,9 +5414,9 @@
       DONE;
     }
 
-  if (TARGET_SH2E
-      && ! TARGET_IEEE
-      && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+      && TARGET_SH2E && !TARGET_IEEE
+      && (TARGET_SH4 || GET_MODE (sh_compare_op0) == SFmode))
     {
       rtx tmp = sh_compare_op0;
       sh_compare_op0 = sh_compare_op1;
@@ -5484,19 +5515,82 @@
   from_compare (operands, GTU);
 }")
 
+;; ??? Can't use DFmode bcc patterns for SH2E when there is no SFmode
+;; equivalent: the insn predicate has to be evaluable at compiler startup,
+;; and FAIL in bcc patterns causes crashes.
 (define_expand "bunordered"
   [(set (match_dup 1) (unordered:DI (match_dup 2) (match_dup 3)))
    (set (pc)
 	(if_then_else (ne (match_dup 1) (const_int 0))
 		      (label_ref:DI (match_operand 0 "" ""))
 		      (pc)))]
-  "TARGET_SHMEDIA"
+  "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA"
   "
 {
-  operands[1] = gen_reg_rtx (DImode);
   operands[2] = force_reg (GET_MODE (sh_compare_op0), sh_compare_op0);
   operands[3] = force_reg (GET_MODE (sh_compare_op1), sh_compare_op1);
+  if (TARGET_SH1)
+    {
+      HOST_WIDE_INT mask;
+      switch (GET_MODE (operands[2]))
+	{
+	case SFmode:
+	  mask = SF_NAN_MASK;
+	  break;
+	case DFmode:
+	  mask = DF_NAN_MASK;
+	  break;
+	default:
+	  FAIL;
+	}
+      emit_insn (gen_cmpunsf_i1 (operands[2], operands[3],
+				 force_reg (SImode, GEN_INT (mask))));
+      emit_jump_insn (gen_branch_true (operands[0]));
+      DONE;
+    }
+  operands[1] = gen_reg_rtx (DImode);
 }")
+
+(define_expand "bunle"
+  [(set (pc)
+	(if_then_else (ne (reg:SI T_REG) (const_int 0))
+		      (label_ref (match_operand 0 "" ""))
+		      (pc)))]
+  "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA_FPU"
+  "
+{
+  if (TARGET_SHMEDIA_FPU)
+    {
+      rtx tmp = gen_reg_rtx (DImode);
+
+      emit_insn (gen_sgt (tmp));
+      emit_jump_insn (gen_beq_media (operands[0], tmp, const0_rtx));
+      DONE;
+    }
+
+  from_compare (operands, UNLE);
+}")
+
+(define_expand "bunlt"
+  [(set (pc)
+	(if_then_else (eq (reg:SI T_REG) (const_int 0))
+		      (label_ref (match_operand 0 "" ""))
+		      (pc)))]
+  "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA_FPU"
+  "
+{
+  if (TARGET_SHMEDIA_FPU)
+    {
+      rtx tmp = gen_reg_rtx (DImode);
+
+      emit_insn (gen_sge (tmp));
+      emit_jump_insn (gen_beq_media (operands[0], tmp, const0_rtx));
+      DONE;
+    }
+
+  from_compare (operands, GE);
+}")
+
 
 ;; ------------------------------------------------------------------------
 ;; Jump and linkage insns
@@ -7495,6 +7589,13 @@ mov.l\\t1f,r0\\n\\
     DONE;
   if (! rtx_equal_function_value_matters)
     FAIL;
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+      && !TARGET_SH4 && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+   {
+      from_compare (operands, EQ);
+      emit_insn (gen_movt (operands[0]));
+      DONE;
+    }
   operands[1] = prepare_scc_operands (EQ);
 }")
 
@@ -7543,6 +7644,9 @@ mov.l\\t1f,r0\\n\\
     }
   if (! rtx_equal_function_value_matters)
     FAIL;
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT && !TARGET_SH4
+      && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+    FAIL;
   operands[1] = prepare_scc_operands (LT);
 }")
 
@@ -7647,6 +7751,9 @@ mov.l\\t1f,r0\\n\\
     }
   if (! rtx_equal_function_value_matters)
     FAIL;
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT && !TARGET_SH4
+      && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+    FAIL;
   operands[1] = prepare_scc_operands (GT);
 }")
 
@@ -7703,7 +7810,13 @@ mov.l\\t1f,r0\\n\\
     FAIL;
   if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
     {
-      if (TARGET_IEEE)
+      if (!TARGET_SH4
+	  && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+	{
+	  from_compare (operands, GE);
+	  emit_insn (gen_movt (operands[0]));
+	}
+      else if (TARGET_IEEE)
 	{
 	  rtx lab = gen_label_rtx ();
 	  prepare_scc_operands (EQ);
@@ -7834,6 +7947,21 @@ mov.l\\t1f,r0\\n\\
   operands[1] = prepare_scc_operands (GEU);
 }")
 
+(define_expand "sunle"
+  [(set (match_operand:SI 0 "arith_reg_operand" "")
+	(match_dup 1))]
+  "TARGET_SH1 && !TARGET_SH4"
+  "
+{
+  if (TARGET_SH2E && GET_MODE (sh_compare_op0) == SFmode)
+    FAIL;
+  if (! rtx_equal_function_value_matters)
+    FAIL;
+  from_compare (operands, UNLE);
+  emit_insn (gen_movt (operands[0]));
+  DONE;
+}")
+
 ;; sne moves the complement of the T reg to DEST like this:
 ;;      cmp/eq ...
 ;;      mov    #-1,temp
@@ -7882,7 +8010,15 @@ mov.l\\t1f,r0\\n\\
     DONE;
   if (! rtx_equal_function_value_matters)
     FAIL;
-  operands[1] = prepare_scc_operands (EQ);
+  if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+      && !TARGET_SH4
+      && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+    {
+      from_compare (operands, EQ);
+      operands[1] = gen_rtx_REG (SImode, T_REG);
+    }
+  else
+    operands[1] = prepare_scc_operands (EQ);
   operands[2] = gen_reg_rtx (SImode);
 }")
 
@@ -8257,7 +8393,7 @@ mov.l\\t1f,r0\\n\\
   [(set (match_operand:SF 0 "arith_reg_operand" "")
 	(plus:SF (match_operand:SF 1 "arith_reg_operand" "")
 		 (match_operand:SF 2 "arith_reg_operand" "")))]
-  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
   "
 {
   if (TARGET_SH2E)
@@ -8265,6 +8401,12 @@ mov.l\\t1f,r0\\n\\
       expand_sf_binop (&gen_addsf3_i, operands);
       DONE;
     }
+  else if (TARGET_SH3)
+    {
+      expand_sfunc_binop (SFmode, &gen_addsf3_i3, \"__addsf3_\", PLUS,
+			  operands);
+      DONE;
+    }
 }")
 
 (define_insn "*addsf3_media"
@@ -8341,6 +8483,22 @@ mov.l\\t1f,r0\\n\\
 }"
   [(set_attr "type" "fparith_media")])
 
+(define_insn "addsf3_i3"
+  [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+	(plus:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (clobber (reg:SI R6_REG))
+   (clobber (reg:SI R7_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH3 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_insn "addsf3_i"
   [(set (match_operand:SF 0 "arith_reg_operand" "=f")
 	(plus:SF (match_operand:SF 1 "arith_reg_operand" "%0")
@@ -8355,7 +8513,7 @@ mov.l\\t1f,r0\\n\\
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
 	(minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
 		  (match_operand:SF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
   "
 {
   if (TARGET_SH2E)
@@ -8363,6 +8521,12 @@ mov.l\\t1f,r0\\n\\
       expand_sf_binop (&gen_subsf3_i, operands);
       DONE;
     }
+  else if (TARGET_SH3)
+    {
+      expand_sfunc_binop (SFmode, &gen_subsf3_i3, \"__subsf3_\", MINUS,
+			  operands);
+      DONE;
+    }
 }")
 
 (define_insn "*subsf3_media"
@@ -8373,6 +8537,23 @@ mov.l\\t1f,r0\\n\\
   "fsub.s	%1, %2, %0"
   [(set_attr "type" "fparith_media")])
 
+(define_insn "subsf3_i3"
+  [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+	(minus:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (clobber (reg:SI R5_REG))
+   (clobber (reg:SI R6_REG))
+   (clobber (reg:SI R7_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH3 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_insn "subsf3_i"
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
 	(minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "0")
@@ -8392,13 +8573,19 @@ mov.l\\t1f,r0\\n\\
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
 	(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
 		 (match_operand:SF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
   "
 {
   if (TARGET_SH4)
     expand_sf_binop (&gen_mulsf3_i4, operands);
   else if (TARGET_SH2E)
     emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
+  else if (TARGET_SH3)
+    {
+      expand_sfunc_binop (SFmode, &gen_mulsf3_i3, \"__mulsf3_\", MULT,
+			  operands);
+      DONE;
+    }
   if (! TARGET_SHMEDIA)
     DONE;
 }")
@@ -8429,6 +8616,22 @@ mov.l\\t1f,r0\\n\\
   "fmul	%2,%0"
   [(set_attr "type" "fp")])
 
+(define_insn "mulsf3_i3"
+  [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+	(mult:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI MACH_REG))
+   (clobber (reg:SI MACL_REG))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH3 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_insn "*mac_media"
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
 	(plus:SF (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%f")
@@ -8589,6 +8792,95 @@ mov.l\\t1f,r0\\n\\
   "ftrc	%1,%0"
   [(set_attr "type" "fp")])
 
+(define_insn "cmpnesf_i1"
+  [(set (match_operand:CC_FP_NE 0 "register_operand" "=z")
+	(compare:CC_FP_NE (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpgtsf_i1"
+  [(set (match_operand:CC_FP_GT 0 "register_operand" "=z")
+	(compare:CC_FP_GT (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpunltsf_i1"
+  [(set (match_operand:CC_FP_UNLT 0 "register_operand" "=z")
+	(compare:CC_FP_UNLT (reg:SF R4_REG) (reg:SF R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpeqsf_i1_finite"
+  [(set (reg:SI T_REG)
+	(eq:SI (match_operand:SF 0 "arith_reg_operand" "r,r,r")
+	       (match_operand:SF 1 "arith_reg_operand" "r,r,r")))
+   (clobber (match_scratch:SI 2 "=0,1,?r"))]
+  "TARGET_SH1 && ! TARGET_SH2E && flag_finite_math_only"
+  "*
+{
+  if (which_alternative == 0)
+     output_asm_insn (\"cmp/eq\t%0,%1\;or\t%1,%2\;bt\t0f\", operands);
+  else if (which_alternative == 1)
+     output_asm_insn (\"cmp/eq\t%0,%1\;or\t%0,%2\;bt\t0f\", operands);
+  else
+    output_asm_insn (\"cmp/eq\t%0,%1\;mov\t%0,%2\;bt\t0f\;or\t%1,%2\",
+		     operands);
+  return \"add\t%2,%2\;tst\t%2,%2\\n0:\";
+}"
+  [(set_attr "length" "10,10,12")])
+
+(define_insn "cmplesf_i1_finite"
+  [(set (reg:SI T_REG)
+	(le:SI (match_operand:SF 0 "arith_reg_operand" "r,r,r")
+	       (match_operand:SF 1 "arith_reg_operand" "r,r,r")))
+   (clobber (match_scratch:SI 2 "=0,1,r"))]
+  "TARGET_SH1 && ! TARGET_SH2E && flag_finite_math_only"
+  "*
+{
+  output_asm_insn (\"cmp/pz\t%0\", operands);
+  if (which_alternative == 2)
+    output_asm_insn (\"mov\t%0,%2\", operands);
+  if (TARGET_SH2)
+    output_asm_insn (\"bf/s\t0f\;cmp/hs\t%1,%0\;cmp/ge\t%0,%1\", operands);
+  else
+    output_asm_insn (\"bt\t1f\;bra\t0f\;cmp/hs\t%1,%0\\n1:\tcmp/ge\t%0,%1\",
+		     operands);
+  if (which_alternative == 1)
+    output_asm_insn (\"or\t%0,%2\", operands);
+  else
+    output_asm_insn (\"or\t%1,%2\", operands);
+  return \"bt\t0f\;add\t%2,%2\;tst\t%2,%2\\n0:\";
+}"
+  [(set_attr "length" "18,18,20")])
+
+(define_insn "cmpunsf_i1"
+  [(set (reg:SI T_REG)
+	(unordered:SI (match_operand:SF 0 "arith_reg_operand" "r,r")
+		      (match_operand:SF 1 "arith_reg_operand" "r,r")))
+   (use (match_operand:SI 2 "arith_reg_operand" "r,r"))
+   (clobber (match_scratch:SI 3 "=0,&r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "not\t%0,%3\;tst\t%2,%3\;not\t%1,%3\;bt\t0f\;tst\t%2,%3\;0:"
+  [(set_attr "length" "10")])
+
 (define_insn "cmpgtsf_t"
   [(set (reg:SI T_REG)
 	(gt:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
@@ -8684,7 +8976,7 @@ mov.l\\t1f,r0\\n\\
   [(set (reg:SI T_REG)
 	(compare (match_operand:SF 0 "arith_operand" "")
 		 (match_operand:SF 1 "arith_operand" "")))]
-  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH1 || TARGET_SHMEDIA_FPU"
   "
 {
   sh_compare_op0 = operands[0];
@@ -8779,11 +9071,44 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "fmove")
    (set_attr "fp_mode" "single")])
 
+(define_expand "abssc2"
+  [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
+	(abs:SF (match_operand:SC 1 "fp_arith_reg_operand" "")))]
+  "TARGET_SH3 && ! TARGET_SH2E"
+  "
+{
+  if (TARGET_SH3)
+    {
+      expand_sfunc_unop (SCmode, &gen_abssc2_i3, \"__hypotf\", ABS,
+			  operands);
+      DONE;
+    }
+  FAIL;
+}")
+
+(define_insn "abssc2_i3"
+  [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+	(abs:SF (reg:SC R4_REG)))
+   (clobber (reg:SI MACH_REG))
+   (clobber (reg:SI MACL_REG))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (clobber (reg:SI R4_REG))
+   (clobber (reg:SI R5_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH3 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_expand "adddf3"
   [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
 	(plus:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
 		 (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "TARGET_SH4 || TARGET_SHMEDIA_FPU || TARGET_SH3"
   "
 {
   if (TARGET_SH4)
@@ -8791,6 +9116,12 @@ mov.l\\t1f,r0\\n\\
       expand_df_binop (&gen_adddf3_i, operands);
       DONE;
     }
+  else if (TARGET_SH3)
+    {
+      expand_sfunc_binop (DFmode, &gen_adddf3_i3_wrap, \"__adddf3_\", PLUS,
+			  operands);
+      DONE;
+    }
 }")
 
 (define_insn "*adddf3_media"
@@ -8811,6 +9142,30 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "dfp_arith")
    (set_attr "fp_mode" "double")])
 
+(define_expand "adddf3_i3_wrap"
+  [(match_operand:DF 0 "" "") (match_operand:SI 1 "" "")]
+  "TARGET_SH3"
+  "
+{
+  emit_insn (gen_adddf3_i3 (operands[1]));
+  emit_move_insn (operands[0], gen_rtx_REG (DFmode, R0_REG));
+  DONE;
+}")
+
+(define_insn "adddf3_i3"
+  [(set (reg:DF R0_REG)
+	(plus:DF (reg:DF R4_REG) (reg:DF R6_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:DI R2_REG))
+   (clobber (reg:DF R4_REG))
+   (clobber (reg:DF R6_REG))
+   (use (match_operand:SI 0 "arith_reg_operand" "r"))]
+  "TARGET_SH3"
+  "jsr	@%0%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_expand "subdf3"
   [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
 	(minus:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
@@ -8847,7 +9202,7 @@ mov.l\\t1f,r0\\n\\
   [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
 	(mult:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
 		 (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "TARGET_SH4 || TARGET_SHMEDIA_FPU || TARGET_SH3"
   "
 {
   if (TARGET_SH4)
@@ -8855,6 +9210,12 @@ mov.l\\t1f,r0\\n\\
       expand_df_binop (&gen_muldf3_i, operands);
       DONE;
     }
+  else if (TARGET_SH3)
+    {
+      expand_sfunc_binop (DFmode, &gen_muldf3_i3_wrap, \"__muldf3_\", MULT,
+			  operands);
+      DONE;
+    }
 }")
 
 (define_insn "*muldf3_media"
@@ -8875,6 +9236,32 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "dfp_arith")
    (set_attr "fp_mode" "double")])
 
+(define_expand "muldf3_i3_wrap"
+  [(match_operand:DF 0 "" "") (match_operand:SI 1 "" "")]
+  "TARGET_SH3"
+  "
+{
+  emit_insn (gen_muldf3_i3 (operands[1]));
+  emit_move_insn (operands[0], gen_rtx_REG (DFmode, R0_REG));
+  DONE;
+}")
+
+(define_insn "muldf3_i3"
+  [(set (reg:DF R0_REG)
+	(mult:DF (reg:DF R4_REG) (reg:DF R6_REG)))
+   (clobber (reg:SI MACH_REG))
+   (clobber (reg:SI MACL_REG))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:DI R2_REG))
+   (clobber (reg:DF R4_REG))
+   (clobber (reg:DF R6_REG))
+   (use (match_operand:SI 0 "arith_reg_operand" "r"))]
+  "TARGET_SH3"
+  "jsr	@%0%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_expand "divdf3"
   [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
 	(div:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
@@ -9004,6 +9391,61 @@ mov.l\\t1f,r0\\n\\
 ;; 	      (use (match_dup 2))])
 ;;    (set (match_dup 0) (reg:SI FPUL_REG))])
 
+(define_insn "cmpnedf_i1"
+  [(set (match_operand:CC_FP_NE 0 "register_operand" "=z")
+	(compare:CC_FP_NE (reg:DF R4_REG) (reg:DF R6_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpgtdf_i1"
+  [(set (match_operand:CC_FP_GT 0 "register_operand" "=z")
+	(compare:CC_FP_GT (reg:DF R4_REG) (reg:DF R6_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH4"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpunltdf_i1"
+  [(set (match_operand:CC_FP_UNLT 0 "register_operand" "=z")
+	(compare:CC_FP_UNLT (reg:DF R4_REG) (reg:DF R6_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH4"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
+(define_insn "cmpeqdf_i1_finite"
+  [(set (reg:SI T_REG)
+	(eq:SI (match_operand:DF 0 "arith_reg_operand" "r")
+	       (match_operand:DF 1 "arith_reg_operand" "r")))
+   (clobber (match_scratch:SI 2 "=&r"))]
+  "TARGET_SH1 && ! TARGET_SH4 && flag_finite_math_only"
+  "cmp/eq\t%R0,%R1\;mov\t%S0,%2\;bf\t0f\;cmp/eq\t%S0,%S1\;bt\t0f\;or\t%S1,%2\;add\t%2,%2\;or\t%R0,%2\;tst\t%2,%2\\n0:"
+  [(set_attr "length" "18")])
+
+(define_insn "cmpundf_i1"
+  [(set (reg:SI T_REG)
+	(unordered:SI (match_operand:DF 0 "arith_reg_operand" "r,r")
+		      (match_operand:DF 1 "arith_reg_operand" "r,r")))
+   (use (match_operand:SI 2 "arith_reg_operand" "r,r"))
+   (clobber (match_scratch:SI 3 "=0,&r"))]
+  "TARGET_SH1 && ! TARGET_SH2E"
+  "not\t%S0,%3\;tst\t%2,%3\;not\t%S1,%3\;bt\t0f\;tst\t%2,%3\;0:"
+  [(set_attr "length" "10")])
+
 (define_insn "cmpgtdf_t"
   [(set (reg:SI T_REG)
 	(gt:SI (match_operand:DF 0 "arith_reg_operand" "f")
@@ -9071,7 +9513,7 @@ mov.l\\t1f,r0\\n\\
   [(set (reg:SI T_REG)
 	(compare (match_operand:DF 0 "arith_operand" "")
 		 (match_operand:DF 1 "arith_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "TARGET_SH1 || TARGET_SHMEDIA_FPU"
   "
 {
   sh_compare_op0 = operands[0];
@@ -9169,7 +9611,7 @@ mov.l\\t1f,r0\\n\\
 (define_expand "extendsfdf2"
   [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
 	(float_extend:DF (match_operand:SF 1 "fpul_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "TARGET_SH1 || TARGET_SHMEDIA_FPU"
   "
 {
   if (TARGET_SH4)
@@ -9178,6 +9620,12 @@ mov.l\\t1f,r0\\n\\
 					get_fpscr_rtx ()));
       DONE;
     }
+  else if (TARGET_SH1)
+    {
+      expand_sfunc_unop (SFmode, &gen_extendsfdf2_i1, \"__extendsfdf2_\",
+			 FLOAT_EXTEND, operands);
+      DONE;
+    }
 }")
 
 (define_insn "*extendsfdf2_media"
@@ -9196,10 +9644,43 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "fp")
    (set_attr "fp_mode" "double")])
 
+;; ??? In order to use this efficiently, we'd have to have an extra
+;; register class for r0 and r1 - and that would cause repercussions in
+;; register allocation elsewhere.  So just say we clobber r0 / r1, and
+;; that we can use an arbitrary target.
+(define_insn_and_split "extendsfdf2_i1"
+  [(set (match_operand:DF 0 "arith_reg_operand" "=r")
+	(float_extend:DF (reg:SF R4_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R0_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && !TARGET_SH4"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (reg:DF R0_REG))]
+  "emit_insn (gen_extendsfdf2_i1_r0 (operands[1]));"
+  [(set_attr "type" "sfunc")])
+
+(define_insn "extendsfdf2_i1_r0"
+  [(set (reg:DF R0_REG) (float_extend:DF (reg:SF R4_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (use (match_operand:SI 0 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && !TARGET_SH4"
+  "jsr	@%0%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_expand "truncdfsf2"
   [(set (match_operand:SF 0 "fpul_operand" "")
 	(float_truncate:SF (match_operand:DF 1 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "TARGET_SH1 || TARGET_SHMEDIA_FPU"
   "
 {
   if (TARGET_SH4)
@@ -9208,6 +9689,12 @@ mov.l\\t1f,r0\\n\\
 				       get_fpscr_rtx ()));
       DONE;
     }
+  else if (TARGET_SH1)
+    {
+      expand_sfunc_unop (DFmode, &gen_truncdfsf2_i1, \"__truncdfsf2_\",
+			 FLOAT_TRUNCATE, operands);
+      DONE;
+    }
 }")
 
 (define_insn "*truncdfsf2_media"
@@ -9225,6 +9712,21 @@ mov.l\\t1f,r0\\n\\
   "fcnvds  %1,%0"
   [(set_attr "type" "fp")
    (set_attr "fp_mode" "double")])
+
+(define_insn "truncdfsf2_i1"
+  [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+	(float_truncate:SF (reg:DF R4_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI R2_REG))
+   (clobber (reg:SI R3_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1 && ! TARGET_SH4"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 
 ;; Bit field extract patterns.  These give better code for packed bitfields,
 ;; because they allow auto-increment addresses to be generated.
Index: t-sh
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/t-sh,v
retrieving revision 1.16.30.1
diff -p -u -r1.16.30.1 t-sh
--- t-sh	16 Jun 2004 19:58:35 -0000	1.16.30.1
+++ t-sh	30 Sep 2004 18:59:42 -0000
@@ -1,6 +1,8 @@
 LIB1ASMSRC = sh/lib1funcs.asm
 LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movstr \
   _movstr_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+  _nesf2 _nedf2 _gtsf2t _gtdf2t _gesf2f _gedf2f _extendsfdf2 _truncdfsf2 \
+  _add_sub_sf3 _mulsf3 _hypotf _muldf3 _add_sub_df3 \
   $(LIB1ASMFUNCS_CACHE)
 
 # We want fine grained libraries, so use the new code to build the

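For illustration only (not part of the patch): with the above expanders in
place, code like the following, compiled for an FPU-less SH3 (e.g. -m3,
without SH2E), should expand through the new sfunc patterns (addsf3_i3,
muldf3_i3, cmpgtsf_i1, ...) and end up as jsr calls to the lib1funcs entry
points named in the patch (__addsf3_, __muldf3_, and the new compare
routines):

/* Illustrative example, not taken from the patch.  */
float  fadd (float a, float b)   { return a + b; }
double dmul (double a, double b) { return a * b; }
int    fgt  (float a, float b)   { return a > b; }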
