This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.
- From: Joern Rennecke <joern dot rennecke at superh dot com>
- To: kumar107 at rediffmail dot com
- Cc: kkojima at rr dot iij4u dot or dot jp, joern dot rennecke at superh dot com, gcc at gcc dot gnu dot org, aoliva at redhat dot com, amylaar at spamcop dot net
- Date: Wed, 4 Aug 2004 13:11:23 +0100 (BST)
- Subject: Re: RFC: Handling of libgcc symbols in SH shared libraries
> > ess registers
> > (and are faster, obviously).
>
> We, here at HCL, have implemented a hand-coded floating point library
> for single precision and double precision floating point arithmetic
> functions for SH architecture. The implementation conforms to IEEE-754
> standards and aims towards replacing the functions in [fd]p-bit.c.
> It could be seen as following of Pete
Are you saying you have implemented different rounding modes and
all the flags (inexact, overflow, invalid, underflow, ...)?
I have mostly kept to the current fp-bit functionality in terms of
features; however, I have made sure there is no double rounding for
denormals - IIRC fp-bit.c had some trouble with this.
Keeping within this feature set, and allowing for PIC code generation,
my priorities were to make the SH4 code fast, use fewer registers, and make
it compact, in that order.
> r's initiative for ARM. It will work for SH[1-4]. SH5 is not in scope.
>
> We are already through with the implementation and tested with release
> version on paranoia. It is working fine. The only thing remaining is
> benchmarking. It would be on the list in a few days.
>
> We don't want effort to be duplicated. Please make sure we are not
> working on same lines.
>
> Thanks and Best Regards,
> Rakesh Kumar
>
> Rakesh
I have appended what I have so far. I did some regression testing on most
of the code, but the hypotf implementation is still completely untested.
Its near_one code path is also unnecessary because of the rounding,
so another five instructions and four bytes of data can be saved.
Its 'tab' table could also be shared with single and double precision sqrt
by using one of three techniques:
- Put the table in a separate module, and load the address pc-relative
(static libraries only).
- Amalgamate the related functions into a single module (that makes most
sense in a dynamic library).
- Use separate sections and a linker script to group the related functions
together, and the table at the end (see the C sketch below).
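For illustration, the third variant could be sketched at the C level as
follows (hypothetical names; the real functions live in lib1funcs.asm and
would use .section directives instead, but the grouping idea is the same):

    #include <math.h>

    /* Code and table go into matching sections; a linker script rule like
         .text : { *(.text.fp_sqrt) *(.rodata.fp_sqrt) }
       then keeps the shared table within pc-relative reach of its users.  */
    __attribute__ ((section (".rodata.fp_sqrt")))
    static const signed char fp_sqrt_tab[96] = { -113, -84, -57 /* ... */ };

    __attribute__ ((section (".text.fp_sqrt")))
    float my_hypotf (float x, float y)
    {
      return sqrtf (x * x + y * y);   /* stand-in body */
    }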
Note that we need not only functions that implement the arithmetic, but
also expanders in sh.md / sh.c to take advantage of the lower register
usage and to enable some extra optimizations that work only on explicit
rtl operations, not on ordinary function calls.
The way fp-bit does comparisons is also encumbered with backwards
compatibility baggage from the time that there was only one comparison
function, and no proper NaN handling.
Let's see your code too so that we can see how we can integrate our work,
and what pieces are still useful to add or enhance.
Index: lib1funcs.asm
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/lib1funcs.asm,v
retrieving revision 1.36
diff -p -r1.36 lib1funcs.asm
*** lib1funcs.asm 12 Aug 2003 01:25:07 -0000 1.36
--- lib1funcs.asm 2 Aug 2004 03:57:36 -0000
*************** Boston, MA 02111-1307, USA. */
*** 37,42 ****
--- 37,44 ----
ELF local label prefixes by J"orn Rennecke
amylaar@cygnus.com */
+ #include "insn-constants.h"
+
#ifdef __ELF__
#define LOCAL(X) .L_##X
#define FUNC(X) .type X,@function
*************** Boston, MA 02111-1307, USA. */
*** 56,61 ****
--- 58,91 ----
#define FMOVD_WORKS
#endif
+ #ifdef __sh1__
+ #define SL(branch, dest, in_slot, in_slot_arg2) \
+ in_slot, in_slot_arg2; branch dest
+ #define SL_(branch, dest, in_slot) \
+ in_slot; branch dest
+ #define SLC(branch, dest, in_slot, in_slot_arg2) \
+ branch dest; in_slot, in_slot_arg2
+ #define SLI(in_slot, in_slot_arg2) in_slot, in_slot_arg2
+ #define SLCMP(branch, cmp1, cmp1arg2, cmp2, cmp2arg2) \
+ branch .+6; bra .+6; cmp2, cmp2arg2; cmp1, cmp1arg2
+ #else
+ #define SL(branch, dest, in_slot, in_slot_arg2) \
+ branch##/s dest; in_slot, in_slot_arg2
+ #define SL_(branch, dest, in_slot) \
+ branch##/s dest; in_slot
+ #define SLC(branch, dest, in_slot, in_slot_arg2) \
+ branch##/s dest; in_slot, in_slot_arg2
+ #define SLI(in_slot, in_slot_arg)
+ #define SLCMP(branch, cmp1, cmp1arg2, cmp2, cmp2arg2) \
+ branch##/s .+6; cmp1, cmp1arg2; cmp2, cmp2arg2
+ #endif
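+
+ /* For illustration: SL(bt, LOCAL(l), add #-8,r2) expands to
+ "add #-8,r2; bt LOCAL(l)" on SH1, which has no delay slots, and to
+ "bt/s LOCAL(l); add #-8,r2" on the later CPUs, where the insn in the
+ slot executes during the delayed branch. */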
+
+ #if defined (__sh1__) || defined (__sh2__) || defined (__SH2E__)
+ /* don't #define DYN_SHIFT */
+ #else
+ #define DYN_SHIFT 1
+ #endif
+
#if ! __SH5__
#ifdef L_ashiftrt
.global GLOBAL(ashiftrt_r4_0)
*************** GLOBAL(GCC_pop_shmedia_regs_nofpu):
*** 2873,2875 ****
--- 2903,4543 ----
ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
#endif /* __SH5__ == 32 */
#endif /* L_push_pop_shmedia_regs */
+
+ /* Floating-point emulation. We handle NANs, +-infinity, and +-zero.
+ However, we assume that for NANs, the topmost bit of the fraction is set. */
+ #ifdef L_nesf2
+ /* -ffinite-math-only inline version, T := r4:SF == r5:SF
+ cmp/eq r4,r5
+ mov r4,r0
+ bt 0f
+ or r5,r0
+ add r0,r0
+ tst r0,r0
+ 0: */
+ .global GLOBAL(nesf2_)
+ FUNC(GLOBAL(nesf2_))
+ GLOBAL(nesf2_):
+ /* If the raw values are unequal, the result is unequal, unless
+ both values are +-zero.
+ If the raw values are equal, the result is equal, unless
+ the values are nan. */
+ cmp/eq r4,r5
+ mov.l LOCAL(c_SF_NAN_MASK),r1
+ not r4,r0
+ bt LOCAL(check_nan)
+ mov r4,r0
+ or r5,r0
+ rts
+ add r0,r0
+ LOCAL(check_nan):
+ tst r1,r0
+ rts
+ movt r0
+ .balign 4
+ LOCAL(c_SF_NAN_MASK):
+ .long SF_NAN_MASK
+ ENDFUNC(GLOBAL(nesf2_))
+ #endif /* L_nesf2 */
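+
+ /* A C model of the sequence above (a sketch; a and b are the raw SFmode
+ bit patterns, and SF_NAN_MASK is the constant defined in sh.md). The
+ first return is nonzero unless both operands are +-0.0; the second
+ detects NaN through the quiet bit convention:
+
+ int nesf2_ (unsigned a, unsigned b)
+ {
+ if (a != b)
+ return (a | b) << 1;
+ return (~a & SF_NAN_MASK) == 0;
+ }
+ */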
+
+ #ifdef __LITTLE_ENDIAN__
+ #define DBL0L r4
+ #define DBL0H r5
+ #define DBL1L r6
+ #define DBL1H r7
+ #define DBLRL r0
+ #define DBLRH r1
+ #else
+ #define DBL0L r5
+ #define DBL0H r4
+ #define DBL1L r7
+ #define DBL1H r6
+ #define DBLRL r1
+ #define DBLRH r0
+ #endif
+
+ #ifdef L_nedf2
+ /* -ffinite-math-only -mb inline version, T := r4:DF == r6:DF
+ cmp/eq r5,r7
+ mov r4,r0
+ bf 0f
+ cmp/eq r4,r6
+ bt 0f
+ or r6,r0
+ add r0,r0
+ or r5,r0
+ tst r0,r0
+ 0: */
+ .global GLOBAL(nedf2_)
+ FUNC(GLOBAL(nedf2_))
+ GLOBAL(nedf2_):
+ cmp/eq DBL0L,DBL1L
+ mov.l LOCAL(c_DF_NAN_MASK),r1
+ bf LOCAL(ne)
+ cmp/eq DBL0H,DBL1H
+ not DBL0H,r0
+ bt LOCAL(check_nan)
+ mov DBL0H,r0
+ or DBL1H,r0
+ add r0,r0
+ rts
+ or DBL0L,r0
+ LOCAL(check_nan):
+ tst r1,r0
+ rts
+ movt r0
+ LOCAL(ne):
+ rts
+ mov #1,r0
+ .balign 4
+ LOCAL(c_DF_NAN_MASK):
+ .long DF_NAN_MASK
+ ENDFUNC(GLOBAL(nedf2_))
+ #endif /* L_nedf2 */
+
+ #ifdef L_unordsf2
+ .global GLOBAL(unordsf2_)
+ FUNC(GLOBAL(unordsf2_))
+ GLOBAL(unordsf2_):
+ mov.l LOCAL(c_SF_NAN_MASK),r1
+ not r4,r0
+ tst r1,r0
+ not r5,r0
+ bt LOCAL(unord)
+ tst r1,r0
+ LOCAL(unord):
+ rts
+ movt r0
+ .balign 4
+ LOCAL(c_SF_NAN_MASK):
+ .long SF_NAN_MASK
+ ENDFUNC(GLOBAL(unordsf2_))
+ #endif /* L_unordsf2 */
+
+ #ifdef L_unorddf2
+ .global GLOBAL(unorddf2_)
+ FUNC(GLOBAL(unorddf2_))
+ GLOBAL(unorddf2_):
+ mov.l LOCAL(c_DF_NAN_MASK),r1
+ not r4,r0
+ tst r1,r0
+ not r6,r0
+ bt LOCAL(unord)
+ tst r1,r0
+ LOCAL(unord):
+ rts
+ movt r0
+ .balign 4
+ LOCAL(c_DF_NAN_MASK):
+ .long DF_NAN_MASK
+ ENDFUNC(GLOBAL(unorddf2_))
+ #endif /* L_unorddf2 */
+
+ #if defined(L_gtsf2t) || defined(L_gtsf2t_trap)
+ /* -ffinite-math-only inline version, T := r4:SF > r5:SF ? 0 : 1
+ cmp/pz r4
+ mov r4,r0
+ bf/s 0f
+ cmp/hs r5,r4
+ cmp/ge r4,r5
+ or r5,r0
+ bt 0f
+ add r0,r0
+ tst r0,r0
+ 0: */
+ #ifdef L_gtsf2t
+ #define fun_label GLOBAL(gtsf2t)
+ #else
+ #define fun_label GLOBAL(gtsf2t_trap)
+ #endif
+ .global fun_label
+ FUNC(fun_label)
+ fun_label:
+ /* If the raw values compare greater, the result is true, unless
+ any of them is a nan (but infinity is fine), or both values are
+ +- zero. Otherwise, the result is false. */
+ mov.l LOCAL(c_SF_NAN_MASK),r1
+ cmp/pz r4
+ not r5,r0
+ SLC(bf, LOCAL(neg),
+ tst r1,r0)
+ mov r4,r0
+ bt LOCAL(nan)
+ cmp/gt r5,r4
+ SLC(bf, LOCAL(check_nan),
+ cmp/gt r4,r1)
+ bf LOCAL(nan)
+ or r5,r0
+ rts
+ add r0,r0
+ LOCAL(neg):
+ SLI(tst r1,r0)
+ bt LOCAL(nan)
+ not r4,r0
+ tst r1,r0
+ bt LOCAL(nan)
+ cmp/hi r4,r5
+ #if defined(L_gtsf2t) && defined(DELAYED_BRANCHES)
+ LOCAL(check_nan):
+ #endif /* DELAYED_BRANCHES */
+ rts
+ movt r0
+ #ifdef L_gtsf2t
+ LOCAL(check_nan):
+ LOCAL(nan):
+ rts
+ mov #0,r0
+ #else /* ! L_gtsf2t */
+ LOCAL(check_nan):
+ SLI(cmp/gt r4,r1)
+ bf LOCAL(nan)
+ rts
+ movt r0
+ LOCAL(nan):
+ mov #0,r0
+ trapa #0
+ #endif /* ! L_gtsf2t */
+ .balign 4
+ LOCAL(c_SF_NAN_MASK):
+ .long SF_NAN_MASK
+ ENDFUNC(fun_label)
+ #endif /* L_gtsf2t */
+
+ #if defined(L_gtdf2t) || defined(L_gtdf2t_trap)
+ #ifdef L_gtdf2t
+ #define fun_label GLOBAL(gtdf2t)
+ #else
+ #define fun_label GLOBAL(gtdf2t_trap)
+ #endif
+ .global fun_label
+ FUNC(fun_label)
+ fun_label:
+ /* If the raw values compare greater, the result is true, unless
+ any of them is a nan (but infinity is fine), or both values are
+ +- zero. Otherwise, the result is false. */
+ mov.l LOCAL(c_DF_NAN_MASK),r1
+ cmp/pz DBL0H
+ not DBL1H,r0
+ SLC(bf, LOCAL(neg),
+ tst r1,r0)
+ mov DBL0H,r0
+ bt LOCAL(nan) /* return zero if DBL1 is NAN. */
+ cmp/eq DBL1H,DBL0H
+ bt LOCAL(cmp_low)
+ cmp/gt DBL1H,DBL0H
+ or DBL1H,r0
+ SLC(bf, LOCAL(check_nan),
+ cmp/gt DBL0H,r1)
+ add r0,r0
+ bf LOCAL(nan) /* return zero if DBL0 is NAN. */
+ or DBL0L,r0
+ rts
+ or DBL1L,r0 /* non-zero unless both DBL0 and DBL1 are +-zero. */
+ LOCAL(cmp_low):
+ cmp/hi DBL1L,DBL0L
+ rts
+ movt r0
+ LOCAL(neg):
+ SLI(tst r1,r0)
+ bt LOCAL(nan) /* return zero if DBL1 is NAN. */
+ cmp/eq DBL1H,DBL0H
+ SLC(bt, LOCAL(neg_cmp_low),
+ cmp/hi DBL0L,DBL1L)
+ not r4,r0
+ tst r1,r0
+ bt LOCAL(nan) /* return zero if DBL0 is NAN. */
+ cmp/hi DBL0H,DBL1H
+ SLI(rts !,)
+ SLI(movt r0 !,)
+ LOCAL(neg_cmp_low):
+ SLI(cmp/hi DBL0L,DBL1L)
+ rts
+ movt r0
+ LOCAL(check_nan):
+ #ifdef L_gtdf2t
+ LOCAL(nan):
+ rts
+ mov #0,r0
+ #else
+ SLI(cmp/gt DBL0H,r1)
+ bf LOCAL(nan) /* return zero if DBL0 is NAN. */
+ rts
+ mov #0,r0
+ LOCAL(nan):
+ mov #0,r0
+ trapa #0
+ #endif
+ .balign 4
+ LOCAL(c_DF_NAN_MASK):
+ .long DF_NAN_MASK
+ ENDFUNC(fun_label)
+ #endif /* defined(L_gtdf2t) || defined(L_gtdf2t_trap) */
+
+ #if defined(L_gesf2f) || defined(L_gesf2f_trap)
+ /* -ffinite-math-only inline version, T := r4:SF >= r5:SF
+ cmp/pz r5
+ mov r4,r0
+ bf/s 0f
+ cmp/hs r4,r5
+ cmp/ge r5,r4
+ or r5,r0
+ bt 0f
+ add r0,r0
+ tst r0,r0
+ 0: */
+ #ifdef L_gesf2f
+ #define fun_label GLOBAL(gesf2f)
+ #else
+ #define fun_label GLOBAL(gesf2f_trap)
+ #endif
+ .global fun_label
+ FUNC(fun_label)
+ fun_label:
+ /* If the raw values compare greater or equal, the result is
+ true, unless any of them is a nan. If both are +-zero, the
+ result is true; otherwise, it is false.
+ We use 0 as true and nonzero as false for this function. */
+ mov.l LOCAL(c_SF_NAN_MASK),r1
+ cmp/pz r5
+ not r4,r0
+ SLC(bf, LOCAL(neg),
+ tst r1,r0)
+ mov r4,r0
+ bt LOCAL(nan)
+ cmp/gt r4,r5
+ SLC(bf, LOCAL(check_nan),
+ cmp/ge r1,r5)
+ bt LOCAL(nan)
+ or r5,r0
+ rts
+ add r0,r0
+ LOCAL(neg):
+ SLI(tst r1,r0)
+ bt LOCAL(nan)
+ not r5,r0
+ tst r1,r0
+ bt LOCAL(nan)
+ cmp/hi r5,r4
+ #if defined(L_gesf2f) && defined(DELAYED_BRANCHES)
+ LOCAL(nan): LOCAL(check_nan):
+ #endif
+ rts
+ movt r0
+ #if defined(L_gesf2f) && ! defined(DELAYED_BRANCHES)
+ LOCAL(check_nan):
+ cmp/ge r1,r5
+ LOCAL(nan):
+ rts
+ movt r0
+ #endif /* ! DELAYED_BRANCHES */
+ #ifdef L_gesf2f_trap
+ LOCAL(check_nan):
+ SLI(cmp/ge r1,r5)
+ bt LOCAL(nan)
+ rts
+ LOCAL(nan):
+ movt r0
+ trapa #0
+ #endif /* L_gesf2f_trap */
+ .balign 4
+ LOCAL(c_SF_NAN_MASK):
+ .long SF_NAN_MASK
+ ENDFUNC(fun_label)
+ #endif /* defined(L_gesf2f) || defined(L_gesf2f_trap) */
+
+ #ifdef L_gedf2f
+ .global GLOBAL(gedf2f)
+ FUNC(GLOBAL(gedf2f))
+ GLOBAL(gedf2f):
+ /* If the raw values compare greater or equal, the result is
+ true, unless any of them is a nan, or both are the
+ same infinity. If both are +-zero, the result is true;
+ otherwise, it is false.
+ We use 0 as true and nonzero as false for this function. */
+ mov.l LOCAL(c_DF_NAN_MASK),r1
+ cmp/pz DBL1H
+ not DBL0H,r0
+ SLC(bf, LOCAL(neg),
+ tst r1,r0)
+ mov DBL0H,r0
+ bt LOCAL(nan)
+ cmp/eq DBL0H,DBL1H
+ bt LOCAL(cmp_low)
+ cmp/gt DBL0H,DBL1H
+ or DBL1H,r0
+ SLC(bf, LOCAL(check_nan),
+ cmp/ge r1,DBL1H)
+ add r0,r0
+ bt LOCAL(nan)
+ or DBL0L,r0
+ rts
+ or DBL1L,r0
+ LOCAL(cmp_low):
+ cmp/hi DBL0L,DBL1L
+ #if defined(L_gedf2f) && defined(DELAYED_BRANCHES)
+ LOCAL(nan): LOCAL(check_nan):
+ #endif
+ rts
+ movt r0
+ #if defined(L_gedf2f) && ! defined(DELAYED_BRANCHES)
+ LOCAL(check_nan):
+ SLI(cmp/ge r1,DBL1H)
+ LOCAL(nan):
+ rts
+ movt r0
+ #elif defined(L_gedf2f_trap)
+ LOCAL(check_nan):
+ SLI(cmp/ge r1,DBL1H)
+ bt LOCAL(nan)
+ rts
+ LOCAL(nan):
+ movt r0
+ trapa #0
+ #endif /* L_gedf2f_trap */
+ LOCAL(neg):
+ SLI(tst r1,r0)
+ bt LOCAL(nan)
+ cmp/eq DBL0H,DBL1H
+ not DBL1H,r0
+ SLC(bt, LOCAL(neg_cmp_low),
+ cmp/hi DBL1L,DBL0L)
+ tst r1,r0
+ bt LOCAL(nan)
+ cmp/hi DBL1H,DBL0H
+ SLI(rts !,)
+ SLI(movt r0 !,)
+ LOCAL(neg_cmp_low):
+ SLI(cmp/hi DBL1L,DBL0L)
+ rts
+ movt r0
+ .balign 4
+ LOCAL(c_DF_NAN_MASK):
+ .long DF_NAN_MASK
+ ENDFUNC(GLOBAL(gedf2f))
+ #endif /* L_gedf2f */
+
+ #ifndef DYN_SHIFT /* Basic conversions for SH1 / SH2 */
+ #ifdef L_extendsfdf2
+ .global GLOBAL(extendsfdf2_)
+ FUNC(GLOBAL(extendsfdf2_))
+ GLOBAL(extendsfdf2_):
+ mov.l LOCAL(x7f800000),r3
+ mov r4,DBLRL
+ tst r3,r4
+ bt LOCAL(zero_denorm)
+ mov.l LOCAL(xe0000000),r2
+ rotr DBLRL
+ rotr DBLRL
+ rotr DBLRL
+ and r2,DBLRL
+ mov r4,DBLRH
+ not r4,r2
+ shll DBLRH
+ shlr2 DBLRH
+ shlr2 DBLRH
+ add DBLRH,DBLRH
+ rotcr DBLRH
+ tst r3,r2
+ bt LOCAL(inf_nan)
+ mov.l LOCAL(x38000000),r2
+ rts
+ add r2,DBLRH
+ LOCAL(inf_nan):
+ mov.l LOCAL(x70000000),r2
+ rts
+ add r2,DBLRH
+ LOCAL(zero_denorm):
+ mov.l r4,@-r15
+ add r4,r4
+ tst r4,r4
+ bt LOCAL(zero)
+ add r3,r3 /* 0xff000000 */
+ mov.l LOCAL(xb8000009),r2
+ LOCAL(shift_byte):
+ tst r3,r4
+ shll8 r4
+ SL(bt, LOCAL(shift_byte),
+ add #-8,r2)
+ LOCAL(shift_bit):
+ shll r4
+ SL(bf, LOCAL(shift_bit),
+ add #-1,r2)
+ mov r4,DBLRH
+ mov.l @r15+,r4
+ shlr8 DBLRH
+ shlr2 DBLRH
+ shlr DBLRH
+ rotcr DBLRL
+ cmp/pz r4
+ rotcr DBLRH
+ rotcr DBLRL
+ rts
+ add r2,DBLRH
+ LOCAL(zero):
+ mov.l @r15+,DBLRH
+ rts
+ mov #0,DBLRL
+ .balign 4
+ LOCAL(x7f800000):
+ .long 0x7f800000
+ LOCAL(x38000000):
+ .long 0x38000000
+ LOCAL(xe0000000):
+ .long 0xe0000000
+ LOCAL(x70000000):
+ .long 0x70000000
+ LOCAL(xb8000009):
+ /* Flip sign back, do exponent adjustment, and compensate for -8 / -1
+ adjustments in first shift loop iterations. */
+ .long 0x80000000 + 0x38000000 + 9
+ ENDFUNC(GLOBAL(extendsfdf2_))
+ #endif /* L_extendsfdf2 */
+
+ #ifdef L_truncdfsf2
+ .global GLOBAL(truncdfsf2_)
+ FUNC(GLOBAL(truncdfsf2_))
+ GLOBAL(truncdfsf2_):
+ mov.l LOCAL(x38000000),r3 ! exponent adjustment DF -> SF
+ mov DBL0H,r1
+ mov.l LOCAL(x70000000),r2 ! mask for out-of-range exponent bits
+ mov DBL0H,r0
+ mov.l DBL0L,@-r15
+ sub r3,r1
+ tst r2,r1
+ shll8 r0 !
+ shll2 r0 ! Isolate highpart fraction.
+ shll2 r0 !
+ bf LOCAL(ill_exp)
+ shll2 r1
+ mov.l LOCAL(x2fffffff),r2 /* Fraction lsb | lower guard bits. */
+ shll2 r1
+ mov.l LOCAL(xff000000),r3
+ shlr8 r0
+ tst r2,DBL0L /* Check if msb guard bit wants rounding up. */
+ shlr16 DBL0L
+ shlr8 DBL0L
+ shlr2 DBL0L
+ SL_(bt, LOCAL(add_frac),
+ shlr2 DBL0L)
+ add #1,DBL0L
+ LOCAL(add_frac):
+ add DBL0L,r0
+ mov.l LOCAL(x01000000),r2
+ and r3,r1
+ mov.l @r15+,DBL0L
+ add r1,r0
+ tst r3,r0
+ bt LOCAL(inf_denorm0)
+ cmp/hs r3,r0
+ LOCAL(denorm_noup_sh1):
+ bt LOCAL(inf)
+ div0s DBL0H,r2 /* copy orig. sign into T. */
+ rts
+ rotcr r0
+ LOCAL(inf_denorm0): ! We might need to undo previous rounding.
+ mov.l LOCAL(x2fffffff),r3 /* Old fraction lsb | lower guard bits. */
+ tst r1,r1
+ bf LOCAL(inf)
+ add #-1,r0
+ tst r3,DBL0L /* Check if msb guard bit was rounded up. */
+ mov.l LOCAL(x5fffffff),r3 /* Fraction lsb | lower guard bits. */
+ addc r2,r0
+ shlr r0
+ tst r3,DBL0L /* Check if msb guard bit wants rounding up. */
+ #ifdef DELAYED_BRANCHES
+ bt/s LOCAL(denorm_noup)
+ #else
+ bt LOCAL(denorm_noup_sh1)
+ #endif
+ div0s DBL0H,r2 /* copy orig. sign into T. */
+ add #1,r0
+ LOCAL(denorm_noup):
+ rts
+ rotcr r0
+ LOCAL(ill_exp):
+ div0s DBL0H,r1
+ mov.l LOCAL(x7ff80000),r2
+ add r1,r1
+ bf LOCAL(inf_nan)
+ mov.w LOCAL(m32),r3 /* Handle denormal or zero. */
+ shlr16 r1
+ exts.w r1,r1
+ shll2 r1
+ add r1,r1
+ shlr8 r1
+ exts.w r1,r1
+ add #-8,r1 /* Go from 9 to 1 guard bit in MSW. */
+ cmp/gt r3,r1
+ mov.l @r15+,r3 /* DBL0L */
+ bf LOCAL(zero)
+ mov.l DBL0L, @-r15
+ shll8 DBL0L
+ rotcr r0 /* Insert leading 1. */
+ shlr16 r3
+ shll2 r3
+ add r3,r3
+ shlr8 r3
+ cmp/pl DBL0L /* Check lower 23 guard bits if guard bit 23 is 0. */
+ addc r3,r0 /* Assemble fraction with compressed guard bits. */
+ mov.l @r15+,DBL0L
+ mov #0,r2
+ neg r1,r1
+ LOCAL(denorm_loop):
+ shlr r0
+ rotcl r2
+ dt r1
+ bf LOCAL(denorm_loop)
+ tst #2,r0
+ rotcl r0
+ tst r2,r2
+ rotcl r0
+ xor #3,r0
+ add #3,r0 /* Even overflow gives the correct result. */
+ shlr2 r0
+ div0s r0,DBL0H
+ rts
+ rotcr r0
+ LOCAL(zero):
+ mov #0,r0
+ div0s r0,DBL0H
+ rts
+ rotcr r0
+ LOCAL(inf_nan):
+ not DBL0H,r0
+ tst r2,r0
+ mov.l @r15+,DBL0L
+ bf LOCAL(inf)
+ rts
+ mov #-1,r0 /* NAN */
+ LOCAL(inf): /* r2 must be positive here. */
+ mov.l LOCAL(xffe00000),r0
+ div0s r2,DBL0H
+ rts
+ rotcr r0
+ LOCAL(m32):
+ .word -32
+ .balign 4
+ LOCAL(x38000000):
+ .long 0x38000000
+ LOCAL(x70000000):
+ .long 0x70000000
+ LOCAL(x2fffffff):
+ .long 0x2fffffff
+ LOCAL(x01000000):
+ .long 0x01000000
+ LOCAL(xff000000):
+ .long 0xff000000
+ LOCAL(x5fffffff):
+ .long 0x5fffffff
+ LOCAL(x7ff80000):
+ .long 0x7ff80000
+ LOCAL(xffe00000):
+ .long 0xffe00000
+ ENDFUNC(GLOBAL(truncdfsf2_))
+ #endif /* L_truncdfsf2 */
+ #endif /* ! DYN_SHIFT */
+
+ /* The actual arithmetic uses dynamic shift. Supporting SH1 / SH2 here would
+ make this code too hard to maintain, so if you want to add SH1 / SH2
+ support, do it in a separate copy. */
+ #ifdef DYN_SHIFT
+ #ifdef L_extendsfdf2
+ .global GLOBAL(extendsfdf2_)
+ FUNC(GLOBAL(extendsfdf2_))
+ GLOBAL(extendsfdf2_):
+ mov.l LOCAL(x7f800000),r2
+ mov #29,r3
+ mov r4,DBLRL
+ not r4,DBLRH
+ tst r2,r4
+ shld r3,DBLRL
+ bt LOCAL(zero_denorm)
+ mov #-3,r3
+ tst r2,DBLRH
+ mov r4,DBLRH
+ bt/s LOCAL(inf_nan)
+ shll DBLRH
+ shld r3,DBLRH
+ mov.l LOCAL(x38000000),r2
+ rotcr DBLRH
+ rts
+ add r2,DBLRH
+ .balign 4
+ LOCAL(inf_nan):
+ shld r2,DBLRH
+ mov.l LOCAL(x70000000),r2
+ rotcr DBLRH
+ rts
+ add r2,DBLRH
+ LOCAL(zero_denorm):
+ mov.l r4,@-r15
+ add r4,r4
+ tst r4,r4
+ extu.w r4,r2
+ bt LOCAL(zero)
+ cmp/eq r4,r2
+ extu.b r4,r1
+ bf/s LOCAL(three_bytes)
+ mov.l LOCAL(c__clz_tab),r0
+ cmp/eq r4,r1
+ mov #22,DBLRH
+ bt LOCAL(one_byte)
+ shlr8 r2
+ mov #14,DBLRH
+ LOCAL(one_byte):
+ #ifdef __pic__
+ add r0,r2
+ mova LOCAL(c__clz_tab),r0
+ #endif
+ mov.b @(r0,r2),r2
+ mov #21,r3
+ mov.w LOCAL(x0),DBLRL
+ sub r2,DBLRH
+ LOCAL(norm_shift):
+ shld DBLRH,r4
+ mov.l @r15+,r2
+ shld r3,DBLRH
+ mov.l LOCAL(xb7ffffff),r3
+ add r4,DBLRH
+ cmp/pz r2
+ mov r2,r4
+ rotcr DBLRH
+ rts
+ add r3,DBLRH
+ LOCAL(three_bytes):
+ mov r4,r2
+ shlr16 r2
+ #ifdef __pic__
+ add r0,r2
+ mova LOCAL(c__clz_tab),r0
+ #endif
+ mov.b @(r0,r2),r2
+ mov #21,r3
+ mov #6-32,DBLRH
+ sub r2,DBLRH
+ mov r4,DBLRL
+ shld DBLRH,DBLRL
+ bra LOCAL(norm_shift)
+ add #32,DBLRH
+ LOCAL(zero):
+ rts /* DBLRL has already been zeroed above. */
+ mov.l @r15+,DBLRH
+ LOCAL(x0):
+ .word 0
+ .balign 4
+ LOCAL(x7f800000):
+ .long 0x7f800000
+ LOCAL(x38000000):
+ .long 0x38000000
+ LOCAL(x70000000):
+ .long 0x70000000
+ LOCAL(xb7ffffff):
+ /* Flip sign back, do exponent adjustment, and remove leading one. */
+ .long 0x80000000 + 0x38000000 - 1
+ LOCAL(c__clz_tab):
+ #ifdef __pic__
+ .long GLOBAL(clz_tab) - .
+ #else
+ .long GLOBAL(clz_tab)
+ #endif
+ ENDFUNC(GLOBAL(extendsfdf2_))
+ #endif /* L_extendsfdf2 */
+
+ #ifdef L_truncdfsf2
+ .global GLOBAL(truncdfsf2_)
+ FUNC(GLOBAL(truncdfsf2_))
+ GLOBAL(truncdfsf2_):
+ mov.l LOCAL(x38000000),r3
+ mov DBL0H,r1
+ mov.l LOCAL(x70000000),r2
+ mov DBL0H,r0
+ sub r3,r1
+ mov.l DBL0L,@-r15
+ tst r2,r1
+ mov #12,r3
+ shld r3,r0 ! Isolate highpart fraction.
+ bf LOCAL(ill_exp)
+ shll2 r1
+ mov.l LOCAL(x2fffffff),r2 /* Fraction lsb | lower guard bits. */
+ shll2 r1
+ mov.l LOCAL(xff000000),r3
+ shlr8 r0
+ tst r2,DBL0L /* Check if msb guard bit wants rounding up. */
+ mov #-28,r2
+ bt/s LOCAL(add_frac)
+ shld r2,DBL0L
+ add #1,DBL0L
+ LOCAL(add_frac):
+ add DBL0L,r0
+ mov.l LOCAL(x01000000),r2
+ and r3,r1
+ mov.l @r15+,DBL0L
+ add r1,r0
+ tst r3,r0
+ bt LOCAL(inf_denorm0)
+ #if 0 // No point checking overflow -> infinity if we don't raise a signal.
+ cmp/hs r3,r0
+ bt LOCAL(inf)
+ #endif
+ div0s DBL0H,r2 /* copy orig. sign into T. */
+ rts
+ rotcr r0
+ LOCAL(inf_denorm0): ! We might need to undo previous rounding.
+ mov.l LOCAL(x2fffffff),r3 /* Old fraction lsb | lower guard bits. */
+ tst r1,r1
+ bf LOCAL(inf)
+ add #-1,r0
+ tst r3,DBL0L /* Check if msb guard bit was rounded up. */
+ mov.l LOCAL(x5fffffff),r3 /* Fraction lsb | lower guard bits. */
+ addc r2,r0
+ shlr r0
+ tst r3,DBL0L /* Check if msb guard bit wants rounding up. */
+ bt/s LOCAL(denorm_noup)
+ div0s DBL0H,r2 /* copy orig. sign into T. */
+ add #1,r0
+ LOCAL(denorm_noup):
+ rts
+ rotcr r0
+ LOCAL(ill_exp):
+ div0s DBL0H,r1
+ mov.l LOCAL(x7ff80000),r2
+ add r1,r1
+ bf LOCAL(inf_nan)
+ mov.w LOCAL(m32),r3 /* Handle denormal or zero. */
+ mov #-21,r2
+ shad r2,r1
+ add #-8,r1 /* Go from 9 to 1 guard bit in MSW. */
+ cmp/gt r3,r1
+ mov.l @r15+,r3 /* DBL0L */
+ bf LOCAL(zero)
+ mov.l DBL0L, @-r15
+ shll8 DBL0L
+ rotcr r0 /* Insert leading 1. */
+ shld r2,r3
+ cmp/pl DBL0L /* Check lower 23 guard bits if guard bit 23 is 0. */
+ addc r3,r0 /* Assemble fraction with compressed guard bits. */
+ mov r0,r2
+ shld r1,r0
+ mov.l @r15+,DBL0L
+ add #32,r1
+ shld r1,r2
+ tst #2,r0
+ rotcl r0
+ tst r2,r2
+ rotcl r0
+ xor #3,r0
+ add #3,r0 /* Even overflow gives the correct result. */
+ shlr2 r0
+ div0s r0,DBL0H
+ rts
+ rotcr r0
+ LOCAL(zero):
+ mov #0,r0
+ div0s r0,DBL0H
+ rts
+ rotcr r0
+ LOCAL(inf_nan):
+ not DBL0H,r0
+ tst r2,r0
+ mov.l @r15+,DBL0L
+ bf LOCAL(inf)
+ rts
+ mov #-1,r0 /* NAN */
+ LOCAL(inf): /* r2 must be positive here. */
+ mov.l LOCAL(xffe00000),r0
+ div0s r2,DBL0H
+ rts
+ rotcr r0
+ LOCAL(m32):
+ .word -32
+ .balign 4
+ LOCAL(x38000000):
+ .long 0x38000000
+ LOCAL(x70000000):
+ .long 0x70000000
+ LOCAL(x2fffffff):
+ .long 0x2fffffff
+ LOCAL(x01000000):
+ .long 0x01000000
+ LOCAL(xff000000):
+ .long 0xff000000
+ LOCAL(x5fffffff):
+ .long 0x5fffffff
+ LOCAL(x7ff80000):
+ .long 0x7ff80000
+ LOCAL(xffe00000):
+ .long 0xffe00000
+ ENDFUNC(GLOBAL(truncdfsf2_))
+ #endif /* L_truncdfsf2 */
+
+ #ifdef L_add_sub_sf3
+ .global GLOBAL(subsf3_)
+ FUNC(GLOBAL(subsf3_))
+ .global GLOBAL(addsf3_)
+ FUNC(GLOBAL(addsf3_))
+ GLOBAL(subsf3_):
+ cmp/pz r5
+ add r5,r5
+ rotcr r5
+ GLOBAL(addsf3_):
+ mov.l LOCAL(x7f800000),r3
+ mov r4,r6
+ add r6,r6
+ mov r5,r7
+ add r7,r7
+ mov r4,r0
+ or r3,r0
+ cmp/hi r6,r7
+ mov r5,r1
+ bf/s LOCAL(r4_hs)
+ or r3,r1
+ cmp/eq r5,r1
+ bt LOCAL(ret_r5) /* sole Inf or NaN, return unchanged. */
+ shll8 r0
+ tst r6,r6
+ shll8 r1
+ mov #-24,r2
+ bt LOCAL(denorm_r4)
+ LOCAL(denorm_r4_done):
+ mov r6,r3
+ shld r2,r3
+ mov r7,r6
+ shld r2,r6
+ sub r6,r3
+ mov r0,r7
+ shld r3,r0 /* Get 31 upper bits. */
+ mov.l LOCAL(xff000000),r2
+ add #31,r3
+ mov.l r5,@-r15 ! push result sign.
+ cmp/pl r3
+ shld r3,r7
+ bf LOCAL(ret_stack)
+ div0s r4,r5
+ bf/s LOCAL(add)
+ cmp/pl r7 /* Is LSB in r0 clear, but any lower guard bit set? */
+ subc r0,r1
+ mov.l LOCAL(c__clz_tab),r7
+ tst r2,r1
+ mov #-24,r3
+ bf/s LOCAL(norm_r0)
+ mov r1,r0
+ extu.w r1,r1
+ bra LOCAL(norm_check2)
+ cmp/eq r0,r1
+ LOCAL(ret_r5):
+ rts
+ mov r5,r0
+ LOCAL(ret_stack):
+ rts
+ mov.l @r15+,r0
+
+ /* We leave the numbers denormalized, but we change the bit position to be
+ consistent with normalized numbers. This also removes the spurious
+ leading one that was inserted before. */
+ LOCAL(denorm_r4):
+ tst r7,r7
+ add r0,r0
+ bf LOCAL(denorm_r4_done)
+ bra LOCAL(denorm_r4_done)
+ add r1,r1
+ LOCAL(denorm_r5):
+ tst r6,r6
+ add r1,r1
+ bf LOCAL(denorm_r5_done)
+ clrt
+ bra LOCAL(denorm_r5_done)
+ add r0,r0
+
+ /* If the exponent differs by two or more, normalization is minimal, and
+ few guard bits are needed for an exact final result, so sticky guard
+ bit compression before subtraction (or add) works fine.
+ If the exponent differs by one, only one extra guard bit is generated,
+ and effectively no guard bit compression takes place. */
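+
+ /* In C terms, the sticky compression amounts to (a sketch; frac is the
+ unsigned fraction and n > 1 the alignment shift count):
+
+ lost = frac & (((unsigned) 1 << n) - 1);
+ frac = (frac >> n) | (lost != 0);
+
+ i.e. every shifted-out bit collapses into a single sticky lsb. */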
+
+ LOCAL(r4_hs):
+ cmp/eq r4,r0
+ shll8 r0
+ bt LOCAL(inf_nan_arg0)
+ shll8 r1
+ mov #-24,r2
+ tst r7,r7
+ shld r2,r7
+ bt LOCAL(denorm_r5)
+ LOCAL(denorm_r5_done):
+ mov r1,r3
+ shld r2,r6
+ subc r6,r7
+ mov.l LOCAL(xff000000),r2
+ bf LOCAL(same_exp)
+ shld r7,r1 /* Get 31 upper bits. */
+ add #31,r7
+ mov.l r4,@-r15 ! push result sign.
+ cmp/pl r7
+ shld r7,r3
+ bf LOCAL(ret_stack)
+ div0s r4,r5
+ bf/s LOCAL(add)
+ cmp/pl r3 /* Is LSB in r1 clear, but any lower guard bit set? */
+ subc r1,r0
+ mov.l LOCAL(c__clz_tab),r7
+ LOCAL(norm_check):
+ tst r2,r0
+ mov #-24,r3
+ bf LOCAL(norm_r0)
+ extu.w r0,r1
+ cmp/eq r0,r1
+ LOCAL(norm_check2):
+ mov #-8,r3
+ bt LOCAL(norm_r0)
+ mov #-16,r3
+ LOCAL(norm_r0):
+ mov r0,r1
+ shld r3,r0
+ #ifdef __pic__
+ add r0,r7
+ mova LOCAL(c__clz_tab),r0
+ #endif
+ mov.b @(r0,r7),r7
+ add #25,r3
+ add #-9+1,r6
+ mov r1,r0
+ sub r7,r3
+ mov.l LOCAL(xbfffffff),r7
+ sub r3,r6 /* generate exp-1 */
+ mov.w LOCAL(d24),r2
+ cmp/pz r6 /* check exp > 0 */
+ shld r3,r0 /* Leading 1 becomes +1 exp adjustment. */
+ bf LOCAL(zero_denorm)
+ LOCAL(denorm_done):
+ add #30,r3
+ shld r3,r1
+ mov.w LOCAL(m1),r3
+ tst r7,r1 ! clear T if rounding up
+ shld r2,r6
+ subc r3,r0 ! round - overflow will boost exp adjustment to 2.
+ mov.l @r15+,r2
+ add r6,r0 ! overflow will generate inf
+ cmp/ge r2,r3 ! get sign into T
+ rts
+ rotcr r0
+ LOCAL(ret_r4):
+ rts
+ mov r4,r0
+
+ /* At worst, we are shifting the number back in place where an incoming
+ denormal was. Thus, the shifts won't get out of range. They still
+ might generate a zero fraction, but that's OK, that makes it 0. */
+ LOCAL(zero_denorm):
+ add r6,r3
+ mov r1,r0
+ mov #0,r6 /* leading one will become free (except for rounding) */
+ bra LOCAL(denorm_done)
+ shld r3,r0
+
+ /* Handle abs(r4) >= abs(r5), same exponents specially so we don't need
+ check for a zero fraction in the main path. */
+ LOCAL(same_exp):
+ div0s r4,r5
+ mov.l r4,@-r15
+ bf LOCAL(add)
+ cmp/eq r1,r0
+ mov.l LOCAL(c__clz_tab),r7
+ bf/s LOCAL(norm_check)
+ sub r1,r0
+ mov.l @r15+,r1
+ cmp/gt r4,r0 ! copy sign
+ rts
+ rotcr r0
+
+ /* r2: 0xff000000 */
+ LOCAL(add):
+ addc r1,r0
+ mov.w LOCAL(x2ff),r7
+ shll8 r6
+ bf/s LOCAL(no_carry)
+ shll16 r6
+ tst r7,r0
+ shlr8 r0
+ mov.l @r15+,r3 ! discard saved sign
+ subc r2,r0
+ sett
+ addc r6,r0
+ cmp/hs r2,r0
+ bt/s LOCAL(inf)
+ div0s r7,r4 /* Copy sign. */
+ rts
+ rotcr r0
+ LOCAL(inf):
+ mov r6,r0
+ rts
+ rotcr r0
+ LOCAL(no_carry):
+ mov.w LOCAL(m1),r3
+ shll r0
+ bf LOCAL(denorm_add)
+ tst r7,r0
+ shlr8 r0
+ mov.l @r15+,r1 ! discard saved sign
+ subc r3,r0 ! round ; overflow -> exp++
+ cmp/ge r4,r3 /* Copy sign. */
+ add r6,r0 ! overflow -> inf
+ rts
+ rotcr r0
+
+ LOCAL(denorm_add):
+ shlr r0
+ cmp/ge r4,r3 /* Copy sign. */
+ shlr8 r0
+ mov.l @r15+,r1 ! discard saved sign
+ rts
+ rotcr r0
+
+ LOCAL(inf_nan_arg0):
+ cmp/eq r5,r1
+ bf LOCAL(ret_r4)
+ div0s r4,r5 /* Both are inf or NaN, check signs. */
+ bt LOCAL(ret_nan) /* inf - inf, or NaN. */
+ mov r4,r0 ! same sign; return NaN if either is NaN.
+ rts
+ or r5,r0
+ LOCAL(ret_nan):
+ rts
+ mov #-1,r0
+
+ LOCAL(d24):
+ .word 24
+ LOCAL(x2ff):
+ .word 0x2ff
+ LOCAL(m1):
+ .word -1
+ .balign 4
+ LOCAL(x7f800000):
+ .long 0x7f800000
+ LOCAL(xbfffffff):
+ .long 0xbfffffff
+ LOCAL(xff000000):
+ .long 0xff000000
+ LOCAL(xfe000000):
+ .long 0xfe000000
+ LOCAL(c__clz_tab):
+ #ifdef __pic__
+ .long GLOBAL(clz_tab) - .
+ #else
+ .long GLOBAL(clz_tab)
+ #endif
+
+ ENDFUNC(GLOBAL(addsf3_))
+ ENDFUNC(GLOBAL(subsf3_))
+ #endif /* L_add_sub_sf3 */
+
+ #ifdef L_mulsf3
+ .global GLOBAL(mulsf3_)
+ FUNC(GLOBAL(mulsf3_))
+ GLOBAL(mulsf3_):
+ mov.l LOCAL(x7f800000),r1
+ not r4,r2
+ mov r4,r3
+ not r5,r0
+ tst r1,r2
+ or r1,r3
+ bt/s LOCAL(inf_nan_arg0)
+ tst r1,r0
+ bt LOCAL(inf_nan_arg1)
+ tst r1,r5
+ mov r1,r2
+ shll8 r3
+ or r5,r1
+ bt/s LOCAL(zero_denorm_arg1)
+ shll8 r1
+ tst r2,r4
+ bt LOCAL(zero_denorm_arg0)
+ dmulu.l r3,r1
+ mov r4,r0
+ and r2,r0
+ LOCAL(arg_norm):
+ and r5,r2
+ mov.l LOCAL(x3f800000),r3
+ sts mach,r1
+ sub r3,r0
+ sts macl,r3
+ add r2,r0
+ cmp/pz r1
+ mov.w LOCAL(x100),r2
+ bf/s LOCAL(norm_frac)
+ tst r3,r3
+ shll2 r1 /* Shift one up, replace leading 1 with 0. */
+ shlr r1
+ tst r3,r3
+ LOCAL(norm_frac):
+ mov.w LOCAL(mx80),r3
+ bf LOCAL(round_frac)
+ tst r2,r1
+ LOCAL(round_frac):
+ mov.l LOCAL(xff000000),r2
+ subc r3,r1 /* Even overflow gives right result: exp++, frac=0. */
+ shlr8 r1
+ add r1,r0
+ shll r0
+ bt LOCAL(ill_exp)
+ tst r2,r0
+ bt LOCAL(denorm0)
+ cmp/hs r2,r0
+ bt LOCAL(inf)
+ LOCAL(insert_sign):
+ div0s r4,r5
+ rts
+ rotcr r0
+ LOCAL(denorm0):
+ sub r2,r0
+ bra LOCAL(insert_sign)
+ shlr r0
+ LOCAL(zero_denorm_arg1):
+ mov.l LOCAL(x60000000),r2 /* Check exp0 >= -64 */
+ add r1,r1
+ tst r1,r1 /* arg1 == 0 ? */
+ mov #0,r0
+ bt LOCAL(insert_sign) /* argument 1 is zero ==> return 0 */
+ tst r4,r2
+ bt LOCAL(insert_sign) /* exp0 < -64 ==> return 0 */
+ mov.l LOCAL(c__clz_tab),r0
+ mov r3,r2
+ mov r1,r3
+ bra LOCAL(arg_normalize)
+ mov r2,r1
+ LOCAL(zero_denorm_arg0):
+ mov.l LOCAL(x60000000),r2 /* Check exp1 >= -64 */
+ add r3,r3
+ tst r3,r3 /* arg0 == 0 ? */
+ mov #0,r0
+ bt LOCAL(insert_sign) /* argument 0 is zero ==> return 0 */
+ tst r5,r2
+ bt LOCAL(insert_sign) /* exp1 < -64 ==> return 0 */
+ mov.l LOCAL(c__clz_tab),r0
+ LOCAL(arg_normalize):
+ mov.l r7,@-r15
+ extu.w r3,r7
+ cmp/eq r3,r7
+ mov.l LOCAL(xff000000),r7
+ mov #-8,r2
+ bt 0f
+ tst r7,r3
+ mov #-16,r2
+ bt 0f
+ mov #-24,r2
+ 0:
+ mov r3,r7
+ shld r2,r7
+ #ifdef __pic__
+ add r0,r7
+ mova LOCAL(c__clz_tab),r0
+ #endif
+ mov.b @(r0,r7),r0
+ add #32,r2
+ mov r2,r7
+ mov #23,r2
+ sub r0,r7
+ mov.l LOCAL(x7f800000),r0
+ shld r7,r3
+ shld r2,r7
+ mov r0,r2
+ and r4,r0
+ sub r7,r0
+ mov.l @r15+,r7
+ bra LOCAL(arg_norm)
+ dmulu.l r3,r1
+ #if 0 /* This is slightly slower, but could be used if table lookup causes
+ cache thrashing. */
+ bt LOCAL(insert_sign) /* exp1 < -64 ==> return 0 */
+ mov.l LOCAL(xff000000),r2
+ mov r4,r0
+ LOCAL(arg_normalize):
+ tst r2,r3
+ bf LOCAL(arg_bit_norm)
+ LOCAL(arg_byte_loop):
+ tst r2,r3
+ add r2,r0
+ shll8 r3
+ bt LOCAL(arg_byte_loop)
+ add r4,r0
+ LOCAL(arg_bit_norm):
+ mov.l LOCAL(x7f800000),r2
+ rotl r3
+ LOCAL(arg_bit_loop):
+ add r2,r0
+ bf/s LOCAL(arg_bit_loop)
+ rotl r3
+ rotr r3
+ rotr r3
+ sub r2,r0
+ bra LOCAL(arg_norm)
+ dmulu.l r3,r1
+ #endif /* 0 */
+ LOCAL(inf):
+ bra LOCAL(insert_sign)
+ mov r2,r0
+ LOCAL(inf_nan_arg0):
+ bt LOCAL(inf_nan_both)
+ add r0,r0
+ cmp/eq #-1,r0 /* arg1 zero? -> NAN */
+ bt LOCAL(insert_sign)
+ mov r4,r0
+ LOCAL(inf_insert_sign):
+ bra LOCAL(insert_sign)
+ add r0,r0
+ LOCAL(inf_nan_both):
+ mov r4,r0
+ bra LOCAL(inf_insert_sign)
+ or r5,r0
+ LOCAL(inf_nan_arg1):
+ mov r2,r0
+ add r0,r0
+ cmp/eq #-1,r0 /* arg0 zero? */
+ bt LOCAL(insert_sign)
+ bra LOCAL(inf_insert_sign)
+ mov r5,r0
+ LOCAL(ill_exp):
+ cmp/pz r0
+ mov #-24,r3
+ bt LOCAL(inf)
+ add r1,r1
+ mov r0,r2
+ sub r1,r2 ! remove fraction to get back pre-rounding exponent.
+ sts mach,r0
+ sts macl,r1
+ shad r3,r2
+ mov r0,r3
+ shld r2,r0
+ add #32,r2
+ cmp/pz r2
+ shld r2,r3
+ bf LOCAL(zero)
+ or r1,r3
+ mov #-1,r1
+ tst r3,r3
+ mov.w LOCAL(x100),r3
+ bf/s LOCAL(denorm_round_up)
+ mov #-0x80,r1
+ tst r3,r0
+ LOCAL(denorm_round_up):
+ mov #-7,r3
+ subc r1,r0
+ bra LOCAL(insert_sign)
+ shld r3,r0
+ LOCAL(zero):
+ bra LOCAL(insert_sign)
+ mov #0,r0
+ LOCAL(x100):
+ .word 0x100
+ LOCAL(mx80):
+ .word -0x80
+ .balign 4
+ LOCAL(x7f800000):
+ .long 0x7f800000
+ LOCAL(x3f800000):
+ .long 0x3f800000
+ LOCAL(xff000000):
+ .long 0xff000000
+ LOCAL(x60000000):
+ .long 0x60000000
+ LOCAL(c__clz_tab):
+ #ifdef __pic__
+ .long GLOBAL(clz_tab) - .
+ #else
+ .long GLOBAL(clz_tab)
+ #endif
+ ENDFUNC(GLOBAL(mulsf3_))
+ #endif /* L_mulsf3 */
+
+ #ifdef L_hypotf
+ .global GLOBAL(hypotf)
+ FUNC(GLOBAL(hypotf))
+ GLOBAL(hypotf):
+ /* This integer implementation takes 71 to 72 cycles in the main path.
+ This is a bit slower than the SH4 can do the same computation in double
+ precision hardware floating point: 57 cycles, or 69 with mode switches. */
+ /* First, calculate x (r4) as the sum of the square of the fractions -
+ the exponent is calculated separately in r3.
+ Then, calculate sqrt(x) for the fraction by reciproot iteration.
+ We get a 7.5 bit initial value using linear approximation with two slopes
+ that are powers of two.
+ x (- [1. .. 2.) y0 := 1.25 - x/4 - tab(x) y (- (0.8 .. 1.0)
+ x (- [2. .. 4.) y0 := 1. - x/8 - tab(x) y (- (0.5 .. 0.8)
+ x is represented with two bits before the point,
+ y with 0 bits before the binary point.
+ Thus, to calculate y0 := 1. - x/8 - tab(x), all you have to do is to shift x
+ right by 1, negate it, and subtract tab(x). */
+
+ /* y1 := 1.5*y0 - 0.5 * (x * y0) * (y0 * y0)
+ z0 := x * y1
+ z1 := z0 + 0.5 * (y1 - (y1*y1) * z0) */
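+
+ /* A double precision C model of this refinement (a sketch; y0 stands for
+ the 7.5 bit initial approximation of 1/sqrt(x) described above):
+
+ double sqrt_model (double x, double y0)
+ {
+ double y1 = 1.5 * y0 - 0.5 * (x * y0) * (y0 * y0);
+ double z0 = x * y1;
+ return z0 + 0.5 * (y1 - (y1 * y1) * z0);
+ }
+
+ The y1 step is a Newton-Raphson iteration for 1/sqrt(x), roughly doubling
+ the number of good bits; the z1 correction doubles it again, which takes
+ the result past the 24 bit SFmode fraction. */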
+
+ mov.l LOCAL(xff000000),r1
+ add r4,r4
+ mov r4,r0
+ add r5,r5
+ cmp/hs r5,r4
+ sub r5,r0
+ mov #-24,r2
+ bf/s LOCAL(r5_large)
+ shad r2,r0
+ mov r4,r3
+ shll8 r4
+ rotcr r4
+ tst #0xe0,r0
+ neg r0,r0
+ bt LOCAL(ret_abs_r3)
+ tst r1,r5
+ shll8 r5
+ bt/s LOCAL(denorm_r5)
+ cmp/hi r3,r1
+ dmulu.l r4,r4
+ bf LOCAL(inf_nan)
+ rotcr r5
+ shld r0,r5
+ LOCAL(denorm_r5_done):
+ sts mach,r4
+ dmulu.l r5,r5
+ mov.l r6,@-r15
+ mov #20,r6
+
+ sts mach,r5
+ LOCAL(add_frac):
+ mova LOCAL(tab)-32,r0
+ mov.l r7,@-r15
+ mov.w LOCAL(x1380),r7
+ and r1,r3
+ addc r5,r4
+ mov.w LOCAL(m25),r2 ! -25
+ bf LOCAL(frac_ok)
+ sub r1,r3
+ rotcr r4
+ cmp/eq r1,r3 ! did we generate infinity ?
+ bt LOCAL(inf_nan)
+ shlr r4
+ mov r4,r1
+ shld r2,r1
+ mov.b @(r0,r1),r0
+ mov r4,r1
+ shld r6,r1
+ bra LOCAL(frac_low2)
+ sub r1,r7
+
+ LOCAL(frac_ok):
+ mov r4,r1
+ shld r2,r1
+ mov.b @(r0,r1),r1
+ cmp/pz r4
+ mov r4,r0
+ bt/s LOCAL(frac_low)
+ shld r6,r0
+ mov.w LOCAL(xf80),r7
+ shlr r0
+ LOCAL(frac_low):
+ sub r0,r7
+ LOCAL(frac_low2):
+ mov.l LOCAL(x40000080),r0 ! avoid denorm results near 1. << r3
+ sub r1,r7 ! {0.12}
+ mov.l LOCAL(xfffe0000),r5 ! avoid rounding overflow near 4. << r3
+ swap.w r7,r1 ! {0.28}
+ dmulu.l r1,r4 /* two issue cycles */
+ mulu.w r7,r7 /* two issue cycles */
+ sts mach,r2 ! {0.26}
+ mov r1,r7
+ shlr r1
+ sts macl,r6 ! {0.24}
+ cmp/hi r0,r4
+ shlr2 r2
+ bf LOCAL(near_one)
+ shlr r2 ! {0.23} systemic error of linear approximation keeps y1 < 1
+ dmulu.l r2,r6
+ cmp/hs r5,r4
+ add r7,r1 ! {1.28}
+ bt LOCAL(near_four)
+ shlr2 r1 ! {1.26}
+ sts mach,r0 ! {0.15} x*y0^3 == {0.16} 0.5*x*y0^3
+ shlr2 r1 ! {1.24}
+ shlr8 r1 ! {1.16}
+ sett ! compensate for truncation of subtrahend, keep y1 < 1
+ subc r0,r1 ! {0.16} y1; max error about 3.5 ulp
+ swap.w r1,r0
+ dmulu.l r0,r4 ! { 1.30 }
+ mulu.w r1,r1
+ sts mach,r2
+ shlr2 r0
+ sts macl,r1
+ add r2,r0
+ mov.l LOCAL(xff000000),r6
+ add r2,r0
+ dmulu.l r1,r2
+ add #127,r0
+ add r6,r3 ! precompensation for adding leading 1
+ sts mach,r1
+ shlr r3
+ mov.l @r15+,r7
+ sub r1,r0 ! {0.31} max error about 50 ulp (+127)
+ mov.l @r15+,r6
+ shlr8 r0 ! {0.23} max error about 0.7 ulp
+ rts
+ add r3,r0
+
+ LOCAL(r5_large):
+ mov r5,r3
+ mov #-31,r2
+ cmp/ge r2,r0
+ shll8 r5
+ bf LOCAL(ret_abs_r3)
+ rotcr r5
+ tst r1,r4
+ shll8 r4
+ bt/s LOCAL(denorm_r4)
+ cmp/hi r3,r1
+ dmulu.l r5,r5
+ bf LOCAL(inf_nan)
+ rotcr r4
+ LOCAL(denorm_r4_done):
+ shld r0,r4
+ sts mach,r5
+ dmulu.l r4,r4
+ mov.l r6,@-r15
+ mov #20,r6
+ bra LOCAL(add_frac)
+ sts mach,r4
+
+ LOCAL(near_one):
+ bra LOCAL(assemble_sqrt)
+ mov #0,r0
+ LOCAL(near_four):
+ ! exact round-to-nearest would add 255. We add 256 for speed & compactness.
+ mov r4,r0
+ shlr8 r0
+ add #1,r0
+ tst r0,r0
+ addc r0,r3 ! might generate infinity.
+ LOCAL(assemble_sqrt):
+ mov.l @r15+,r7
+ shlr r3
+ mov.l @r15+,r6
+ rts
+ add r3,r0
+ LOCAL(inf_nan):
+ LOCAL(ret_abs_r3):
+ mov r3,r0
+ rts
+ shlr r0
+ LOCAL(denorm_r5):
+ bf LOCAL(inf_nan)
+ tst r1,r4
+ bt LOCAL(denorm_both)
+ dmulu.l r4,r4
+ bra LOCAL(denorm_r5_done)
+ shld r0,r5
+ LOCAL(denorm_r4):
+ bf LOCAL(inf_nan)
+ tst r1,r5
+ dmulu.l r5,r5
+ bf LOCAL(denorm_r4_done)
+ LOCAL(denorm_both): ! normalize according to r3.
+ extu.w r3,r2
+ mov.l LOCAL(c__clz_tab),r0
+ cmp/eq r3,r2
+ mov #-8,r2
+ bt 0f
+ tst r1,r3
+ mov #-16,r2
+ bt 0f
+ mov #-24,r2
+ 0:
+ shld r2,r3
+ mov.l r7,@-r15
+ #ifdef __pic__
+ add r0,r3
+ mova LOCAL(c__clz_tab),r0
+ #endif
+ mov.b @(r0,r3),r0
+ add #32,r2
+ sub r0,r2
+ shld r2,r4
+ mov r2,r7
+ dmulu.l r4,r4
+ sts.l pr,@-r15
+ mov #1,r3
+ bsr LOCAL(denorm_r5_done)
+ shld r2,r5
+ mov.l LOCAL(x01000000),r1
+ neg r7,r2
+ lds.l @r15+,pr
+ tst r1,r0
+ mov.l @r15+,r7
+ bt 0f
+ add #1,r2
+ sub r1,r0
+ 0:
+ rts
+ shld r2,r0
+
+ LOCAL(m25):
+ .word -25
+ LOCAL(x1380):
+ .word 0x1380
+ LOCAL(xf80):
+ .word 0xf80
+ .balign 4
+ LOCAL(xff000000):
+ .long 0xff000000
+ LOCAL(x40000080):
+ .long 0x40000080
+ LOCAL(xfffe0000):
+ .long 0xfffe0000
+ LOCAL(x01000000):
+ .long 0x01000000
+ LOCAL(c__clz_tab):
+ #ifdef __pic__
+ .long GLOBAL(clz_tab) - .
+ #else
+ .long GLOBAL(clz_tab)
+ #endif
+
+ /*
+ #include <math.h>
+ #include <stdio.h>
+
+ double err(double x)
+ {
+ return (x < 2. ? 1.25 - x/4. : 1. - x/8.) - 1./sqrt(x);
+ }
+
+ int
+ main ()
+ {
+ int i = 0;
+ double x, s, v;
+ double lx, hx;
+
+ s = 1./32.;
+ for (x = 1.; x < 4; x += s, i++)
+ {
+ lx = x;
+ hx = x + s - 1. / (1 << 30);
+ v = 0.5 * (err (lx) + err (hx));
+ printf ("%s% 4d%c",
+ (i & 7) == 0 ? "\t.byte\t" : "",
+ (int)(v * 4096 + 0.5) - 128,
+ (i & 7) == 7 ? '\n' : ',');
+ }
+ return 0;
+ } */
+
+ .balign 4
+ LOCAL(tab):
+ .byte -113, -84, -57, -33, -11, 8, 26, 41
+ .byte 55, 67, 78, 87, 94, 101, 106, 110
+ .byte 113, 115, 115, 115, 114, 112, 109, 106
+ .byte 101, 96, 91, 84, 77, 69, 61, 52
+ .byte 51, 57, 63, 68, 72, 77, 80, 84
+ .byte 87, 89, 91, 93, 95, 96, 97, 97
+ .byte 97, 97, 97, 96, 95, 94, 93, 91
+ .byte 89, 87, 84, 82, 79, 76, 72, 69
+ .byte 65, 61, 57, 53, 49, 44, 39, 34
+ .byte 29, 24, 19, 13, 8, 2, -4, -10
+ .byte -17, -23, -29, -36, -43, -50, -57, -64
+ .byte -71, -78, -85, -93,-101,-108,-116,-124
+ ENDFUNC(GLOBAL(hypotf))
+ #endif /* L_hypotf */
+ #endif /* DYN_SHIFT */
Index: sh-modes.def
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh-modes.def,v
retrieving revision 1.1
diff -p -r1.1 sh-modes.def
*** sh-modes.def 13 Oct 2003 21:16:32 -0000 1.1
--- sh-modes.def 2 Aug 2004 03:57:36 -0000
***************
*** 1,5 ****
! /* Alpha extra machine modes.
! Copyright (C) 2003 Free Software Foundation, Inc.
This file is part of GCC.
--- 1,5 ----
! /* SH extra machine modes.
! Copyright (C) 2004 Free Software Foundation, Inc.
This file is part of GCC.
*************** Boston, MA 02111-1307, USA. */
*** 21,23 ****
--- 21,27 ----
/* The SH uses a partial integer mode to represent the FPSCR register. */
PARTIAL_INT_MODE (SI);
+ /* For software floating point comparisons. */
+ CC_MODE (CC_FP_NE);
+ CC_MODE (CC_FP_GT);
+ CC_MODE (CC_FP_UNLT);
Index: sh-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh-protos.h,v
retrieving revision 1.55
diff -p -r1.55 sh-protos.h
*** sh-protos.h 10 May 2004 23:25:13 -0000 1.55
--- sh-protos.h 2 Aug 2004 03:57:36 -0000
*************** extern void expand_sf_binop (rtx (*)(rtx
*** 93,98 ****
--- 93,102 ----
extern void expand_df_unop (rtx (*)(rtx, rtx, rtx), rtx *);
extern void expand_df_binop (rtx (*)(rtx, rtx, rtx, rtx), rtx *);
extern void expand_fp_branch (rtx (*)(void), rtx (*)(void));
+ extern void expand_sfunc_unop (enum machine_mode, rtx (*) (rtx, rtx),
+ const char *, enum rtx_code code, rtx *);
+ extern void expand_sfunc_binop (enum machine_mode, rtx (*) (rtx, rtx),
+ const char *, enum rtx_code code, rtx *);
extern int sh_insn_length_adjustment (rtx);
extern int sh_can_redirect_branch (rtx, rtx);
extern void sh_expand_unop_v2sf (enum rtx_code, rtx, rtx);
Index: sh.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.c,v
retrieving revision 1.270.2.3
diff -p -r1.270.2.3 sh.c
*** sh.c 29 Jun 2004 17:33:57 -0000 1.270.2.3
--- sh.c 2 Aug 2004 03:57:38 -0000
*************** prepare_scc_operands (enum rtx_code code
*** 1061,1066 ****
--- 1061,1128 ----
return t_reg;
}
+ static rtx
+ sh_soft_fp_cmp (int code, enum machine_mode op_mode)
+ {
+ const char *name;
+ rtx (*fun) (rtx, rtx), addr, tmp, first, last, equiv;
+ int df = op_mode == DFmode;
+ enum machine_mode mode;
+
+ if (flag_finite_math_only && ! df)
+ switch (code)
+ {
+ case EQ:
+ return gen_cmpeqsf_i1_finite (sh_compare_op0, sh_compare_op1);
+ case LE:
+ case UNLE:
+ return gen_cmplesf_i1_finite (sh_compare_op0, sh_compare_op1);
+ case GE:
+ case UNGE:
+ return gen_cmplesf_i1_finite (sh_compare_op1, sh_compare_op0);
+ default:
+ break;
+ }
+ if (flag_finite_math_only && df && code == EQ)
+ return gen_cmpeqdf_i1_finite (sh_compare_op0, sh_compare_op1);
+
+ switch (code)
+ {
+ case EQ:
+ name = df ? "__nedf2_" : "__nesf2_";
+ fun = df ? gen_cmpnedf_i1 : gen_cmpnesf_i1;
+ mode = CC_FP_NEmode;
+ break;
+ case UNLE:
+ name = df ? "__gtdf2t" : "__gtsf2t";
+ fun = df ? gen_cmpgtdf_i1 : gen_cmpgtsf_i1;
+ mode = CC_FP_GTmode;
+ break;
+ case GE:
+ name = df ? "__gedf2f" : "__gesf2f";
+ fun = df ? gen_cmpunltdf_i1 : gen_cmpunltsf_i1;
+ mode = CC_FP_UNLTmode;
+ break;
+ default: abort ();
+ }
+ tmp = gen_reg_rtx (mode);
+ addr = force_reg (Pmode, function_symbol (name));
+ first = emit_move_insn (gen_rtx_REG (op_mode, R4_REG), sh_compare_op0);
+ emit_move_insn (gen_rtx_REG (op_mode, R5_REG + df), sh_compare_op1);
+ last = emit_insn (fun (tmp, addr));
+ equiv = gen_rtx_fmt_ee (COMPARE, mode, sh_compare_op0, sh_compare_op1);
+ REG_NOTES (last) = gen_rtx_EXPR_LIST (REG_EQUAL, equiv, REG_NOTES (last));
+ /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
+ invariant code motion can move it. */
+ REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
+ REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+ /* Use fpcmp_i1 rather than cmpeqsi_t, so that the optimizers can grok
+ the computation. */
+ return gen_rtx_SET (VOIDmode,
+ gen_rtx_REG (SImode, T_REG),
+ gen_rtx_fmt_ee (code, SImode, tmp, CONST0_RTX (mode)));
+ }
+
/* Called from the md file, set up the operands of a compare instruction. */
void
*************** from_compare (rtx *operands, int code)
*** 1081,1091 ****
|| (TARGET_SH2E && GET_MODE_CLASS (mode) == MODE_FLOAT))
sh_compare_op1 = force_reg (mode, sh_compare_op1);
}
! if (TARGET_SH2E && GET_MODE_CLASS (mode) == MODE_FLOAT && code == GE)
{
from_compare (operands, GT);
insn = gen_ieee_ccmpeqsf_t (sh_compare_op0, sh_compare_op1);
}
else
insn = gen_rtx_SET (VOIDmode,
gen_rtx_REG (SImode, T_REG),
--- 1143,1158 ----
|| (TARGET_SH2E && GET_MODE_CLASS (mode) == MODE_FLOAT))
sh_compare_op1 = force_reg (mode, sh_compare_op1);
}
! if (GET_MODE_CLASS (mode) == MODE_FLOAT && TARGET_SH2E && code == GE
! && (mode == SFmode || TARGET_SH4))
{
from_compare (operands, GT);
insn = gen_ieee_ccmpeqsf_t (sh_compare_op0, sh_compare_op1);
}
+ else if (GET_MODE_CLASS (mode) == MODE_FLOAT
+ && ! TARGET_SH4 && TARGET_SH1
+ && (mode == DFmode || ! TARGET_SH2E))
+ insn = sh_soft_fp_cmp (code, mode);
else
insn = gen_rtx_SET (VOIDmode,
gen_rtx_REG (SImode, T_REG),
*************** equality_comparison_operator (rtx op, en
*** 7582,7588 ****
int
greater_comparison_operator (rtx op, enum machine_mode mode)
{
! if (mode != VOIDmode && GET_MODE (op) == mode)
return 0;
switch (GET_CODE (op))
{
--- 7649,7655 ----
int
greater_comparison_operator (rtx op, enum machine_mode mode)
{
! if (mode != VOIDmode && GET_MODE (op) != mode)
return 0;
switch (GET_CODE (op))
{
*************** greater_comparison_operator (rtx op, enu
*** 7599,7605 ****
int
less_comparison_operator (rtx op, enum machine_mode mode)
{
! if (mode != VOIDmode && GET_MODE (op) == mode)
return 0;
switch (GET_CODE (op))
{
--- 7666,7672 ----
int
less_comparison_operator (rtx op, enum machine_mode mode)
{
! if (mode != VOIDmode && GET_MODE (op) != mode)
return 0;
switch (GET_CODE (op))
{
*************** less_comparison_operator (rtx op, enum m
*** 7613,7618 ****
--- 7680,7716 ----
}
}
+ int
+ soft_fp_comparison_operator (rtx op, enum machine_mode mode)
+ {
+ if (mode != VOIDmode && GET_MODE (op) != mode)
+ return 0;
+ switch (GET_CODE (op))
+ {
+ default:
+ return 0;
+ case EQ: mode = CC_FP_NEmode; break;
+ case UNLE: mode = CC_FP_GTmode; break;
+ case GE: mode = CC_FP_UNLTmode; break;
+ }
+ return register_operand (XEXP (op, 0), mode);
+ }
+
+ int
+ soft_fp_comparison_operand (rtx op, enum machine_mode mode)
+ {
+ switch (GET_MODE (op))
+ {
+ default:
+ return 0;
+ case CC_FP_NEmode: case CC_FP_GTmode: case CC_FP_UNLTmode:
+ break;
+ }
+ if (mode == SFmode && TARGET_SH2E)
+ return 0;
+ return register_operand (op, mode);
+ }
+
/* Accept pseudos and branch target registers. */
int
target_reg_operand (rtx op, enum machine_mode mode)
*************** expand_df_binop (rtx (*fun) (rtx, rtx, r
*** 7946,7951 ****
--- 8044,8097 ----
emit_df_insn ((*fun) (operands[0], operands[1], operands[2],
get_fpscr_rtx ()));
}
+
+ /* Expand an sfunc operation taking NARGS MODE arguments, using generator
+ function FUN, which needs symbol NAME loaded into a register first.
+ Add a REG_EQUAL note using EQUIV. */
+ static void
+ expand_sfunc_op (int nargs, enum machine_mode mode, rtx (*fun) (rtx, rtx),
+ const char *name, rtx equiv, rtx *operands)
+ {
+ int next_reg = FIRST_PARM_REG, i;
+ rtx addr, first = NULL_RTX, last, insn;
+
+ addr = force_reg (Pmode, function_symbol (name));
+ for ( i = 1; i <= nargs; i++)
+ {
+ insn = emit_move_insn (gen_rtx_REG (mode, next_reg), operands[i]);
+ if (!first)
+ first = insn;
+ next_reg += GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+ }
+ last = emit_insn ((*fun) (operands[0], addr));
+ REG_NOTES (last) = gen_rtx_EXPR_LIST (REG_EQUAL, equiv, REG_NOTES (last));
+ /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
+ invariant code motion can move it. */
+ REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
+ REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+ }
+
+ /* Expand an sfunc unary operation taking one MODE argument, using generator
+ function FUN, which needs symbol NAME loaded into a register first.
+ Add a REG_EQUAL note using CODE. */
+ void
+ expand_sfunc_unop (enum machine_mode mode, rtx (*fun) (rtx, rtx),
+ const char *name, enum rtx_code code, rtx *operands)
+ {
+ rtx equiv = gen_rtx_fmt_e (code, GET_MODE (operands[0]), operands[1]);
+ expand_sfunc_op (1, mode, fun, name, equiv, operands);
+ }
+
+ /* Expand an sfunc binary operation in MODE, using generator function FUN,
+ which needs symbol NAME loaded into a register first.
+ Add a REG_EQUAL note using CODE. */
+ void
+ expand_sfunc_binop (enum machine_mode mode, rtx (*fun) (rtx, rtx),
+ const char *name, enum rtx_code code, rtx *operands)
+ {
+ rtx equiv = gen_rtx_fmt_ee (code, mode, operands[1], operands[2]);
+ expand_sfunc_op (2, mode, fun, name, equiv, operands);
+ }
/* ??? gcc does flow analysis strictly after common subexpression
elimination. As a result, common subexpression elimination fails
Index: sh.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.h,v
retrieving revision 1.240.2.4
diff -p -r1.240.2.4 sh.h
*** sh.h 29 Jun 2004 17:33:57 -0000 1.240.2.4
--- sh.h 2 Aug 2004 03:57:40 -0000
*************** extern int rtx_equal_function_value_matt
*** 3295,3300 ****
--- 3295,3302 ----
{"noncommutative_float_operator", {MINUS, DIV}}, \
{"shmedia_6bit_operand", {SUBREG, REG, CONST_INT}}, \
{"sh_register_operand", {REG, SUBREG, CONST_INT}}, \
+ {"soft_fp_comparison_operand", {SUBREG, REG}}, \
+ {"soft_fp_comparison_operator", {EQ, UNLE, GE}}, \
{"target_reg_operand", {SUBREG, REG}}, \
{"target_operand", {SUBREG, REG, LABEL_REF, SYMBOL_REF, CONST, UNSPEC}},\
{"trunc_hi_operand", {SUBREG, REG, TRUNCATE}}, \
*************** extern int rtx_equal_function_value_matt
*** 3308,3313 ****
--- 3310,3316 ----
#define SPECIAL_MODE_PREDICATES \
"any_register_operand", \
"int_gpr_dest", \
+ "soft_fp_comparison_operand" \
"trunc_hi_operand", \
/* This line intentionally left blank. */
Index: sh.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/sh.md,v
retrieving revision 1.172.2.1
diff -p -r1.172.2.1 sh.md
*** sh.md 8 Jun 2004 16:55:33 -0000 1.172.2.1
--- sh.md 2 Aug 2004 03:57:41 -0000
***************
*** 152,157 ****
--- 152,167 ----
(UNSPECV_CONST8 6)
(UNSPECV_WINDOW_END 10)
(UNSPECV_CONST_END 11)
+
+ ;; NaN handling for software floating point:
+ ;; We require one precision-specific bit to be set in all NaNs,
+ ;; so that we can test them with a not / tst sequence.
+ ;; ??? Ironically, this is the quiet bit for now, because that is the
+ ;; only bit set by __builtin_nan ("").
+ ;; ??? Should really use one bit lower and force it set by using
+ ;; a custom encoding function.
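+ ;; For example (a sketch): with r1 holding SF_NAN_MASK, the sequence
+ ;;   not r4,r0
+ ;;   tst r1,r0
+ ;; sets T exactly when r4 holds a NaN under this convention.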
+ (SF_NAN_MASK 0x7fc00000)
+ (DF_NAN_MASK 0x7ff80000)
])
;; -------------------------------------------------------------------------
***************
*** 660,665 ****
--- 670,683 ----
cmp/eq %1,%0"
[(set_attr "type" "mt_group")])
+ (define_insn "fpcmp_i1"
+ [(set (reg:SI T_REG)
+ (match_operator:SI 1 "soft_fp_comparison_operator"
+ [(match_operand 0 "soft_fp_comparison_operand" "r") (const_int 0)]))]
+ "TARGET_SH1 && !TARGET_SH4"
+ "tst %0,%0"
+ [(set_attr "type" "mt_group")])
+
(define_insn "cmpgtsi_t"
[(set (reg:SI T_REG)
(gt:SI (match_operand:SI 0 "arith_reg_operand" "r,r")
***************
*** 5272,5277 ****
--- 5290,5303 ----
DONE;
}
+ if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+ && TARGET_SH1 && !TARGET_SH4
+ && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+ {
+ from_compare (operands, UNLE);
+ emit_jump_insn (gen_branch_false (operands[0]));
+ DONE;
+ }
from_compare (operands, GT);
}")
***************
*** 5308,5317 ****
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
sh_compare_op1 = tmp;
! emit_insn (gen_bgt (operands[0]));
! DONE;
}
! from_compare (operands, GE);
}")
(define_expand "ble"
--- 5334,5348 ----
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
sh_compare_op1 = tmp;
! if (TARGET_SH4 || (TARGET_SH2E && GET_MODE (sh_compare_op0) == SFmode))
! {
! emit_insn (gen_bgt (operands[0]));
! DONE;
! }
! from_compare (operands, UNLE);
}
! else
! from_compare (operands, GE);
}")
(define_expand "ble"
***************
*** 5342,5350 ****
DONE;
}
! if (TARGET_SH2E
! && TARGET_IEEE
! && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
{
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
--- 5373,5381 ----
DONE;
}
! if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
! && (!TARGET_SH2E || TARGET_IEEE
! || (!TARGET_SH4 && GET_MODE (sh_compare_op0) == DFmode)))
{
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
***************
*** 5383,5391 ****
DONE;
}
! if (TARGET_SH2E
! && ! TARGET_IEEE
! && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
{
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
--- 5414,5422 ----
DONE;
}
! if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
! && TARGET_SH2E && !TARGET_IEEE
! && (TARGET_SH4 || GET_MODE (sh_compare_op0) == SFmode))
{
rtx tmp = sh_compare_op0;
sh_compare_op0 = sh_compare_op1;
***************
*** 5484,5502 ****
from_compare (operands, GTU);
}")
(define_expand "bunordered"
[(set (match_dup 1) (unordered:DI (match_dup 2) (match_dup 3)))
(set (pc)
(if_then_else (ne (match_dup 1) (const_int 0))
(label_ref:DI (match_operand 0 "" ""))
(pc)))]
! "TARGET_SHMEDIA"
"
{
- operands[1] = gen_reg_rtx (DImode);
operands[2] = force_reg (GET_MODE (sh_compare_op0), sh_compare_op0);
operands[3] = force_reg (GET_MODE (sh_compare_op1), sh_compare_op1);
}")
;; ------------------------------------------------------------------------
;; Jump and linkage insns
--- 5515,5596 ----
from_compare (operands, GTU);
}")
+ ;; ??? Can't use DFmode bcc patterns for SH2E when there is no SFmode
+ ;; equivalent: the insn predicate has to be evaluable at compiler startup,
+ ;; and FAIL in bcc patterns causes crashes.
(define_expand "bunordered"
[(set (match_dup 1) (unordered:DI (match_dup 2) (match_dup 3)))
(set (pc)
(if_then_else (ne (match_dup 1) (const_int 0))
(label_ref:DI (match_operand 0 "" ""))
(pc)))]
! "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA"
"
{
operands[2] = force_reg (GET_MODE (sh_compare_op0), sh_compare_op0);
operands[3] = force_reg (GET_MODE (sh_compare_op1), sh_compare_op1);
+ if (TARGET_SH1)
+ {
+ HOST_WIDE_INT mask;
+ switch (GET_MODE (operands[2]))
+ {
+ case SFmode:
+ mask = SF_NAN_MASK;
+ break;
+ case DFmode:
+ mask = DF_NAN_MASK;
+ break;
+ default:
+ FAIL;
+ }
+ emit_insn (gen_cmpunsf_i1 (operands[2], operands[3],
+ force_reg (SImode, GEN_INT (mask))));
+ emit_jump_insn (gen_branch_true (operands[0]));
+ DONE;
+ }
+ operands[1] = gen_reg_rtx (DImode);
}")
+
+ (define_expand "bunle"
+ [(set (pc)
+ (if_then_else (ne (reg:SI T_REG) (const_int 0))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))]
+ "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA_FPU"
+ "
+ {
+ if (TARGET_SHMEDIA_FPU)
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+
+ emit_insn (gen_sgt (tmp));
+ emit_jump_insn (gen_beq_media (operands[0], tmp, const0_rtx));
+ DONE;
+ }
+
+ from_compare (operands, UNLE);
+ }")
+
+ (define_expand "bunlt"
+ [(set (pc)
+ (if_then_else (eq (reg:SI T_REG) (const_int 0))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))]
+ "(TARGET_SH1 && !TARGET_SH2E) || TARGET_SHMEDIA_FPU"
+ "
+ {
+ if (TARGET_SHMEDIA_FPU)
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+
+ emit_insn (gen_sge (tmp));
+ emit_jump_insn (gen_beq_media (operands[0], tmp, const0_rtx));
+ DONE;
+ }
+
+ from_compare (operands, GE);
+ }")
+
;; ------------------------------------------------------------------------
;; Jump and linkage insns
*************** mov.l\\t1f,r0\\n\\
*** 7495,7500 ****
--- 7589,7601 ----
DONE;
if (! rtx_equal_function_value_matters)
FAIL;
+ if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
+ && !TARGET_SH4 && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+ {
+ from_compare (operands, EQ);
+ emit_insn (gen_movt (operands[0]));
+ DONE;
+ }
operands[1] = prepare_scc_operands (EQ);
}")
*************** mov.l\\t1f,r0\\n\\
*** 7543,7548 ****
--- 7644,7652 ----
}
if (! rtx_equal_function_value_matters)
FAIL;
+ if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT && !TARGET_SH4
+ && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+ FAIL;
operands[1] = prepare_scc_operands (LT);
}")
*************** mov.l\\t1f,r0\\n\\
*** 7647,7652 ****
--- 7751,7759 ----
}
if (! rtx_equal_function_value_matters)
FAIL;
+ if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT && !TARGET_SH4
+ && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
+ FAIL;
operands[1] = prepare_scc_operands (GT);
}")
*************** mov.l\\t1f,r0\\n\\
*** 7703,7709 ****
FAIL;
if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
{
! if (TARGET_IEEE)
{
rtx lab = gen_label_rtx ();
prepare_scc_operands (EQ);
--- 7810,7822 ----
FAIL;
if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
{
! if (!TARGET_SH4
! && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
! {
! from_compare (operands, GE);
! emit_insn (gen_movt (operands[0]));
! }
! else if (TARGET_IEEE)
{
rtx lab = gen_label_rtx ();
prepare_scc_operands (EQ);
*************** mov.l\\t1f,r0\\n\\
*** 7834,7839 ****
--- 7947,7967 ----
operands[1] = prepare_scc_operands (GEU);
}")
+ (define_expand "sunle"
+ [(set (match_operand:SI 0 "arith_reg_operand" "")
+ (match_dup 1))]
+ "TARGET_SH1 && !TARGET_SH4"
+ "
+ {
+ if (TARGET_SH2E && GET_MODE (sh_compare_op0) == SFmode)
+ FAIL;
+ if (! rtx_equal_function_value_matters)
+ FAIL;
+ from_compare (operands, UNLE);
+ emit_insn (gen_movt (operands[0]));
+ DONE;
+ }")
+
;; sne moves the complement of the T reg to DEST like this:
;; cmp/eq ...
;; mov #-1,temp
*************** mov.l\\t1f,r0\\n\\
*** 7882,7888 ****
DONE;
if (! rtx_equal_function_value_matters)
FAIL;
! operands[1] = prepare_scc_operands (EQ);
operands[2] = gen_reg_rtx (SImode);
}")
--- 8010,8024 ----
DONE;
if (! rtx_equal_function_value_matters)
FAIL;
! if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT
! && !TARGET_SH4
! && (!TARGET_SH2E || GET_MODE (sh_compare_op0) == DFmode))
! {
! from_compare (operands, EQ);
! operands[1] = gen_rtx_REG (SImode, T_REG);
! }
! else
! operands[1] = prepare_scc_operands (EQ);
operands[2] = gen_reg_rtx (SImode);
}")
*************** mov.l\\t1f,r0\\n\\
*** 8257,8263 ****
[(set (match_operand:SF 0 "arith_reg_operand" "")
(plus:SF (match_operand:SF 1 "arith_reg_operand" "")
(match_operand:SF 2 "arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH2E)
--- 8393,8399 ----
[(set (match_operand:SF 0 "arith_reg_operand" "")
(plus:SF (match_operand:SF 1 "arith_reg_operand" "")
(match_operand:SF 2 "arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH2E)
*************** mov.l\\t1f,r0\\n\\
*** 8265,8270 ****
--- 8401,8412 ----
expand_sf_binop (&gen_addsf3_i, operands);
DONE;
}
+ else if (TARGET_SH3)
+ {
+ expand_sfunc_binop (SFmode, &gen_addsf3_i3, \"__addsf3_\", PLUS,
+ operands);
+ DONE;
+ }
}")
(define_insn "*addsf3_media"
*************** mov.l\\t1f,r0\\n\\
*** 8341,8346 ****
--- 8483,8504 ----
}"
[(set_attr "type" "fparith_media")])
+ (define_insn "addsf3_i3"
+ [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+ (plus:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (clobber (reg:SI R6_REG))
+ (clobber (reg:SI R7_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH3 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
(define_insn "addsf3_i"
[(set (match_operand:SF 0 "arith_reg_operand" "=f")
(plus:SF (match_operand:SF 1 "arith_reg_operand" "%0")
*************** mov.l\\t1f,r0\\n\\
*** 8355,8361 ****
[(set (match_operand:SF 0 "fp_arith_reg_operand" "")
(minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
(match_operand:SF 2 "fp_arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH2E)
--- 8513,8519 ----
[(set (match_operand:SF 0 "fp_arith_reg_operand" "")
(minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
(match_operand:SF 2 "fp_arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH2E)
*************** mov.l\\t1f,r0\\n\\
*** 8363,8368 ****
--- 8521,8532 ----
expand_sf_binop (&gen_subsf3_i, operands);
DONE;
}
+ else if (TARGET_SH3)
+ {
+ expand_sfunc_binop (SFmode, &gen_subsf3_i3, \"__subsf3_\", MINUS,
+ operands);
+ DONE;
+ }
}")
(define_insn "*subsf3_media"
*************** mov.l\\t1f,r0\\n\\
*** 8373,8378 ****
--- 8537,8559 ----
"fsub.s %1, %2, %0"
[(set_attr "type" "fparith_media")])
+ (define_insn "subsf3_i3"
+ [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+ (minus:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (clobber (reg:SI R5_REG))
+ (clobber (reg:SI R6_REG))
+ (clobber (reg:SI R7_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH3 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
(define_insn "subsf3_i"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
(minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "0")
*************** mov.l\\t1f,r0\\n\\
*** 8392,8404 ****
[(set (match_operand:SF 0 "fp_arith_reg_operand" "")
(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
(match_operand:SF 2 "fp_arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
expand_sf_binop (&gen_mulsf3_i4, operands);
else if (TARGET_SH2E)
emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
if (! TARGET_SHMEDIA)
DONE;
}")
--- 8573,8591 ----
[(set (match_operand:SF 0 "fp_arith_reg_operand" "")
(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
(match_operand:SF 2 "fp_arith_reg_operand" "")))]
! "TARGET_SH2E || TARGET_SH3 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
expand_sf_binop (&gen_mulsf3_i4, operands);
else if (TARGET_SH2E)
emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
+ else if (TARGET_SH3)
+ {
+ expand_sfunc_binop (SFmode, &gen_mulsf3_i3, \"__mulsf3_\", MULT,
+ operands);
+ DONE;
+ }
if (! TARGET_SHMEDIA)
DONE;
}")
*************** mov.l\\t1f,r0\\n\\
*** 8429,8434 ****
--- 8616,8637 ----
"fmul %2,%0"
[(set_attr "type" "fp")])
+ (define_insn "mulsf3_i3"
+ [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+ (mult:SF (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI MACH_REG))
+ (clobber (reg:SI MACL_REG))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH3 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
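A note on the _i3 call patterns above: they spell out the special
calling convention of the assembly helpers instead of using a plain
call.  The operands arrive in r4 / r5, the result comes back in r0
(the "=z" constraint), the helper address is passed in an arbitrary
register and invoked with jsr, and only the registers the helper
actually destroys are listed as clobbered.  Keeping those clobber
lists short is what lets the register allocator keep values live
across the call.  Roughly, expand_sfunc_binop performs an expansion
like the following (a sketch only; the real helper lives in the sh.c
half of the patch, which is not included in this mail):

/* Hedged sketch of the SFmode expansion.  function_symbol () is the
   existing sh.c helper that wraps a function name in a SYMBOL_REF.  */
static void
sketch_expand_addsf3 (rtx *operands)
{
  rtx addr = force_reg (SImode, function_symbol ("__addsf3_"));

  emit_move_insn (gen_rtx_REG (SFmode, 4), operands[1]); /* r4 = op1 */
  emit_move_insn (gen_rtx_REG (SFmode, 5), operands[2]); /* r5 = op2 */
  emit_insn (gen_addsf3_i3 (operands[0], addr));         /* jsr @addr */
}
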
(define_insn "*mac_media"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%f")
*************** mov.l\\t1f,r0\\n\\
*** 8589,8594 ****
--- 8792,8886 ----
"ftrc %1,%0"
[(set_attr "type" "fp")])
+ (define_insn "cmpnesf_i1"
+ [(set (match_operand:CC_FP_NE 0 "register_operand" "=z")
+ (compare:CC_FP_NE (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpgtsf_i1"
+ [(set (match_operand:CC_FP_GT 0 "register_operand" "=z")
+ (compare:CC_FP_GT (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpunltsf_i1"
+ [(set (match_operand:CC_FP_UNLT 0 "register_operand" "=z")
+ (compare:CC_FP_UNLT (reg:SF R4_REG) (reg:SF R5_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpeqsf_i1_finite"
+ [(set (reg:SI T_REG)
+ (eq:SI (match_operand:SF 0 "arith_reg_operand" "r,r,r")
+ (match_operand:SF 1 "arith_reg_operand" "r,r,r")))
+ (clobber (match_scratch:SI 2 "=0,1,?r"))]
+ "TARGET_SH1 && ! TARGET_SH2E && flag_finite_math_only"
+ "*
+ {
+ if (which_alternative == 0)
+ output_asm_insn (\"cmp/eq\t%0,%1\;or\t%1,%2\;bt\t0f\", operands);
+ else if (which_alternative == 1)
+ output_asm_insn (\"cmp/eq\t%0,%1\;or\t%0,%2\;bt\t0f\", operands);
+ else
+ output_asm_insn (\"cmp/eq\t%0,%1\;mov\t%0,%2\;bt\t0f\;or\t%1,%2\",
+ operands);
+ return \"add\t%2,%2\;tst\t%2,%2\\n0:\";
+ }"
+ [(set_attr "length" "10,10,12")])
+
+ (define_insn "cmplesf_i1_finite"
+ [(set (reg:SI T_REG)
+ (le:SI (match_operand:SF 0 "arith_reg_operand" "r,r,r")
+ (match_operand:SF 1 "arith_reg_operand" "r,r,r")))
+ (clobber (match_scratch:SI 2 "=0,1,r"))]
+ "TARGET_SH1 && ! TARGET_SH2E && flag_finite_math_only"
+ "*
+ {
+ output_asm_insn (\"cmp/pz\t%0\", operands);
+ if (which_alternative == 2)
+ output_asm_insn (\"mov\t%0,%2\", operands);
+ if (TARGET_SH2)
+ output_asm_insn (\"bf/s\t0f\;cmp/hs\t%1,%0\;cmp/ge\t%0,%1\", operands);
+ else
+ output_asm_insn (\"bt\t1f\;bra\t0f\;cmp/hs\t%1,%0\\n1:\tcmp/ge\t%0,%1\",
+ operands);
+ if (which_alternative == 1)
+ output_asm_insn (\"or\t%0,%2\", operands);
+ else
+ output_asm_insn (\"or\t%1,%2\", operands);
+ return \"bt\t0f\;add\t%2,%2\;tst\t%2,%2\\n0:\";
+ }"
+ [(set_attr "length" "18,18,20")])
+
+ (define_insn "cmpunsf_i1"
+ [(set (reg:SI T_REG)
+ (unordered:SI (match_operand:SF 0 "arith_reg_operand" "r,r")
+ (match_operand:SF 1 "arith_reg_operand" "r,r")))
+ (use (match_operand:SI 2 "arith_reg_operand" "r,r"))
+ (clobber (match_scratch:SI 3 "=0,&r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "not\t%0,%3\;tst\t%2,%3\;not\t%1,%3\;bt\t0f\;tst\t%2,%3\;0:"
+ [(set_attr "length" "10")])
+
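A word on the comparison patterns above: three relations - NE, GT and
UNLT - cover what the branch and scc expanders need, since the T bit
can be inverted and operands can be swapped (EQ is !NE, UNLE is !GT,
GE is !UNLT, and LT / UNGT come from swapping).  Each relation gets
its own CC_FP_* mode so the rest of the compiler knows exactly which
relation a helper call computed.  The _finite variants avoid the call
altogether under -ffinite-math-only by using a bit trick: two floats
are equal iff their bit patterns match, or both are zeros of either
sign.  In C, on the SFmode bit patterns (for illustration):

/* Finite-math float equality on raw bit patterns: if the patterns
   differ, the operands can only be equal when both are +0 or -0,
   which is exactly when ((a | b) << 1) == 0.  */
int feq_finite (unsigned int a, unsigned int b)
{
  if (a == b)
    return 1;
  return ((a | b) << 1) == 0;
}
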
(define_insn "cmpgtsf_t"
[(set (reg:SI T_REG)
(gt:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
*************** mov.l\\t1f,r0\\n\\
*** 8684,8690 ****
[(set (reg:SI T_REG)
(compare (match_operand:SF 0 "arith_operand" "")
(match_operand:SF 1 "arith_operand" "")))]
! "TARGET_SH2E || TARGET_SHMEDIA_FPU"
"
{
sh_compare_op0 = operands[0];
--- 8976,8982 ----
[(set (reg:SI T_REG)
(compare (match_operand:SF 0 "arith_operand" "")
(match_operand:SF 1 "arith_operand" "")))]
! "TARGET_SH1 || TARGET_SHMEDIA_FPU"
"
{
sh_compare_op0 = operands[0];
*************** mov.l\\t1f,r0\\n\\
*** 8779,8784 ****
--- 9071,9109 ----
[(set_attr "type" "fmove")
(set_attr "fp_mode" "single")])
+ (define_expand "abssc2"
+ [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
+ (abs:SF (match_operand:SC 1 "fp_arith_reg_operand" "")))]
+ "TARGET_SH3 && ! TARGET_SH2E"
+ "
+ {
+ if (TARGET_SH3)
+ {
+ expand_sfunc_unop (SCmode, &gen_abssc2_i3, \"__hypotf\", ABS,
+ operands);
+ DONE;
+ }
+ FAIL;
+ }")
+
+ (define_insn "abssc2_i3"
+ [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+ (abs:SF (reg:SC R4_REG)))
+ (clobber (reg:SI MACH_REG))
+ (clobber (reg:SI MACL_REG))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (clobber (reg:SI R4_REG))
+ (clobber (reg:SI R5_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH3 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
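The abssc2 expander above maps the magnitude of a complex float onto
the hypotf helper: |a+bi| is hypotf (a, b), with the real and
imaginary parts arriving in r4 / r5 (an SCmode value in R4_REG spans
both registers).
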
(define_expand "adddf3"
[(set (match_operand:DF 0 "fp_arith_reg_operand" "")
(plus:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
*************** mov.l\\t1f,r0\\n\\
*** 9004,9009 ****
--- 9329,9389 ----
;; (use (match_dup 2))])
;; (set (match_dup 0) (reg:SI FPUL_REG))])
+ (define_insn "cmpnedf_i1"
+ [(set (match_operand:CC_FP_NE 0 "register_operand" "=z")
+ (compare:CC_FP_NE (reg:DF R4_REG) (reg:DF R6_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpgtdf_i1"
+ [(set (match_operand:CC_FP_GT 0 "register_operand" "=z")
+ (compare:CC_FP_GT (reg:DF R4_REG) (reg:DF R6_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH4"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpunltdf_i1"
+ [(set (match_operand:CC_FP_UNLT 0 "register_operand" "=z")
+ (compare:CC_FP_UNLT (reg:DF R4_REG) (reg:DF R6_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH4"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
+ (define_insn "cmpeqdf_i1_finite"
+ [(set (reg:SI T_REG)
+ (eq:SI (match_operand:DF 0 "arith_reg_operand" "r")
+ (match_operand:DF 1 "arith_reg_operand" "r")))
+ (clobber (match_scratch:SI 2 "=&r"))]
+ "TARGET_SH1 && ! TARGET_SH4 && flag_finite_math_only"
+ "cmp/eq\t%R0,%R1\;mov\t%S0,%2\;bf\t0f\;cmp/eq\t%S0,%S1\;bt\t0f\;or\t%S1,%2\;add\t%2,%2\;or\t%R0,%2\;tst\t%2,%2\\n0:"
+ [(set_attr "length" "18")])
+
+ (define_insn "cmpundf_i1"
+ [(set (reg:SI T_REG)
+ (unordered:SI (match_operand:DF 0 "arith_reg_operand" "r,r")
+ (match_operand:DF 1 "arith_reg_operand" "r,r")))
+ (use (match_operand:SI 2 "arith_reg_operand" "r,r"))
+ (clobber (match_scratch:SI 3 "=0,&r"))]
+ "TARGET_SH1 && ! TARGET_SH2E"
+ "not\t%S0,%3\;tst\t%2,%3\;not\t%S1,%3\;bt\t0f\;tst\t%2,%3\;0:"
+ [(set_attr "length" "10")])
+
(define_insn "cmpgtdf_t"
[(set (reg:SI T_REG)
(gt:SI (match_operand:DF 0 "arith_reg_operand" "f")
*************** mov.l\\t1f,r0\\n\\
*** 9071,9077 ****
[(set (reg:SI T_REG)
(compare (match_operand:DF 0 "arith_operand" "")
(match_operand:DF 1 "arith_operand" "")))]
! "TARGET_SH4 || TARGET_SHMEDIA_FPU"
"
{
sh_compare_op0 = operands[0];
--- 9451,9457 ----
[(set (reg:SI T_REG)
(compare (match_operand:DF 0 "arith_operand" "")
(match_operand:DF 1 "arith_operand" "")))]
! "TARGET_SH1 || TARGET_SHMEDIA_FPU"
"
{
sh_compare_op0 = operands[0];
*************** mov.l\\t1f,r0\\n\\
*** 9169,9175 ****
(define_expand "extendsfdf2"
[(set (match_operand:DF 0 "fp_arith_reg_operand" "")
(float_extend:DF (match_operand:SF 1 "fpul_operand" "")))]
! "TARGET_SH4 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
--- 9549,9555 ----
(define_expand "extendsfdf2"
[(set (match_operand:DF 0 "fp_arith_reg_operand" "")
(float_extend:DF (match_operand:SF 1 "fpul_operand" "")))]
! "TARGET_SH1 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
*************** mov.l\\t1f,r0\\n\\
*** 9178,9183 ****
--- 9558,9569 ----
get_fpscr_rtx ()));
DONE;
}
+ else if (TARGET_SH1)
+ {
+ expand_sfunc_unop (SFmode, &gen_extendsfdf2_i1, \"__extendsfdf2_\",
+ FLOAT_EXTEND, operands);
+ DONE;
+ }
}")
(define_insn "*extendsfdf2_media"
*************** mov.l\\t1f,r0\\n\\
*** 9196,9205 ****
[(set_attr "type" "fp")
(set_attr "fp_mode" "double")])
(define_expand "truncdfsf2"
[(set (match_operand:SF 0 "fpul_operand" "")
(float_truncate:SF (match_operand:DF 1 "fp_arith_reg_operand" "")))]
! "TARGET_SH4 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
--- 9582,9624 ----
[(set_attr "type" "fp")
(set_attr "fp_mode" "double")])
+ ;; ??? In order to use this efficiently, we'd have to have an extra
+ ;; register class for r0 and r1 - and that would cause repercussions in
+ ;; register allocation elsewhere. So just say we clobber r0 / r1, and
+ ;; that we can use an arbitrary target.
+ (define_insn_and_split "extendsfdf2_i1"
+ [(set (match_operand:DF 0 "arith_reg_operand" "=r")
+ (float_extend:DF (reg:SF R4_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R0_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && !TARGET_SH4"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (reg:DF R0_REG))]
+ "emit_insn (gen_extendsfdf2_i1_r0 (operands[1]));"
+ [(set_attr "type" "sfunc")])
+
+ (define_insn "extendsfdf2_i1_r0"
+ [(set (reg:DF R0_REG) (float_extend:DF (reg:SF R4_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (use (match_operand:SI 0 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && !TARGET_SH4"
+ "jsr @%0%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
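This insn_and_split is the workaround the ??? comment describes:
before reload the call pretends it can deliver the DFmode result in
an arbitrary register pair while clobbering r0 / r1; after reload it
splits into the real call (extendsfdf2_i1_r0, with the result
hardwired to r0 / r1) followed by a plain move into whatever
destination got allocated.
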
(define_expand "truncdfsf2"
[(set (match_operand:SF 0 "fpul_operand" "")
(float_truncate:SF (match_operand:DF 1 "fp_arith_reg_operand" "")))]
! "TARGET_SH1 || TARGET_SHMEDIA_FPU"
"
{
if (TARGET_SH4)
*************** mov.l\\t1f,r0\\n\\
*** 9208,9213 ****
--- 9627,9638 ----
get_fpscr_rtx ()));
DONE;
}
+ else if (TARGET_SH1)
+ {
+ expand_sfunc_unop (DFmode, &gen_truncdfsf2_i1, \"__truncdfsf2_\",
+ FLOAT_TRUNCATE, operands);
+ DONE;
+ }
}")
(define_insn "*truncdfsf2_media"
*************** mov.l\\t1f,r0\\n\\
*** 9225,9230 ****
--- 9650,9670 ----
"fcnvds %1,%0"
[(set_attr "type" "fp")
(set_attr "fp_mode" "double")])
+
+ (define_insn "truncdfsf2_i1"
+ [(set (match_operand:SF 0 "arith_reg_operand" "=z")
+ (float_truncate:SF (reg:DF R4_REG)))
+ (clobber (reg:SI T_REG))
+ (clobber (reg:SI PR_REG))
+ (clobber (reg:SI R1_REG))
+ (clobber (reg:SI R2_REG))
+ (clobber (reg:SI R3_REG))
+ (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+ "TARGET_SH1 && ! TARGET_SH4"
+ "jsr @%1%#"
+ [(set_attr "type" "sfunc")
+ (set_attr "needs_delay_slot" "yes")])
+
;; Bit field extract patterns. These give better code for packed bitfields,
;; because they allow auto-increment addresses to be generated.
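
The pattern of the sh.md changes above is uniform: expanders that used
to require hardware floating point (TARGET_SH2E / TARGET_SH4) are
relaxed to TARGET_SH1 / TARGET_SH3, and the new non-FPU paths are
routed through the sfunc call patterns rather than falling back to the
generic libcall mechanism.
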
Index: t-sh
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/sh/t-sh,v
retrieving revision 1.16.30.1
diff -p -r1.16.30.1 t-sh
*** t-sh 16 Jun 2004 19:58:35 -0000 1.16.30.1
--- t-sh 2 Aug 2004 03:57:41 -0000
***************
*** 1,6 ****
--- 1,8 ----
LIB1ASMSRC = sh/lib1funcs.asm
LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movstr \
_movstr_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+ _nesf2 _nedf2 _gtsf2t _gtdf2t _gesf2f _gedf2f _extendsfdf2 _truncdfsf2 \
+ _add_sub_sf3 _mulsf3 _hypotf \
$(LIB1ASMFUNCS_CACHE)
# We want fine grained libraries, so use the new code to build the
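
Each name added to LIB1ASMFUNCS becomes a separate member of libgcc.a,
assembled from lib1funcs.asm with the corresponding L_<name> macro
defined (e.g. -DL_mulsf3 selects the code under #ifdef L_mulsf3), so
programs link in only the helpers they actually use.  _add_sub_sf3
builds addition and subtraction into a single member, and _hypotf
provides the hypotf implementation.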