This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: RFC: Handling of libgcc symbols in SH shared libraries
- From: amylaar at spamcop dot net (Joern Rennecke)
- To: kumar107 at rediffmail dot com
- Cc: joern dot rennecke at superh dot com (Joern Rennecke), gcc-patches at gcc dot gnu dot org
- Date: Sat, 14 Aug 2004 18:36:08 +0100 (BST)
- Subject: Re: RFC: Handling of libgcc symbols in SH shared libraries
Your adddf3 version doesn't seem to implement any sticky bit when
shifting the smaller operand by 32 or more. It is also on the
slow side, in particular the worst case is very bad indeed.
The version below is optimized both for good typical and worst case
performance.
--- 8>< cut here --- 8>< cut here --- 8>< cut here --- 8>< cut here ---
/* Copyright (C) 2004 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.
In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file. (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
! adddf3 for the Renesas / SuperH SH CPUs.
! Contributed by Joern Rennecke
! joern.rennecke@superh.com
!
! This code is optimized for SH4 without FPU, but can also be used for SH3.
! Numbers with same sign are added in typically 37 cycles, worst case is
! 43 cycles, unless there is an overflow, in which case the addition can
! take up to 47 cycles.
! Normal numbers with different sign are added in 56 (57 for PIC) cycles
! or less on SH4.
! If one of the inputs is a denormal, the worst case is 59 (60 for PIC)
! cycles. (Two denormal inputs are faster than normal inputs, and
! denormal outputs don't slow down computation).
! Subtraction takes two cycles to negate the second input and then drops
! through to addition.
/* If the input exponents of a difference of two normalized numbers
differ by more than one, the output does not need to be adjusted
by more than one bit position. Hence, it makes sense to ensure that
the shifts by 0 & 1 are handled quickly to reduce average and worst
case times. */
FUNC(GLOBAL(adddf3_))
FUNC(GLOBAL(subdf3_))
.global GLOBAL(adddf3_)
.global GLOBAL(subdf3_)
! Out-of-line paths placed ahead of the two entry points.
! State set up by the adddf3_ entry code and relied upon below:
!   r0 = 0x7ff00000 (exponent mask)
!   r1 = 0x001fffff (fraction mask including the implicit-one bit 20)
!   r2 / r3 = exponent field of arg0 / arg1
!   r9 = -20 on entry to denorm_arg1
!   DBL0H / DBL1H have all exponent bits forced to one
!   the caller values of r8 and r9 are on the stack (r9 on top)

! arg1 has a zero exponent field and the arg0 exponent is not smaller.
! Entered with T set iff arg0 is Inf or NaN (r2 >= r0).
LOCAL(denorm_arg1):
bt LOCAL(inf_nan_arg0)
tst r0,r2 ! T = arg0 has a zero exponent field as well
bt/s LOCAL(denorm_both)
shlr r1 ! delay slot: r1 = 0x000fffff, the mask denorm_both expects
! Only arg1 is denormal (or zero).  Clear the forced exponent bits of arg1,
! bit 20 included, so that it gets no implicit one, but keep its sign for
! the sign comparison done later.  Then restore r1: denorm_arg1_done masks
! BOTH operands with r1, and the normal arg0 must keep its implicit one.
! (Leaving r1 at 0x000fffff dropped that bit and lost carries into bit 20.)
mov.l LOCAL(x800fffff),r3
and r3,DBL1H
mov.l LOCAL(x001fffff),r1
mov.l LOCAL(x00100000),r3
bra LOCAL(denorm_arg1_done)
sub r2,r3 ! delay slot: r3 = (1 << 20) - r2; a denormal scales like exponent 1
! Handle denorm addition here because otherwise the ordinary addition would
! have to check for denormal results.
! Denormal subtraction could also be done faster, but the denorm subtraction
! path here is still one cycle faster than the one for normalized input
! numbers, and 16 instructions shorter than the fastest version.
! Both exponent fields are zero; r1 = 0x000fffff and r2 = 0 here.
LOCAL(denorm_both):
div0s DBL0H,DBL1H ! T = signs differ
mov.l LOCAL(x800fffff),r9
bt/s LOCAL(denorm_sub)
and r1,DBL1H ! delay slot: keep only the fraction bits of arg1
and r9,DBL0H ! keep sign and fraction of arg0
mov.l @r15+,r9
mov DBL0L,DBLRL
mov DBL0H,DBLRH
addc DBL1L,DBLRL ! T is clear here, so this is a plain 64-bit add
mov.l @r15+,r8
rts
addc DBL1H,DBLRH ! delay slot: high words plus carry; sign comes from arg0
LOCAL(denorm_sub):
mov DBL0H,r8 ! tentative result sign
and r1,DBL0H
bra LOCAL(sub_same_exp)
addc r1,r2 ! exponent++, clear T
! arg0 is Inf or NaN: hand it back unchanged.
LOCAL(inf_nan_arg0):
mov DBL0L,DBLRL
bra LOCAL(pop_r8_r9)
mov DBL0H,DBLRH ! delay slot
! arg1 would have to be shifted right by 64 bits or more, so it cannot
! influence the result: return arg0.  r8 holds the sign of arg0 in bit 31,
! r2 its exponent field, DBL0H its masked high fraction word.
LOCAL(ret_arg0):
mov.l LOCAL(x80000000),DBLRH
mov DBL0L,DBLRL
mov DBL0H,DBL1H
mov r2,r3
! Common tail, also reached from ret_arg1_exp_r3.
! In: DBL1H = high fraction word, r3 = exponent field, DBLRL = low word,
!     DBLRH = 0x80000000, r8 = word carrying the result sign in bit 31.
LOCAL(ret_arg):
! The masked fraction of a normal number still has the implicit one in
! bit 20.  Strip it before merging the exponent; otherwise the OR forces
! the exponent LSB to one and numbers with an even exponent field come
! back doubled.  r9 is free here because it is reloaded just below.
mov.l LOCAL(x000fffff),r9
and r9,DBL1H
or r3,DBL1H
mov.l @r15+,r9
and r8,DBLRH ! DBLRH = sign bit only
mov.l @r15+,r8
rts
or DBL1H,DBLRH ! delay slot: sign | exponent | fraction
.balign 4
! subdf3_: negate arg1, then fall through into adddf3_.
GLOBAL(subdf3_):
cmp/pz DBL1H ! T = sign bit of arg1 is clear
add DBL1H,DBL1H ! shift the sign bit out; add leaves T untouched
rotcr DBL1H ! rotate T in as the new bit 31, i.e. the inverted sign
nop ! NOTE(review): presumably padding that keeps adddf3_ 4-byte aligned - confirm
! adddf3_: add the doubles held in DBL0H:DBL0L and DBL1H:DBL1L; every
! return path in this file leaves the result in DBLRH:DBLRL.
! r8 and r9 are pushed here (r8 first) and popped again on every exit.
GLOBAL(adddf3_):
mov.l LOCAL(x7ff00000),r0
mov DBL0H,r2
mov.l LOCAL(x001fffff),r1
mov DBL1H,r3
mov.l r8,@-r15
and r0,r2 ! r2 = exponent field of arg0
mov.l r9,@-r15
and r0,r3 ! r3 = exponent field of arg1
cmp/hi r2,r3 ! T = arg1 has the larger exponent
or r0,DBL0H ! force all exponent bits to one; bit 20 becomes the implicit
or r0,DBL1H ! one once the word is masked with r1 later on
bt LOCAL(arg1_gt)
tst r0,r3 ! T = arg1 exponent field is zero
mov #-20,r9
bt/s LOCAL(denorm_arg1)
cmp/hs r0,r2 ! delay slot: T = arg0 is Inf or NaN
bt LOCAL(inf_nan_arg0)
sub r2,r3 ! r3 = (arg1 exponent - arg0 exponent) << 20, zero or negative
! Align arg1 to arg0; arg0 has the larger or an equal exponent.
! In: r3 = exponent difference << 20 (zero or negative), r9 = -20,
!     r1 = fraction mask, r2 = exponent field of arg0.
LOCAL(denorm_arg1_done): ! r2 is tentative result exponent
shad r9,r3 ! r3 = exponent difference as a plain integer (<= 0)
mov.w LOCAL(m32),r9
mov DBL0H,r8 ! tentative result sign
and r1,DBL0H
mov DBL1H,r0 ! the 'other' sign
and r1,DBL1H
cmp/ge r9,r3 ! T = shift distance is 32 or less
mov DBL1H,r1 ! r1 = unshifted copy of the arg1 high fraction word
bf/s LOCAL(large_shift_arg1)
shld r3,DBL1H ! delay slot: negative count, so this shifts right
LOCAL(small_shift_arg1):
mov DBL1L,r9 ! r9 = unshifted copy of the arg1 low word
shld r3,DBL1L
tst r3,r3 ! T = exponents are equal
add #32,r3 ! r3 = 32 - shift distance, used as a left-shift count
bt/s LOCAL(same_exp)
div0s r8,r0 ! compare signs
shld r3,r1 ! bits that move from the high word into the low word
or r1,DBL1L
bf/s LOCAL(add)
shld r3,r9 ! delay slot: r9 = bits shifted out below the low word
! Signs differ: subtract.  Negating r9 produces the bits below the LSB of
! the difference and leaves the borrow for the subc chain in T.
clrt
negc r9,r9
mov.l LOCAL(x001f0000),r3 ! mask tested by subtract_done
LOCAL(sub_high):
mov DBL0L,DBLRL
subc DBL1L,DBLRL
mov DBL0H,DBLRH
bra LOCAL(subtract_done)
subc DBL1H,DBLRH ! delay slot: DBLRH:DBLRL = arg0 fraction - shifted arg1 fraction
! arg1 has to be shifted right by more than 32 bits.
! In: r3 = negative shift count (below -32), r1 = unshifted arg1 high
!     fraction word, DBL1H = that word after the shld in the branch delay
!     slot, r8 / r0 = words carrying the signs of arg0 / arg1.
! NOTE(review): this presumably relies on shld using only the low five bits
! of a negative count, so DBL1H is already shifted by (distance - 32) -
! confirm against the SH manual.
LOCAL(large_shift_arg1):
mov.w LOCAL(d0),r9 ! r9 = 0
add #64,r3
cmp/pl r3 ! T = shift distance is below 64
shld r3,r1 ! r1 = high-word bits that end up below the result LSB
bf LOCAL(ret_arg0) ! nothing of arg1 is left: the result is arg0
cmp/hi r9,DBL1L ! T = arg1 low word is nonzero; acts as a sticky bit
mov DBL1H,DBL1L ! the shifted high word becomes the low word
mov r9,DBL1H ! and the high word becomes zero
addc r1,r9 ! r9 = bits below the LSB, plus the sticky bit
div0s r8,r0 ! compare signs
bf LOCAL(add)
clrt
mov.l LOCAL(x001f0000),r3 ! mask tested by subtract_done
bra LOCAL(sub_high)
negc r9,r9 ! delay slot: negate the low bits, T = borrow for sub_high
! Add the aligned magnitudes: DBL0H:DBL0L += DBL1H:DBL1L.
! In: r2 = tentative result exponent field, r8 = word carrying the result
!     sign in bit 31, r9 = bits of the smaller operand that were shifted
!     out below the LSB (add_clr_r9 zeroes them), T = 0 (every branch to
!     here is taken on a clear T; it is the carry-in of the first addc).
LOCAL(add_clr_r9):
mov #0,r9
LOCAL(add):
mov.l LOCAL(x00200000),r3
addc DBL1L,DBL0L
addc DBL1H,DBL0H
mov.l LOCAL(x80000000),r1
tst r3,DBL0H ! T = the sum did not carry into bit 21
mov.l LOCAL(x7fffffff),r3
mov DBL0L,r0
bt/s LOCAL(no_carry)
and r1,r8 ! delay slot: reduce r8 to the sign bit
! The sum carried into bit 21, so it is shifted right by one bit below.
! Rounding is done before the shift by adding one at bit 0, except when
! the low bits are zero and bit 1 (the future LSB) is clear, which keeps
! ties at an even result.
tst r9,r9
bf LOCAL(add_one) ! low bits nonzero: T = 0
tst #2,r0 ! T = future LSB is clear
LOCAL(add_one):
subc r9,r9 ! r9 = -T
sett
mov r0,DBLRL
addc r9,DBLRL ! adds one if r9 == 0, nothing if r9 == -1
mov DBL0H,DBLRH
addc r9,DBLRH
shlr DBLRH ! T = bit that moves into the low word
mov.l LOCAL(x7ff00000),r3
add r2,DBLRH ! bit 20 of the fraction bumps the exponent by one
mov.l @r15+,r9
rotcr DBLRL
! Overflow check.  This must be an unsigned >= compare: when the exponent
! reaches 0x7ff with a zero high fraction, cmp/hi missed the overflow and a
! nonzero low word turned the result into a NaN pattern.
cmp/hs r3,DBLRH
LOCAL(add_done):
bt LOCAL(inf)
LOCAL(or_sign):
or r8,DBLRH
rts
mov.l @r15+,r8 ! delay slot
! Overflow: return infinity, i.e. 0x7ff00000 in the high word (r3) and a
! cleared low word; or_sign merges the sign.
LOCAL(inf):
mov #0,DBLRL
bra LOCAL(or_sign)
mov r3,DBLRH ! delay slot
! Equal-exponent subtraction produced no borrow: DBL0H:DBL0L holds the
! difference (possibly zero).  r3 = 0x001f0000, r9 = 0.
LOCAL(pos_difference_0):
tst r3,DBL0H ! T = bits 16..20 of the high word are all clear
mov DBL0L,DBLRL
mov.l LOCAL(x80000000),DBL0L
mov DBL0H,DBLRH
mov.l LOCAL(x00100000),DBL0H ! implicit-one bit, as norm_loop expects
bt/s LOCAL(long_norm)
and DBL0L,r8 ! delay slot: reduce r8 to the sign bit
bra LOCAL(norm_loop)
not DBL0L,r3 ! delay slot: r3 = 0x7fffffff
! Exponents are equal.  Entered with T = signs differ (from div0s).
LOCAL(same_exp):
bf LOCAL(add_clr_r9)
clrt
! Subtract fractions of equal exponent; T must be clear on entry.
! Also used by denorm_sub.
LOCAL(sub_same_exp):
subc DBL1L,DBL0L
mov.w LOCAL(d0),r9 ! r9 = 0: there are no bits below the LSB
subc DBL1H,DBL0H
mov.l LOCAL(x001f0000),r3
bf LOCAL(pos_difference_0) ! no borrow: arg0 fraction >= arg1 fraction
! The difference is negative: negate it and flip the tentative sign.
clrt
negc DBL0L,DBLRL
mov.l LOCAL(x80000000),DBL0L
negc DBL0H,DBLRH
mov.l LOCAL(x00100000),DBL0H
tst r3,DBLRH ! T = bits 16..20 of the high word are all clear
not r8,r8
bt/s LOCAL(long_norm)
and DBL0L,r8 ! delay slot: reduce r8 to the (inverted) sign bit
bra LOCAL(norm_loop)
not DBL0L,r3 ! delay slot: r3 = 0x7fffffff
! Mirror image of large_shift_arg1: arg0 is the smaller operand and has to
! be shifted right by more than 32 bits.
! In: r2 = negative shift count (below -32), r1 = unshifted arg0 high
!     fraction word, DBL0H = that word after the preceding shld,
!     r3 = exponent field of arg1, r8 / r0 = sign words of arg1 / arg0.
LOCAL(large_shift_arg0):
add #64,r2
mov #0,r9
cmp/pl r2 ! T = shift distance is below 64
shld r2,r1 ! r1 = high-word bits that end up below the result LSB
bf LOCAL(ret_arg1_exp_r3) ! nothing of arg0 is left: the result is arg1
cmp/hi r9,DBL0L ! T = arg0 low word is nonzero; acts as a sticky bit
mov DBL0H,DBL0L ! the shifted high word becomes the low word
mov r9,DBL0H ! and the high word becomes zero
addc r1,r9 ! r9 = bits below the LSB, plus the sticky bit
div0s r8,r0 ! compare signs
mov r3,r2 ! tentative result exponent
bf LOCAL(add)
clrt
negc r9,r9 ! negate the low bits, T = borrow for the subc chain
bra LOCAL(subtract_arg0_arg1_done)
mov DBL1L,DBLRL ! delay slot
! arg1 has the strictly larger exponent field, so arg0 is the operand that
! gets shifted.  r0 = 0x7ff00000, r1 = fraction mask, r2 / r3 = exponent
! fields of arg0 / arg1.
LOCAL(arg1_gt):
tst r0,r2 ! T = arg0 exponent field is zero
mov #-20,r9
bt/s LOCAL(denorm_arg0)
cmp/hs r0,r3 ! delay slot: T = arg1 is Inf or NaN
bt LOCAL(inf_nan_arg1)
sub r3,r2 ! r2 = (arg0 exponent - arg1 exponent) << 20, negative
LOCAL(denorm_arg0_done):
shad r9,r2 ! r2 = exponent difference as a plain integer
mov.w LOCAL(m32),r9
mov DBL1H,r8 ! tentative result sign
and r1,DBL1H
mov DBL0H,r0 ! the 'other' sign
and r1,DBL0H
cmp/ge r9,r2 ! T = shift distance is 32 or less
mov DBL0H,r1 ! r1 = unshifted copy of the arg0 high fraction word
shld r2,DBL0H ! negative count, so this shifts right
bf LOCAL(large_shift_arg0)
mov DBL0L,r9 ! r9 = unshifted copy of the arg0 low word
shld r2,DBL0L
add #32,r2 ! r2 = 32 - shift distance, used as a left-shift count
mov.l r3,@-r15 ! park the arg1 exponent on the stack for a moment
shld r2,r1 ! bits that move from the high word into the low word
mov r2,r3
div0s r8,r0 ! compare signs
mov.l @r15+,r2 ! tentative result exponent
shld r3,r9 ! r9 = bits shifted out below the low word
bf/s LOCAL(add)
or r1,DBL0L ! delay slot
! Signs differ: compute arg1 fraction - shifted arg0 fraction.
clrt
negc r9,r9 ! negate the low bits, T = borrow for the subc chain
mov DBL1L,DBLRL
LOCAL(subtract_arg0_arg1_done):
subc DBL0L,DBLRL
mov DBL1H,DBLRH
mov.l LOCAL(x001f0000),r3 ! mask tested by subtract_done
subc DBL0H,DBLRH
/* Since the exponents were different, the difference is positive. */
/* Fall through */
! Normalise a difference.
! In: DBLRH:DBLRL = difference of the fractions, r9 = bits below its LSB,
!     r2 = tentative exponent field, r3 = 0x001f0000, r8 = word carrying
!     the result sign in bit 31, caller r8 / r9 still on the stack.
LOCAL(subtract_done):
/* First check if a shift by a few bits is sufficient. This not only
speeds up this case, but also alleviates the need for considering
lower bits from r2 or rounding in the other code.
Moreover, by handling the upper 1+4 bits of the fraction here, long_norm
can assume that DBLRH fits into 16 bit. */
tst r3,DBLRH ! T = bits 16..20 of the high word are all clear
mov.l LOCAL(x80000000),r3
mov.l LOCAL(x00100000),DBL0H ! implicit-one bit, also one exponent step
bt/s LOCAL(long_norm)
and r3,r8 ! delay slot: reduce r8 to the sign bit
mov.l LOCAL(x7fffffff),r3 ! rounding constant for norm_round
LOCAL(norm_loop): ! Well, this used to be a loop...
tst DBL0H,DBLRH ! T = implicit-one position (bit 20) is still clear
sub DBL0H,r2 ! pre-subtract what adding the implicit one gives back
bf LOCAL(norm_round) ! already normalised
! Shift left by one, pulling in the top bit of r9.
shll r9
rotcl DBLRL
rotcl DBLRH
tst DBL0H,DBLRH
sub DBL0H,r2 ! exponent - 1
bf LOCAL(norm_round)
! Second shift.  From here on r9 is no longer shifted in, and the paths
! below (norm_pack, denorm0_n) do not round.
shll DBLRL
rotcl DBLRH
mov.l @r15+,r9
cmp/gt r2,DBL0H ! T = exponent field has dropped below 0x00100000
sub DBL0H,r2
LOCAL(norm_loop_1):
bt LOCAL(denorm0_n) ! exponent used up: the result is denormal
tst DBL0H,DBLRH
bf LOCAL(norm_pack)
shll DBLRL
rotcl DBLRH ! clears T
bra LOCAL(norm_loop_1)
subc DBL0H,r2 ! delay slot: exponent - 1, T = borrow
! Addition result that did not carry into bit 21: round and pack.
! In: DBL0H:DBL0L = sum, r0 = copy of DBL0L, r9 = bits below the LSB,
!     r3 = 0x7fffffff, r2 = exponent field, r8 = sign bit.
LOCAL(no_carry):
shlr r0 ! T = LSB of the sum
mov.l LOCAL(x000fffff),DBLRH
addc r3,r9 ! T = round up: low bits above half, or exactly half with an odd LSB
mov.w LOCAL(d0),DBL1H ! DBL1H = 0
mov DBL0L,DBLRL
and DBL0H,DBLRH ! mask out implicit 1
mov.l LOCAL(x7ff00000),r3
addc DBL1H,DBLRL ! add the rounding increment
addc r2,DBLRH ! exponent plus fraction plus carry from the low word
mov.l @r15+,r9
! NOTE(review): DBL1H was loaded with zero above, so this add changes
! nothing; a fraction overflow already reaches the exponent through the
! addc chain - confirm whether the instruction is still needed.
add DBL1H,DBLRH ! fraction overflow -> exp increase
bra LOCAL(add_done)
cmp/hi r3,DBLRH ! delay slot: T = result exceeds the infinity pattern
! arg0 has a zero exponent field while arg1 has the larger exponent.
! Entered with T set iff arg1 is Inf or NaN (r3 >= r0); r9 = -20,
! r1 = 0x001fffff, and DBL0H has all exponent bits forced to one.
LOCAL(denorm_arg0):
bt LOCAL(inf_nan_arg1)
! Clear the forced exponent bits of arg0, bit 20 included, so that the
! denormal gets no implicit one, but keep its sign for the later sign
! comparison.  r1 is deliberately left at 0x001fffff: denorm_arg0_done
! masks BOTH operands with r1, and halving r1 here also stripped the
! implicit one of the normal arg1.
mov.l LOCAL(x800fffff),r2
and r2,DBL0H
mov.l LOCAL(x00100000),r2
bra LOCAL(denorm_arg0_done)
sub r3,r2 ! delay slot: r2 = (1 << 20) - r3; a denormal scales like exponent 1
! arg1 is Inf or NaN: hand it back unchanged.
LOCAL(inf_nan_arg1):
mov DBL1L,DBLRL
bra LOCAL(pop_r8_r9)
mov DBL1H,DBLRH ! delay slot
! arg0 would have to be shifted right by 64 bits or more, so it cannot
! influence the result: return arg1.  r3 = exponent field of arg1,
! DBL1H = its masked high fraction word, r8 = word carrying its sign.
LOCAL(ret_arg1_exp_r3):
! Strip the implicit one (bit 20) from the fraction before ret_arg ORs the
! exponent in; otherwise the exponent LSB is forced to one.  r2 only holds
! the spent shift count here, so it is free.
mov.l LOCAL(x000fffff),r2
mov.l LOCAL(x80000000),DBLRH
and r2,DBL1H
bra LOCAL(ret_arg)
mov DBL1L,DBLRL ! delay slot
! 16-bit literals for mov.w.
LOCAL(m32):
.word -32
LOCAL(d0):
.word 0
! Long normalisation of a difference whose high word has bits 16..20 clear.
! In: DBLRH:DBLRL = difference, r9 = bits below its LSB, r2 = exponent
!     field, r8 = sign bit only, caller r8 / r9 still on the stack.
! Strategy: pre-shift by 32, 16 or 1 bits, then use the byte table clz_tab
! (assumed to hold the bit length of its index, with entry 0 == 0) to find
! the remaining shift r3 that brings the leading one to bit 20.  The
! constants subtracted from r2 hold the pre-shift in bits 20 and up, plus
! 0x00100000 to compensate for the implicit one added back when packing.
LOCAL(long_norm):
tst DBLRH,DBLRH
mov.w LOCAL(xff),DBL0L
mov #21,r3 ! shift that moves a leading one from bit -1 to bit 20
bf LOCAL(long_norm_highset)
! High word is zero: pre-shift by 32 bits.
mov.l LOCAL(x02100000),DBL1L
tst DBLRL,DBLRL
extu.w DBLRL,DBL0H ! DBL0H = low 16 bits of the low word
bt LOCAL(zero_or_ulp)
mov DBLRL,DBLRH
! If the word fits into 16 bits the 32-bit pre-shift is right and DBL0H
! already is the lookup value.  cmp/hi leaves T = 0 on that branch, as the
! subc below requires.  (The test used to be inverted.)
cmp/hi DBL0H,DBLRL ! T = the low word needs more than 16 bits
bf 0f
! More than 16 significant bits: pre-shift by 16 bits only.
mov.l LOCAL(x01100000),DBL1L
clrt
shlr16 DBLRH
xtrct DBLRL,r9 ! r9 = low half of DBLRL : high half of r9
mov DBLRH,DBL0H ! lookup value = new high word
LOCAL(long_norm_ulp_done):
0: mov r9,DBLRL
subc DBL1L,r2 ! exponent - pre-shift - 1; T = borrow
bt LOCAL(denorm1_b) ! exponent used up by the pre-shift alone
#ifdef __pic__
mov.l LOCAL(c__clz_tab),DBL1H
LOCAL(long_norm_lookup):
mov r0,r9 ! r0 may hold half of the result; park it in r9
mova LOCAL(c__clz_tab),r0
add DBL1H,r0 ! r0 = address of clz_tab
#else
mov r0,r9 ! r0 may hold half of the result; park it in r9
LOCAL(long_norm_lookup):
mov.l LOCAL(c__clz_tab),r0
#endif /* __pic__ */
! DBL0H is a value of at most 16 bits.  Index the table with its upper
! byte when that byte is nonzero, else with the value itself.  Neither
! shlr8 nor mov.b changes T, so both branches test the same compare.
! (Both branches used to be inverted, which indexed the table out of
! range for values above 0xff.)
cmp/hi DBL0L,DBL0H ! T = DBL0H > 0xff
bf 0f
shlr8 DBL0H
0: mov.b @(r0,DBL0H),DBL0H ! DBL0H = bit length of the index
bf 0f
add #-8,r3 ! the upper byte was used: 8 more significant bits
0: mov.w LOCAL(d20),DBL0L
! r3 = left shift still required.  Subtract the looked-up bit length;
! r0 holds the table address here and must not be used for this.
sub DBL0H,r3
clrt
mov r3,DBL1H
shld DBL0L,DBL1H ! shift count moved into exponent position
subc DBL1H,r2 ! T = borrow: not enough exponent left
mov r9,r0 ! restore r0
bf LOCAL(no_denorm)
! Denormal result: shorten the shift by the exponent deficit.
mov #-20,DBL0L
shad DBL0L,r2 ! r2 = negative deficit as a plain integer
bra LOCAL(denorm1_done)
add r2,r3 ! delay slot
! Result normalised by norm_loop with r9 still holding the bits below the
! LSB.  In: r2 = exponent field minus 0x00100000, r3 = 0x7fffffff.
LOCAL(norm_round):
cmp/pz r2
mov #0,DBL1H
bf LOCAL(denorm0_1) ! exponent went negative: denormal result
or r8,r2 ! sign | exponent
mov DBLRL,DBL1L
shlr DBL1L ! T = LSB of the result
addc r3,r9 ! T = round up: low bits above half, or exactly half with an odd LSB
mov.l @r15+,r9
addc DBL1H,DBLRL
mov.l @r15+,r8
rts
addc r2,DBLRH ! delay slot: the implicit one bumps the exponent back up
! Normalised after two or more shifts; r9 has been popped already.
LOCAL(norm_pack):
add r8,DBLRH
mov.l @r15+,r8
rts
add r2,DBLRH ! delay slot
! Denormal result reached from norm_round: undo one shift, attach the sign.
LOCAL(denorm0_1):
mov.l @r15+,r9
mov r8,DBL0L
mov.l @r15+,r8
LOCAL(denorm0_shift):
shlr DBLRH
rotcr DBLRL
rts
add DBL0L,DBLRH ! delay slot: add the sign bit
! Denormal result reached from norm_loop_1; r9 has been popped already.
LOCAL(denorm0_n):
mov r8,DBL0L
addc DBL0H,r2 ! T was set on entry; T afterwards = carry out
mov.l @r15+,r8
bf LOCAL(denorm0_shift)
rts
add DBL0L,DBLRH ! delay slot: add the sign bit
! Final shift for long_norm.  In: r3 = left shift, r2 = remaining
! exponent field, r8 = sign bit.
LOCAL(no_denorm):
add r2,r8 ! r8 = sign | exponent
LOCAL(denorm1_done):
shld r3,DBLRH
mov DBLRL,DBL0L
shld r3,DBLRL
add r8,DBLRH ! the implicit one in bit 20 bumps the exponent
mov.l @r15+,r9
add #-32,r3
mov.l @r15+,r8
shld r3,DBL0L ! low-word bits that cross into the high word
rts
or DBL0L,DBLRH ! delay slot
! High word is nonzero but bits 16..20 are clear: pre-shift by one bit,
! which also pulls in the top bit of r9.
LOCAL(long_norm_highset):
mov.l LOCAL(x00200000),DBL1L ! one bit of pre-shift plus the implicit one
! The lookup value below is the high word BEFORE the one-bit shift, so
! the remaining shift is one less than on the other paths.  (Without this
! the leading one ended up in bit 21.)  add does not touch T.
add #-1,r3
shll r9
rotcl DBLRL
mov DBLRH,DBL0H ! lookup value = high word before the shift
rotcl DBLRH ! clears T
#ifdef __pic__
mov.l LOCAL(c__clz_tab),DBL1H
#else
mov r0,r9
#endif /* __pic__ */
subc DBL1L,r2 ! T = borrow: exponent used up
bf LOCAL(long_norm_lookup)
! Denormal result: undo the one-bit shift and attach the sign.
LOCAL(denorm1_a):
shlr DBLRH
rotcr DBLRL
mov.l @r15+,r9
or r8,DBLRH
rts
mov.l @r15+,r8 ! delay slot
! The 32- or 16-bit pre-shift overshot the available exponent: shift the
! fraction back right by the deficit.  r2 = negative deficit << 20.
LOCAL(denorm1_b):
mov #-20,DBL0L
shad DBL0L,r2 ! r2 = negative deficit as a plain integer
shld r2,DBLRL
mov DBLRH,DBL0L ! keep the unshifted high word
shld r2,DBLRH
or r8,DBLRH
mov.l @r15+,r9
add #32,r2 ! r2 = 32 - deficit, used as a left-shift count
mov.l @r15+,r8
! Move the bits that dropped out of the high word into the low word.
! The count is the r2 just computed; r3 still holds the constant loaded
! in long_norm and is unrelated to the deficit.
shld r2,DBL0L
rts
or DBL0L,DBLRL ! delay slot
! Both words of the difference are zero.  If r9 (the bits below the LSB)
! is nonzero there is still a value to normalise; otherwise the result is
! the zero already sitting in DBLRH:DBLRL.
LOCAL(zero_or_ulp):
tst r9,r9
bf LOCAL(long_norm_ulp_done)
! Common exit: restore the caller r9 and r8 and return.
LOCAL(pop_r8_r9):
mov.l @r15+,r9
rts
mov.l @r15+,r8 ! delay slot
! 16-bit literals for mov.w.
LOCAL(d20):
.word 20
LOCAL(xff):
.word 0xff
.balign 4
! 32-bit literals for pc-relative mov.l.
! Exponent field mask, also the high word of infinity.
LOCAL(x7ff00000):
.long 0x7ff00000
! High-word fraction bits plus the implicit-one bit 20.
LOCAL(x001fffff):
.long 0x001fffff
! Sign bit.
LOCAL(x80000000):
.long 0x80000000
! High-word fraction bits without the implicit one.
LOCAL(x000fffff):
.long 0x000fffff
! Sign bit plus high-word fraction bits.
LOCAL(x800fffff):
.long 0x800fffff
! Bits 16..20: tested to decide between the short and the long normalisation.
LOCAL(x001f0000):
.long 0x001f0000
! Bit 21 (carry out of an addition); also used as an exponent adjustment.
LOCAL(x00200000):
.long 0x00200000
! Rounding constant for addc.
LOCAL(x7fffffff):
.long 0x7fffffff
! Implicit-one bit, equal to one exponent step.
LOCAL(x00100000):
.long 0x00100000
! Exponent adjustments subtracted in long_norm.
LOCAL(x02100000):
.long 0x02100000
LOCAL(x01100000):
.long 0x01100000
! Address of clz_tab, or its offset from this word when building PIC.
LOCAL(c__clz_tab):
#ifdef __pic__
.long GLOBAL(clz_tab) - .
#else
.long GLOBAL(clz_tab)
#endif
ENDFUNC(GLOBAL(adddf3_))
ENDFUNC(GLOBAL(subdf3_))