This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: RFC: Handling of libgcc symbols in SH shared libraries


Your adddf3 version doesn't seem to implement any sticky bit when
shifting the smaller operand by 32 or more.  It is also on the
slow side, in particular the worst case is very bad indeed.

The version below is optimized both for good typical and worst case
performance.

--- 8>< cut here --- 8>< cut here --- 8>< cut here --- 8>< cut here ---
/* Copyright (C) 2004 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

! adddf3 for the Renesas / SuperH SH CPUs.
! Contributed by Joern Rennecke
! joern.rennecke@superh.com
!
! This code is optimized for SH4 without FPU, but can also be used for SH3.
! Numbers with same sign are added in typically 37 cycles, worst case is
! 43 cycles, unless there is an overflow, in which case the addition can
! take up to 47 cycles.
! Normal numbers with different sign are added in 56 (57 for PIC) cycles
! or less on SH4.
! If one of the inputs is a denormal, the worst case is 59 (60 for PIC)
! cycles. (Two denormal inputs are faster than normal inputs, and
! denormal outputs don't slow down computation).
! Subtraction takes two cycles to negate the second input and then drops
! through to addition.

/* If the input exponents of a difference of two normalized numbers
   differ by more than one, the output does not need to be adjusted
   by more than one bit position.  Hence, it makes sense to ensure that
   the shifts by 0 & 1 are handled quickly to reduce average and worst
   case times.  */
/* FUNC, GLOBAL and LOCAL are macros defined outside this excerpt
   (presumably symbol decoration and symbol-type setup -- confirm against
   the header this file is built with).  */
FUNC(GLOBAL(adddf3_))
FUNC(GLOBAL(subdf3_))
	.global	GLOBAL(adddf3_)
	.global	GLOBAL(subdf3_)
/* denorm_arg1: reached from adddf3_ when the exponent field of the second
   operand is zero and not larger than that of the first operand.
   On entry r0 = 0x7ff00000, r1 = 0x001fffff, r2 = exponent field of the
   first operand, and T comes from the cmp/hs r0,r2 in the delay slot of
   the branch that got here (set when that exponent field is all ones).  */
LOCAL(denorm_arg1):
	bt	LOCAL(inf_nan_arg0)
	/* T = the first exponent field is zero as well.  */
	tst	r0,r2
	bt/s	LOCAL(denorm_both)
	/* Delay slot, executed on both paths: the mask becomes 0x000fffff.
	   NOTE(review): denorm_arg1_done applies this same mask to both high
	   words, so bit 20 is dropped from the first operand too -- confirm
	   that this is intended.  */
	shlr	r1
	mov.l	LOCAL(x00100000),r3
	bra	LOCAL(denorm_arg1_done)
	/* Delay slot: r3 = 0x00100000 - r2, the exponent difference computed
	   as if the second exponent field were 1.  */
	 sub	r2,r3

! Handle denorm addition here because otherwise the ordinary addition would
! have to check for denormal results.
! Denormal subtraction could also be done faster, but the denorm subtraction
! path here is still one cycle faster than the one for normalized input
! numbers, and 16 instructions shorter than the fastest version.
/* denorm_both: both exponent fields are zero.  r1 = 0x000fffff here (it
   was shifted in the delay slot of the branch in denorm_arg1), and both
   high words have had 0x7ff00000 ORed in by the adddf3_ entry code.  */
LOCAL(denorm_both):
	/* T = the sign bits of the two high words differ.  */
	div0s	DBL0H,DBL1H
	mov.l	LOCAL(x800fffff),r9
	bt/s	LOCAL(denorm_sub)
	/* Delay slot: strip sign and exponent bits from the second high word.  */
	and	r1,DBL1H
	/* Same signs: keep sign and fraction of the first high word and add the
	   two 64-bit patterns.  T is clear here because the branch above was
	   not taken.  r9 and r8 are restored from the stack on the way out.  */
	and	r9,DBL0H
	mov.l	@r15+,r9
	mov	DBL0L,DBLRL
	mov	DBL0H,DBLRH
	addc	DBL1L,DBLRL
	mov.l	@r15+,r8
	rts
	 addc	DBL1H,DBLRH

/* denorm_sub: different signs.  Reduce the first high word to its fraction
   and continue in the common same-exponent subtraction; r2 was zero, so
   the addc with T set makes it 0x00100000.  */
LOCAL(denorm_sub):
	mov	DBL0H,r8	! tentative result sign
	and	r1,DBL0H
	bra	LOCAL(sub_same_exp)
	 addc	r1,r2	! exponent++, clear T

/* inf_nan_arg0: the exponent field of the first operand is all ones.
   Return the first operand; its high word is unchanged by the earlier OR
   with 0x7ff00000 because those bits were already set.  */
LOCAL(inf_nan_arg0):
	mov	DBL0L,DBLRL
	bra	LOCAL(pop_r8_r9)
	 mov	DBL0H,DBLRH

/* ret_arg0: reached from large_shift_arg1 when the exponent difference is
   64 or more, so the second operand cannot influence the result and the
   first operand is returned.  r2 = its exponent field, r8 = its high word
   with the exponent bits forced to one (the sign bit is still valid).  */
LOCAL(ret_arg0):
	mov.l	LOCAL(x80000000),DBLRH
	mov	DBL0L,DBLRL
	mov	DBL0H,DBL1H
	mov	r2,r3
/* ret_arg: common tail, also used by ret_arg1_exp_r3.
   In:  DBL1H = fraction high word of the operand to return (bit 20, the
	implicit one, may still be set), r3 = its exponent field,
	r8 = word carrying its sign, DBLRH = 0x80000000, DBLRL = low word.
   Out: DBLRH:DBLRL = sign | exponent | fraction; r8 and r9 restored.  */
LOCAL(ret_arg):
	! Bit 20 must not reach the exponent field: ORing it in would turn
	! every even exponent into the next odd one.  r9 is free as scratch
	! here because it is reloaded from the stack right below.
	mov.l	LOCAL(x000fffff),r9
	and	r9,DBL1H
	or	r3,DBL1H
	mov.l	@r15+,r9
	and	r8,DBLRH	! keep only the sign bit
	mov.l	@r15+,r8
	rts
	 or	DBL1H,DBLRH

/* subdf3_: flip the sign bit of the second operand high word and fall
   through into adddf3_.  cmp/pz puts the inverted sign in T, the add
   shifts the old sign out, and rotcr rotates T back in as the new sign.
   The nop pads the sequence to four instructions ahead of adddf3_.  */
	.balign	4
GLOBAL(subdf3_):
	cmp/pz	DBL1H
	add	DBL1H,DBL1H
	rotcr	DBL1H
	nop

/* adddf3_: entry point.  DBL0H/DBL0L, DBL1H/DBL1L and DBLRH/DBLRL are
   register macros defined outside this excerpt; from their use here they
   are the high/low words of the first operand, the second operand and the
   result (confirm the actual register assignment in the header).
   r8 and r9 are pushed here and popped again on every exit path.  */
GLOBAL(adddf3_):
	mov.l	LOCAL(x7ff00000),r0
	mov	DBL0H,r2
	mov.l	LOCAL(x001fffff),r1
	mov	DBL1H,r3
	mov.l	r8,@-r15
	/* r2 / r3 = exponent fields of the first / second operand.  */
	and	r0,r2
	mov.l	r9,@-r15
	and	r0,r3
	/* T = second exponent field is larger (unsigned).  */
	cmp/hi	r2,r3
	/* Force all exponent bits to one; after the later AND with 0x001fffff
	   this leaves bit 20 set on top of the 20 fraction bits.  */
	or	r0,DBL0H
	or	r0,DBL1H
	bt	LOCAL(arg1_gt)
	/* T = second exponent field is zero.  */
	tst	r0,r3
	mov	#-20,r9
	bt/s	LOCAL(denorm_arg1)
	/* Executed in the delay slot as well: T = first exponent field is
	   all ones.  */
	cmp/hs	r0,r2
	bt	LOCAL(inf_nan_arg0)
	/* r3 = exponent difference (zero or negative), still in bits 20 and up.  */
	sub	r2,r3
LOCAL(denorm_arg1_done):	! r2 is tentative result exponent
	/* Arithmetic shift by -20: r3 becomes the shift count for the second
	   operand as a plain non-positive integer.  */
	shad	r9,r3
	mov.w	LOCAL(m32),r9
	mov	DBL0H,r8	! tentative result sign
	and	r1,DBL0H
	mov	DBL1H,r0	! the 'other' sign
	and	r1,DBL1H
	/* T = r3 >= -32; the branch below is taken for larger shifts.  */
	cmp/ge	r9,r3
	/* r1 keeps an unshifted copy of the second fraction high word.  */
	mov	DBL1H,r1
	bf/s	LOCAL(large_shift_arg1)
	/* Delay slot: logical right shift of the high word by the count in r3.  */
	 shld	r3,DBL1H
/* small_shift_arg1: align the second operand by a right shift of 0..32.
   r3 = shift count (non-positive), r1 = unshifted fraction high word,
   r8 / r0 = words carrying the two signs.  Bits shifted out of the low
   word are collected in r9.  */
LOCAL(small_shift_arg1):
	mov	DBL1L,r9
	shld	r3,DBL1L
	/* T = no shift needed; r3 then becomes the complementary left-shift
	   count.  */
	tst	r3,r3
	add	#32,r3
	bt/s	LOCAL(same_exp)
	 div0s	r8,r0	! compare signs
	/* Bits that leave the high word enter the low word from the top.  */
	shld	r3,r1

	or	r1,DBL1L
	bf/s	LOCAL(add)
	/* Delay slot: r9 = the bits shifted out of the low word.  */
	shld	r3,r9
	/* Different signs: negate the shifted-out bits so that the borrow in T
	   propagates into the 64-bit subtraction below.  */
	clrt
	negc	r9,r9
	mov.l	LOCAL(x001f0000),r3
/* sub_high: result = first operand - aligned second operand - T.  */
LOCAL(sub_high):
	mov	DBL0L,DBLRL
	subc	DBL1L,DBLRL
	mov	DBL0H,DBLRH
	bra	LOCAL(subtract_done)
	 subc	DBL1H,DBLRH

/* large_shift_arg1: the second operand has to be shifted right by more
   than 32 bits.  On entry r3 = shift count (below -32), DBL1H was already
   shifted by that count in the delay slot of the branch, and r1 holds the
   unshifted fraction high word.  */
LOCAL(large_shift_arg1):
	mov.w	LOCAL(d0),r9
	add	#64,r3
	/* T = r3 > 0, i.e. the shift is smaller than 64.  */
	cmp/pl	r3
	/* r1 = the bits that drop out of the high word.  */
	shld	r3,r1
	bf	LOCAL(ret_arg0)
	/* T = original low word is non-zero (sticky information).  */
	cmp/hi	r9,DBL1L
	/* The shifted high word becomes the low word; the high word is zero.  */
	mov	DBL1H,DBL1L
	mov	r9,DBL1H
	/* r9 = dropped high-word bits plus the sticky flag.  */
	addc	r1,r9

	div0s	r8,r0	! compare signs

	bf	LOCAL(add)
	clrt
	mov.l	LOCAL(x001f0000),r3
	bra	LOCAL(sub_high)
	/* Delay slot: negate the guard word; T carries the borrow.  */
	 negc	r9,r9

/* add_clr_r9 / add: sum of two aligned operands with equal signs.
   In:  DBL0H:DBL0L and DBL1H:DBL1L = aligned fractions, r9 = bits shifted
	out below the low word (add_clr_r9 zeroes it), r2 = tentative
	result exponent field, r8 = word carrying the result sign, T clear.
   Out: DBLRH:DBLRL = packed result; r8 and r9 restored from the stack.  */
LOCAL(add_clr_r9):
	mov	#0,r9
LOCAL(add):
	mov.l	LOCAL(x00200000),r3
	addc	DBL1L,DBL0L
	addc	DBL1H,DBL0H
	mov.l	LOCAL(x80000000),r1
	tst	r3,DBL0H		! T = bit 21 clear: sum needs no right shift
	mov.l	LOCAL(x7fffffff),r3
	mov	DBL0L,r0
	bt/s	LOCAL(no_carry)
	and	r1,r8			! delay slot: reduce r8 to the sign bit
	! The sum carried into bit 21 and will be shifted right by one.  Bit 0
	! is the round bit, r9 is sticky, bit 1 becomes the new lsb.  Add one
	! at bit 0 unless this is an exact tie with an even lsb.
	tst	r9,r9
	bf	LOCAL(add_one)
	tst	#2,r0			! sticky is zero: T = new lsb is even
LOCAL(add_one):
	subc	r9,r9			! r9 = 0 (round up) or -1 (add nothing)
	sett
	mov	r0,DBLRL
	addc	r9,DBLRL
	mov	DBL0H,DBLRH
	addc	r9,DBLRH
	shlr	DBLRH			! T = bit moving into the low word
	mov.l	LOCAL(x7ff00000),r3
	add	r2,DBLRH		! bit 20 of the fraction bumps the exponent
	mov.l	@r15+,r9
	rotcr	DBLRL
	! Overflow whenever the exponent field reaches all ones.  cmp/hs
	! rather than cmp/hi, so that a high word of exactly 0x7ff00000 with a
	! non-zero low word is caught as well.
	cmp/hs	r3,DBLRH
LOCAL(add_done):
	bt	LOCAL(inf)
LOCAL(or_sign):
	or	r8,DBLRH
	rts
	 mov.l	@r15+,r8

/* inf: overflow.  r3 = 0x7ff00000.  The low word has to be cleared too,
   otherwise the leftover fraction bits turn the result into a NaN.  */
LOCAL(inf):
	mov	#0,DBLRL
	bra	LOCAL(or_sign)
	 mov	r3,DBLRH

/* pos_difference_0: the same-exponent subtraction did not borrow, so the
   difference in DBL0H:DBL0L is non-negative.  r3 = 0x001f0000.  Sets up
   the registers expected by long_norm / norm_loop: result in
   DBLRH:DBLRL, DBL0H = 0x00100000, r8 reduced to the sign bit.  */
LOCAL(pos_difference_0):
	/* T = none of bits 16..20 of the high word is set.  */
	tst	r3,DBL0H
	mov	DBL0L,DBLRL
	mov.l	LOCAL(x80000000),DBL0L
	mov	DBL0H,DBLRH
	mov.l	LOCAL(x00100000),DBL0H
	bt/s	LOCAL(long_norm)
	and	DBL0L,r8
	bra	LOCAL(norm_loop)
	/* Delay slot: r3 = 0x7fffffff, the rounding addend used by norm_round.  */
	 not	DBL0L,r3

/* same_exp: equal exponents.  T = signs differ (from div0s in the delay
   slot of the branch that got here).  */
LOCAL(same_exp):
	bf	LOCAL(add_clr_r9)
	clrt
/* sub_same_exp: subtract the fractions with T clear on entry; also
   entered from denorm_sub.  */
LOCAL(sub_same_exp):
	subc	DBL1L,DBL0L
	mov.w	LOCAL(d0),r9
	subc	DBL1H,DBL0H
	mov.l	LOCAL(x001f0000),r3
	bf	LOCAL(pos_difference_0)
	/* Borrow: the difference is negative.  Negate it into DBLRH:DBLRL and
	   invert the tentative sign in r8.  */
	clrt
	negc	DBL0L,DBLRL
	mov.l	LOCAL(x80000000),DBL0L
	negc	DBL0H,DBLRH
	mov.l	LOCAL(x00100000),DBL0H
	tst	r3,DBLRH
	not	r8,r8
	bt/s	LOCAL(long_norm)
	and	DBL0L,r8
	bra	LOCAL(norm_loop)
	 not	DBL0L,r3

/* large_shift_arg0: counterpart of large_shift_arg1 for the case where
   the second operand has the larger exponent.  r2 = shift count for the
   first operand (below -32), DBL0H already shifted by it, r1 = unshifted
   fraction high word, r3 = exponent field of the second operand.  */
LOCAL(large_shift_arg0):
	add	#64,r2

	mov	#0,r9
	/* T = the shift is smaller than 64.  */
	cmp/pl	r2
	shld	r2,r1
	bf	LOCAL(ret_arg1_exp_r3)
	/* T = original low word is non-zero (sticky information).  */
	cmp/hi	r9,DBL0L
	mov	DBL0H,DBL0L
	mov	r9,DBL0H
	/* r9 = dropped high-word bits plus the sticky flag.  */
	addc	r1,r9
	div0s	r8,r0	! compare signs
	mov	r3,r2	! tentative result exponent
	bf	LOCAL(add)
	clrt
	negc	r9,r9
	bra	LOCAL(subtract_arg0_arg1_done)
	 mov	DBL1L,DBLRL

/* arg1_gt: the second operand has the larger exponent field.  Same scheme
   as the code after the adddf3_ entry with the operand roles swapped:
   r0 = 0x7ff00000, r1 = 0x001fffff, r2 / r3 = exponent fields of the
   first / second operand.  */
LOCAL(arg1_gt):
	/* T = first exponent field is zero.  */
	tst	r0,r2
	mov	#-20,r9
	bt/s	LOCAL(denorm_arg0)
	/* Executed in the delay slot as well: T = second exponent field is
	   all ones.  */
	cmp/hs	r0,r3
	bt	LOCAL(inf_nan_arg1)
	sub	r3,r2
LOCAL(denorm_arg0_done):
	/* r2 = shift count for the first operand (negative).  */
	shad	r9,r2
	mov.w	LOCAL(m32),r9
	mov	DBL1H,r8	! tentative result sign
	and	r1,DBL1H
	mov	DBL0H,r0	! the 'other' sign
	and	r1,DBL0H
	cmp/ge	r9,r2
	mov	DBL0H,r1
	shld	r2,DBL0H
	bf	LOCAL(large_shift_arg0)
	mov	DBL0L,r9
	shld	r2,DBL0L
	add	#32,r2
	/* r3 (result exponent) is parked on the stack while r3 serves as the
	   complementary shift count.  */
	mov.l	r3,@-r15
	shld	r2,r1
	mov	r2,r3
	div0s	r8,r0		! compare signs
	mov.l	@r15+,r2	! tentative result exponent
	shld	r3,r9
	bf/s	LOCAL(add)
	or	r1,DBL0L
	clrt
	negc	r9,r9
	mov	DBL1L,DBLRL
/* Result = second operand - aligned first operand - T.  */
LOCAL(subtract_arg0_arg1_done):
	subc	DBL0L,DBLRL
	mov	DBL1H,DBLRH
	mov.l	LOCAL(x001f0000),r3
	subc	DBL0H,DBLRH
/* Since the exponents were different, the difference is positive.  */
/* Fall through */
/* In:  DBLRH:DBLRL = positive difference, r9 = guard word below it,
	r2 = tentative exponent field, r3 = 0x001f0000, r8 = word carrying
	the result sign.  */
LOCAL(subtract_done):
/* First check if a shift by a few bits is sufficient.  This not only
   speeds up this case, but also alleviates the need for considering
   lower bits from r2 or rounding in the other code.
   Moreover, by handling the upper 1+4 bits of the fraction here, long_norm
   can assume that DBLRH fits into 16 bit.  */
	tst	r3,DBLRH
	mov.l	LOCAL(x80000000),r3
	mov.l	LOCAL(x00100000),DBL0H
	bt/s	LOCAL(long_norm)
	and	r3,r8
	mov.l	LOCAL(x7fffffff),r3
/* norm_loop: DBL0H = 0x00100000 (bit 20, the position of the leading one).
   Each step subtracts one from the exponent field in r2; the first
   subtraction accounts for bit 20 being added back onto the exponent when
   the result is packed.  */
LOCAL(norm_loop):	! Well, this used to be a loop...
	tst	DBL0H,DBLRH
	sub	DBL0H,r2
	bf	LOCAL(norm_round)
	/* Shift left by one, pulling the top guard bit in from r9.  */
	shll	r9
	rotcl	DBLRL

	rotcl	DBLRH

	tst	DBL0H,DBLRH
	sub	DBL0H,r2
	bf	LOCAL(norm_round)
	/* From the second shift on the guard word is not consulted any more,
	   so r9 is restored here.  */
	shll	DBLRL
	rotcl	DBLRH
	mov.l	@r15+,r9
	/* T = 0x00100000 > r2 (signed), tested at norm_loop_1.  */
	cmp/gt	r2,DBL0H
	sub	DBL0H,r2
LOCAL(norm_loop_1):
	bt	LOCAL(denorm0_n)
	tst	DBL0H,DBLRH
	bf	LOCAL(norm_pack)
	shll	DBLRL
	rotcl	DBLRH	! clears T
	bra	LOCAL(norm_loop_1)
	/* Delay slot: T = borrow out of the exponent decrement.  */
	 subc	DBL0H,r2

/* no_carry: the sum in DBL0H:DBL0L did not reach bit 21.
   In: r0 = copy of DBL0L, r3 = 0x7fffffff, r9 = guard word, r2 = exponent
   field, r8 = sign bit.  Rounds to nearest with ties to even, then packs.  */
LOCAL(no_carry):
	/* T = lsb of the low word.  */
	shlr	r0
	mov.l	LOCAL(x000fffff),DBLRH
	/* Carry out iff the guard word is above one half, or exactly one half
	   with an odd lsb.  */
	addc	r3,r9
	mov.w	LOCAL(d0),DBL1H
	mov	DBL0L,DBLRL
	and	DBL0H,DBLRH	! mask out implicit 1
	mov.l	LOCAL(x7ff00000),r3
	/* DBL1H is zero: these two addc propagate the rounding carry and add
	   the exponent field.  */
	addc	DBL1H,DBLRL
	addc	r2,DBLRH
	mov.l	@r15+,r9
	add	DBL1H,DBLRH	! fraction overflow -> exp increase
	bra	LOCAL(add_done)
	/* Delay slot: T = packed high word is above 0x7ff00000.  */
	 cmp/hi	r3,DBLRH

/* denorm_arg0: the first exponent field is zero and the second is larger.
   T comes from the cmp/hs r0,r3 in the delay slot of the branch that got
   here (set when the second exponent field is all ones).  */
LOCAL(denorm_arg0):
	bt	LOCAL(inf_nan_arg1)
	mov.l	LOCAL(x00100000),r2
	/* Mask becomes 0x000fffff.  */
	shlr	r1
	bra	LOCAL(denorm_arg0_done)
	/* Delay slot: exponent difference computed as if the first exponent
	   field were 1.  */
	 sub	r3,r2

/* inf_nan_arg1: the second exponent field is all ones; return the second
   operand.  */
LOCAL(inf_nan_arg1):
	mov	DBL1L,DBLRL
	bra	LOCAL(pop_r8_r9)
	 mov	DBL1H,DBLRH

/* ret_arg1_exp_r3: reached from large_shift_arg0 when the shift count is
   64 or more; return the second operand through ret_arg with r3 holding
   its exponent field.  */
LOCAL(ret_arg1_exp_r3):
	mov.l	LOCAL(x80000000),DBLRH
	bra	LOCAL(ret_arg)
	 mov	DBL1L,DBLRL

/* 16-bit literals loaded with mov.w from the code above.  */
LOCAL(m32):
	.word	-32
LOCAL(d0):
	.word	0

/* long_norm: normalize a difference with no bit set in bits 16..20 of the
   high word, using a byte table lookup instead of single-bit shifts.
   In:  DBLRH:DBLRL = difference, r9 = guard word, r2 = tentative exponent
	field, r8 = sign bit.
   The table behind c__clz_tab is defined outside this excerpt; the
   arithmetic assumes that entry i holds the bit length of i (0 for
   i == 0) and that the table has 256 entries -- confirm.
   r3 is preset to 21, so that 21 - bitlength is the left shift that moves
   the leading one to bit 20.  The constants subtracted from r2 are the
   coarse shift already applied plus one for the leading one, which is
   added back onto the exponent when the result is packed.  */
LOCAL(long_norm):
	tst	DBLRH,DBLRH
	mov.w	LOCAL(xff),DBL0L
	mov	#21,r3
	bf	LOCAL(long_norm_highset)
	mov.l	LOCAL(x02100000),DBL1L	! shift by 32, plus leading one
	tst	DBLRL,DBLRL
	extu.w	DBLRL,DBL0H		! DBL0H = low 16 bits of the low word
	bt	LOCAL(zero_or_ulp)
	mov	DBLRL,DBLRH		! tentatively shift left by 32
	! T = the low word has bits above bit 15.  Only if it fits into 16
	! bits may the 32-bit shift stand, with DBL0H == DBLRH as the lookup
	! index.  T is clear on that path, as the subc below requires.
	cmp/hi	DBL0H,DBLRL
	bf	0f
	mov.l	LOCAL(x01100000),DBL1L	! shift by 16, plus leading one
	clrt
	shlr16  DBLRH
	xtrct	DBLRL,r9		! r9 = low word bits 15..0 : guard bits 31..16
	mov     DBLRH,DBL0H
LOCAL(long_norm_ulp_done):
0:	mov	r9,DBLRL
	subc	DBL1L,r2
	bt	LOCAL(denorm1_b)
	! r0 is needed as the index base register, so it is parked in r9 and
	! restored after the lookup.
#ifdef __pic__
	mov.l	LOCAL(c__clz_tab),DBL1H
LOCAL(long_norm_lookup):
	mov	r0,r9
	mova	LOCAL(c__clz_tab),r0
	add	DBL1H,r0
#else
	mov	r0,r9
LOCAL(long_norm_lookup):
	mov.l	LOCAL(c__clz_tab),r0
#endif /* __pic__ */
	! DBL0L = 0xff.  An index above 0xff has to be reduced to its upper
	! byte before it may index the table; the shift count then shrinks by
	! the 8 bit positions skipped.
	cmp/hi	DBL0L,DBL0H
	bf	0f
	shlr8	DBL0H
0:	mov.b	@(r0,DBL0H),r0		! r0 = bit length from the table
	bf	0f
	add	#-8,r3
0:	mov.w	LOCAL(d20),DBL0L
	sub	r0,r3			! r3 = remaining left shift
	clrt
	mov	r3,DBL1H
	shld	DBL0L,DBL1H		! shift count moved to the exponent position
	subc	DBL1H,r2
	mov	r9,r0
	bf	LOCAL(no_denorm)
	! The exponent ran out: shorten the shift by the (negative) excess.
	mov	#-20,DBL0L
	shad	DBL0L,r2
	bra	LOCAL(denorm1_done)
	 add	r2,r3
	
/* norm_round: bit 20 of DBLRH is set after at most one shift.
   In: r2 = exponent field minus one, r3 = 0x7fffffff, r9 = guard word,
   r8 = sign bit.  Rounds to nearest with ties to even and packs.  */
LOCAL(norm_round):
	/* T = r2 >= 0; otherwise the result is denormal.  */
	cmp/pz	r2
	mov	#0,DBL1H
	bf	LOCAL(denorm0_1)
	or	r8,r2
	mov	DBLRL,DBL1L
	/* T = lsb of the low word.  */
	shlr	DBL1L
	addc	r3,r9
	mov.l	@r15+,r9
	addc	DBL1H,DBLRL
	mov.l	@r15+,r8
	rts
	/* Delay slot: add sign and exponent; bit 20 of the fraction supplies
	   the missing one in the exponent field.  */
	 addc	r2,DBLRH

/* norm_pack: bit 20 found inside norm_loop_1; r9 was already restored.  */
LOCAL(norm_pack):
	add	r8,DBLRH
	mov.l	@r15+,r8
	rts
	add	r2,DBLRH

/* denorm0_1: denormal result from norm_round; undo one left shift and
   attach the sign.  */
LOCAL(denorm0_1):
	mov.l	@r15+,r9
	mov	r8,DBL0L
	mov.l	@r15+,r8
LOCAL(denorm0_shift):
	shlr	DBLRH
	rotcr	DBLRL

	rts
	add	DBL0L,DBLRH

/* denorm0_n: denormal result from norm_loop_1, entered with T set and r9
   already restored.  The carry out of the addc selects between shifting
   back by one bit and returning as is.
   NOTE(review): the shift-back count for the different exponent values
   has not been verified here -- check against denormal test vectors.  */
LOCAL(denorm0_n):
	mov	r8,DBL0L
	addc	DBL0H,r2
	mov.l	@r15+,r8
	bf	LOCAL(denorm0_shift)
	rts
	add	DBL0L,DBLRH

/* no_denorm: the exponent in r2 survived the normalization; merge it into
   r8 next to the sign bit.  */
LOCAL(no_denorm):
	add	r2,r8

/* denorm1_done: shift DBLRH:DBLRL left by r3 and attach r8.
   DBL0L keeps a copy of the low word so that the bits moving up into the
   high word can be produced by a right shift by 32 - r3.  */
LOCAL(denorm1_done):
	shld	r3,DBLRH
	mov	DBLRL,DBL0L
	shld	r3,DBLRL

	add	r8,DBLRH
	mov.l	@r15+,r9
	add	#-32,r3
	mov.l	@r15+,r8
	shld	r3,DBL0L

	rts
	or	DBL0L,DBLRH

/* long_norm_highset: long normalization when the high word is non-zero
   (but has no bit set in bits 16..20).
   In:  DBLRH:DBLRL = difference, r9 = guard word, r2 = tentative exponent
	field, r3 = 21, DBL0L = 0xff, r8 = sign bit.
   The value is first shifted left by one to pull in the top guard bit;
   0x00200000 charges the exponent for that shift plus the leading one.  */
LOCAL(long_norm_highset):
	mov.l	LOCAL(x00200000),DBL1L
	shll	r9
	rotcl	DBLRL
	mov	DBLRH,DBL0H	! lookup index = high word BEFORE the shift
	rotcl	DBLRH	! clears T
#ifdef __pic__
	mov.l	LOCAL(c__clz_tab),DBL1H
#else
	mov	r0,r9
#endif /* __pic__ */
	subc	DBL1L,r2
	! The lookup index was taken before the one-bit shift above, so the
	! remaining shift is one less than the 21 - bitlength that
	! long_norm_lookup computes from r3.  add leaves T untouched.
	add	#-1,r3
	bf	LOCAL(long_norm_lookup)
/* denorm1_a: the exponent could not absorb the pre-shift; undo it and
   return the value with only the sign attached.  */
LOCAL(denorm1_a):
	shlr	DBLRH
	rotcr	DBLRL
	mov.l	@r15+,r9
	or	r8,DBLRH

	rts
	mov.l	@r15+,r8

/* denorm1_b: long_norm applied a coarse left shift (by 32 or 16) that the
   exponent could not absorb.
   In:  DBLRH:DBLRL = coarsely shifted value, r2 = exponent field minus the
	coarse adjustment (negative, still in bits 20 and up), r8 = sign.
   Out: DBLRH:DBLRL = value shifted back right by the excess, sign attached;
	r8 and r9 restored from the stack.  */
LOCAL(denorm1_b):
	mov	#-20,DBL0L

	shad	DBL0L,r2	! r2 = negative right-shift count

	shld	r2,DBLRL
	mov	DBLRH,DBL0L	! copy of the high word for the bits moving down
	shld	r2,DBLRH

	or	r8,DBLRH
	mov.l	@r15+,r9
	add	#32,r2		! complementary left-shift count
	mov.l	@r15+,r8
	! The bits leaving the high word enter the low word from the top, so
	! the copy is shifted left by 32 - |count|, i.e. by r2, not by r3.
	shld	r2,DBL0L

	rts
	or	DBL0L,DBLRL

/* zero_or_ulp: both result words are zero.  If the guard word in r9 is
   non-zero, continue normalizing from it; otherwise return the zero that
   is already in DBLRH:DBLRL.  */
LOCAL(zero_or_ulp):
	tst	r9,r9
	bf	LOCAL(long_norm_ulp_done)
/* pop_r8_r9: common exit that restores r9 and r8 and returns whatever is
   in DBLRH:DBLRL.  */
LOCAL(pop_r8_r9):
	mov.l	@r15+,r9
	rts
	mov.l	@r15+,r8

/* Literal pool.  The 16-bit entries are loaded with mov.w, the 32-bit
   entries (aligned to 4 bytes) with mov.l; each label spells out the
   value it holds.  */
LOCAL(d20):
	.word	20
LOCAL(xff):
	.word 0xff
	.balign	4
LOCAL(x7ff00000):
	.long	0x7ff00000
LOCAL(x001fffff):
	.long	0x001fffff
LOCAL(x80000000):
	.long	0x80000000
LOCAL(x000fffff):
	.long	0x000fffff
LOCAL(x800fffff):
	.long	0x800fffff
LOCAL(x001f0000):
	.long	0x001f0000
LOCAL(x00200000):
	.long	0x00200000
LOCAL(x7fffffff):
	.long	0x7fffffff
LOCAL(x00100000):
	.long	0x00100000
LOCAL(x02100000):
	.long	0x02100000
LOCAL(x01100000):
	.long	0x01100000
/* Address of the lookup table used by long_norm_lookup: stored relative
   to this word for PIC builds (the code adds the address obtained with
   mova), absolute otherwise.  */
LOCAL(c__clz_tab):
#ifdef __pic__
	.long	GLOBAL(clz_tab) - .
#else
	.long	GLOBAL(clz_tab)
#endif
ENDFUNC(GLOBAL(adddf3_))
ENDFUNC(GLOBAL(subdf3_))


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]