This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



[Patch] new (almost rewritten) version of the ARM assembly ieee754lib


After many months of part-time hacking on the ARM assembly for
floating-point operations, I'm about to lose interest, so it's time I
finally submit the improvements I've made so far and move on to another
distraction.

Nevertheless, the performance improvements should be significant.
See, for example, this non-exhaustive instruction-count comparison
between the old (current) and the new (with my patch) versions of the
lib, which should give a rough idea of the speed increase:

operation                       old             new
---------------------------------------------------
mulsf3 (common case)            40              28
mulsf3 (x or y power of 2)      40              20
mulsf3 (power of 2, ARMv3)      51              20
muldf3 (common case)            59              42
muldf3 (x or y power of 2)      48              25
muldf3 (power of 2, ARMv3)      94              25
addsf3 (say 8.0 + 9.0)          63              43
adddf3 (say 8.0 + 9.0)          82              56
cmpsf2 (say 8.0 and 9.0)        23              14
cmpdf2 (say 8.0 and 9.0)        27              20
fixsfsi (common case)           17              12
fixdfsi (common case)           23              14

... and so on.  All functions were reworked in some way.
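
For reference, these are the soft-float support routines that GCC calls
when code is built without hardware floating point.  As a rough, purely
illustrative sketch (the C function names below are made up, and whether
the __aeabi_* or the traditional libgcc symbol is used depends on the
target ABI), ordinary C such as this ends up in the routines measured
above:

/* Illustrative only: on a soft-float ARM target each of these
   operators is lowered to a libgcc call handled by this patch.  */
float  mul_f (float x, float y)   { return x * y; }    /* __aeabi_fmul / mulsf3  */
double mul_d (double x, double y) { return x * y; }    /* __aeabi_dmul / muldf3  */
double add_d (double x, double y) { return x + y; }    /* __aeabi_dadd / adddf3  */
int    fix_d (double x)           { return (int) x; }  /* __aeabi_d2iz / fixdfsi */
int    lt_f  (float x, float y)   { return x < y; }    /* cmpsf2 / __aeabi_fcmp* */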

The diffstat also shows an overall code reduction despite the
addition of special-case paths in some places:

 ieee754-df.S |  983 +++++++++++++++++++++++++++--------------------------------
 ieee754-sf.S |  884 ++++++++++++++++++++++++-----------------------------
 2 files changed, 865 insertions(+), 1002 deletions(-)

And finally, operations on NaN values have been reworked to be more
standards-compliant, and they now pass more conformance tests.
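
For what it's worth, here is an illustrative standalone check of the
sort of cases such tests exercise (this little program is only an
example of mine, not taken from the patch or from any test suite);
IEEE 754 requires each of these operations to produce a quiet NaN:

/* Illustrative spot checks: each expression below must evaluate
   to a quiet NaN on a conforming implementation.  */
#include <math.h>
#include <stdio.h>

int main (void)
{
  volatile double inf  = HUGE_VAL;	/* +INF */
  volatile double zero = 0.0;

  printf ("0 * INF   -> NaN? %d\n", isnan (zero * inf));
  printf ("INF - INF -> NaN? %d\n", isnan (inf - inf));
  printf ("INF / INF -> NaN? %d\n", isnan (inf / inf));
  printf ("0 / 0     -> NaN? %d\n", isnan (zero / zero));
  return 0;
}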

So here it is, asking permission to commit:

[date]  Nicolas Pitre <nico@cam.org>

	* config/arm/ieee754-sf.S: Large speed improvements. Fix NAN handling.
	* config/arm/ieee754-df.S: Ditto.

Index: gcc/config/arm/ieee754-df.S
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/arm/ieee754-df.S,v
retrieving revision 1.7
diff -u -r1.7 ieee754-df.S
--- gcc/config/arm/ieee754-df.S	1 Sep 2004 11:14:19 -0000	1.7
+++ gcc/config/arm/ieee754-df.S	12 Oct 2004 16:04:31 -0000
@@ -60,6 +60,7 @@
 
 ARM_FUNC_START negdf2
 ARM_FUNC_ALIAS aeabi_dneg negdf2
+
 	@ flip sign bit
 	eor	xh, xh, #0x80000000
 	RET
@@ -76,10 +77,10 @@
 	eor	xh, xh, #0x80000000	@ flip sign bit of first arg
 	b	1f	
 
-	ARM_FUNC_START subdf3
+ARM_FUNC_START subdf3
 ARM_FUNC_ALIAS aeabi_dsub subdf3
-	@ flip sign bit of second arg
-	eor	yh, yh, #0x80000000
+
+	eor	yh, yh, #0x80000000	@ flip sign bit of second arg
 #if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
 	b	1f			@ Skip Thumb-code prologue
 #endif
@@ -87,36 +88,23 @@
 ARM_FUNC_START adddf3
 ARM_FUNC_ALIAS aeabi_dadd adddf3
 
-1:	@ Compare both args, return zero if equal but the sign.
-	teq	xl, yl
-	eoreq	ip, xh, yh
-	teqeq	ip, #0x80000000
-	beq	LSYM(Lad_z)
-
-	@ If first arg is 0 or -0, return second arg.
-	@ If second arg is 0 or -0, return first arg.
-	orrs	ip, xl, xh, lsl #1
-	moveq	xl, yl
-	moveq	xh, yh
-	orrnes	ip, yl, yh, lsl #1
-	RETc(eq)
-
-	stmfd	sp!, {r4, r5, lr}
-
-	@ Mask out exponents.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r4, xh, ip
-	and	r5, yh, ip
+1:	stmfd	sp!, {r4, r5, lr}
 
-	@ If either of them is 0x7ff, result will be INF or NAN
-	teq	r4, ip
-	teqne	r5, ip
-	beq	LSYM(Lad_i)
+	@ Look for zeroes, equal values, INF, or NAN.
+	mov	r4, xh, lsl #1
+	mov	r5, yh, lsl #1
+	teq	r4, r5
+	teqeq	xl, yl
+	orrnes	ip, r4, xl
+	orrnes	ip, r5, yl
+	mvnnes	ip, r4, asr #21
+	mvnnes	ip, r5, asr #21
+	beq	LSYM(Lad_s)
 
 	@ Compute exponent difference.  Make largest exponent in r4,
 	@ corresponding arg in xh-xl, and positive exponent difference in r5.
-	subs	r5, r5, r4
+	mov	r4, r4, lsr #21
+	rsbs	r5, r4, r5, lsr #21
 	rsblt	r5, r5, #0
 	ble	1f
 	add	r4, r4, r5
@@ -127,24 +115,24 @@
 	eor	yl, xl, yl
 	eor	yh, xh, yh
 1:
-
 	@ If exponent difference is too large, return largest argument
 	@ already in xh-xl.  We need up to 54 bit to handle proper rounding
 	@ of 0x1p54 - 1.1.
-	cmp	r5, #(54 << 20)
+	cmp	r5, #54
 	RETLDM	"r4, r5" hi
 
 	@ Convert mantissa to signed integer.
 	tst	xh, #0x80000000
-	bic	xh, xh, ip, lsl #1
-	orr	xh, xh, #0x00100000
+	mov	xh, xh, lsl #12
+	mov	ip, #0x00100000
+	orr	xh, ip, xh, lsr #12
 	beq	1f
 	rsbs	xl, xl, #0
 	rsc	xh, xh, #0
 1:
 	tst	yh, #0x80000000
-	bic	yh, yh, ip, lsl #1
-	orr	yh, yh, #0x00100000
+	mov	yh, yh, lsl #12
+	orr	yh, ip, yh, lsr #12
 	beq	1f
 	rsbs	yl, yl, #0
 	rsc	yh, yh, #0
@@ -154,42 +142,30 @@
 	teq	r4, r5
 	beq	LSYM(Lad_d)
 LSYM(Lad_x):
-	@ Scale down second arg with exponent difference.
-	@ Apply shift one bit left to first arg and the rest to second arg
-	@ to simplify things later, but only if exponent does not become 0.
-	mov	ip, #0
-	movs	r5, r5, lsr #20
-	beq	3f
-	teq	r4, #(1 << 20)
-	beq	1f
-	movs	xl, xl, lsl #1
-	adc	xh, ip, xh, lsl #1
-	sub	r4, r4, #(1 << 20)
-	subs	r5, r5, #1
-	beq	3f
 
-	@ Shift yh-yl right per r5, keep leftover bits into ip.
-1:	rsbs	lr, r5, #32
-	blt	2f
+	@ Compensate for the exponent overlapping the mantissa MSB added later
+	sub	r4, r4, #1
+
+	@ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip.
+	rsbs	lr, r5, #32
+	blt	1f
 	mov	ip, yl, lsl lr
-	mov	yl, yl, lsr r5
-	orr	yl, yl, yh, lsl lr
-	mov	yh, yh, asr r5
-	b	3f
-2:	sub	r5, r5, #32
+	adds	xl, xl, yl, lsr r5
+	adc	xh, xh, #0
+	adds	xl, xl, yh, lsl lr
+	adcs	xh, xh, yh, asr r5
+	b	2f
+1:	sub	r5, r5, #32
 	add	lr, lr, #32
 	cmp	yl, #1
-	adc	ip, ip, yh, lsl lr
-	mov	yl, yh, asr r5
-	mov	yh, yh, asr #32
-3:
-	@ the actual addition
-	adds	xl, xl, yl
-	adc	xh, xh, yh
-
+	mov	ip, yh, lsl lr
+	orrcs	ip, ip, #2		@ 2 not 1, to allow lsr #1 later
+	adds	xl, xl, yh, asr r5
+	adcs	xh, xh, yh, asr #31
+2:
 	@ We now have a result in xh-xl-ip.
-	@ Keep absolute value in xh-xl-ip, sign in r5.
-	ands	r5, xh, #0x80000000
+	@ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above)
+	and	r5, xh, #0x80000000
 	bpl	LSYM(Lad_p)
 	rsbs	ip, ip, #0
 	rscs	xl, xl, #0
@@ -198,75 +174,66 @@
 	@ Determine how to normalize the result.
 LSYM(Lad_p):
 	cmp	xh, #0x00100000
-	bcc	LSYM(Lad_l)
+	bcc	LSYM(Lad_a)
 	cmp	xh, #0x00200000
-	bcc	LSYM(Lad_r0)
-	cmp	xh, #0x00400000
-	bcc	LSYM(Lad_r1)
+	bcc	LSYM(Lad_e)
 
 	@ Result needs to be shifted right.
 	movs	xh, xh, lsr #1
 	movs	xl, xl, rrx
-	movs	ip, ip, rrx
-	orrcs	ip, ip, #1
-	add	r4, r4, #(1 << 20)
-LSYM(Lad_r1):
-	movs	xh, xh, lsr #1
-	movs	xl, xl, rrx
-	movs	ip, ip, rrx
-	orrcs	ip, ip, #1
-	add	r4, r4, #(1 << 20)
+	mov	ip, ip, rrx
+	add	r4, r4, #1
+
+	@ Make sure we did not bust our exponent.
+	mov	r2, r4, lsl #21
+	cmn	r2, #(2 << 21)
+	bcs	LSYM(Lad_o)
 
 	@ Our result is now properly aligned into xh-xl, remaining bits in ip.
 	@ Round with MSB of ip. If halfway between two numbers, round towards
 	@ LSB of xl = 0.
-LSYM(Lad_r0):
-	adds	xl, xl, ip, lsr #31
-	adc	xh, xh, #0
-	teq	ip, #0x80000000
-	biceq	xl, xl, #1
-
-	@ One extreme rounding case may add a new MSB.  Adjust exponent.
-	@ That MSB will be cleared when exponent is merged below. 
-	tst	xh, #0x00200000
-	addne	r4, r4, #(1 << 20)
-
-	@ Make sure we did not bust our exponent.
-	adds	ip, r4, #(1 << 20)
-	bmi	LSYM(Lad_o)
-
 	@ Pack final result together.
 LSYM(Lad_e):
-	bic	xh, xh, #0x00300000
-	orr	xh, xh, r4
+	cmp	ip, #0x80000000
+	moveqs	ip, xl, lsr #1
+	adcs	xl, xl, #0
+	adc	xh, xh, r4, lsl #20
 	orr	xh, xh, r5
 	RETLDM	"r4, r5"
 
-LSYM(Lad_l):
 	@ Result must be shifted left and exponent adjusted.
-	@ No rounding necessary since ip will always be 0.
+LSYM(Lad_a):
+	movs	ip, ip, lsl #1
+	adcs	xl, xl, xl
+	adc	xh, xh, xh
+	tst	xh, #0x00100000
+	sub	r4, r4, #1
+	bne	LSYM(Lad_e)
+
+	@ No rounding necessary since ip will always be 0 at this point.
+LSYM(Lad_l):
+
 #if __ARM_ARCH__ < 5
 
 	teq	xh, #0
-	movne	r3, #-11
-	moveq	r3, #21
+	movne	r3, #20
+	moveq	r3, #52
 	moveq	xh, xl
 	moveq	xl, #0
 	mov	r2, xh
-	movs	ip, xh, lsr #16
-	moveq	r2, r2, lsl #16
-	addeq	r3, r3, #16
-	tst	r2, #0xff000000
-	moveq	r2, r2, lsl #8
-	addeq	r3, r3, #8
-	tst	r2, #0xf0000000
-	moveq	r2, r2, lsl #4
-	addeq	r3, r3, #4
-	tst	r2, #0xc0000000
-	moveq	r2, r2, lsl #2
-	addeq	r3, r3, #2
-	tst	r2, #0x80000000
-	addeq	r3, r3, #1
+	cmp	r2, #(1 << 16)
+	movhs	r2, r2, lsr #16
+	subhs	r3, r3, #16
+	cmp	r2, #(1 << 8)
+	movhs	r2, r2, lsr #8
+	subhs	r3, r3, #8
+	cmp	r2, #(1 << 4)
+	movhs	r2, r2, lsr #4
+	subhs	r3, r3, #4
+	cmp	r2, #(1 << 2)
+	subhs	r3, r3, #2
+	sublo	r3, r3, r2, lsr #1
+	sub	r3, r3, r2, lsr #3
 
 #else
 
@@ -302,13 +269,15 @@
 	movle	xl, xl, lsl r2
 
 	@ adjust exponent accordingly.
-3:	subs	r4, r4, r3, lsl #20
-	bgt	LSYM(Lad_e)
+3:	subs	r4, r4, r3
+	addge	xh, xh, r4, lsl #20
+	orrge	xh, xh, r5
+	RETLDM	"r4, r5" ge
 
 	@ Exponent too small, denormalize result.
 	@ Find out proper shift value.
-	mvn	r4, r4, asr #20
-	subs	r4, r4, #30
+	mvn	r4, r4
+	subs	r4, r4, #31
 	bge	2f
 	adds	r4, r4, #12
 	bgt	1f
@@ -337,23 +306,49 @@
 	RETLDM	"r4, r5"
 
 	@ Adjust exponents for denormalized arguments.
+	@ Note that r4 must not remain equal to 0.
 LSYM(Lad_d):
 	teq	r4, #0
-	eoreq	xh, xh, #0x00100000
-	addeq	r4, r4, #(1 << 20)
 	eor	yh, yh, #0x00100000
-	subne	r5, r5, #(1 << 20)
+	eoreq	xh, xh, #0x00100000
+	addeq	r4, r4, #1
+	subne	r5, r5, #1
 	b	LSYM(Lad_x)
 
-	@ Result is x - x = 0, unless x = INF or NAN.
-LSYM(Lad_z):
-	sub	ip, ip, #0x00100000	@ ip becomes 0x7ff00000
-	and	r2, xh, ip
-	teq	r2, ip
-	orreq	xh, ip, #0x00080000
+
+LSYM(Lad_s):
+	mvns	ip, r4, asr #21
+	mvnnes	ip, r5, asr #21
+	beq	LSYM(Lad_i)
+
+	teq	r4, r5
+	teqeq	xl, yl
+	beq	1f
+
+	@ Result is x + 0.0 = x or 0.0 + y = y.
+	teq	r4, #0
+	moveq	xh, yh
+	moveq	xl, yl
+	RETLDM	"r4, r5"
+
+1:	teq	xh, yh
+
+	@ Result is x - x = 0.
 	movne	xh, #0
-	mov	xl, #0
-	RET
+	movne	xl, #0
+	RETLDM	"r4, r5" ne
+
+	@ Result is x + x = 2x.
+	movs	ip, r4, lsr #21
+	bne	2f
+	movs	xl, xl, lsl #1
+	adcs	xh, xh, xh
+	orrcs	xh, xh, #0x80000000
+	RETLDM	"r4, r5"
+2:	adds	r4, r4, #(2 << 21)
+	addcc	xh, xh, #(1 << 20)
+	RETLDM	"r4, r5" cc
+	and	r5, xh, #0x80000000
 
 	@ Overflow: return INF.
 LSYM(Lad_o):
@@ -367,19 +362,18 @@
 	@   if yh-yl != INF/NAN: return xh-xl (which is INF/NAN)
 	@   if either is NAN: return NAN
 	@   if opposite sign: return NAN
-	@   return xh-xl (which is INF or -INF)
+	@   otherwise return xh-xl (which is INF or -INF)
 LSYM(Lad_i):
-	teq	r4, ip
+	mvns	ip, r4, asr #21
 	movne	xh, yh
 	movne	xl, yl
-	teqeq	r5, ip
-	RETLDM	"r4, r5" ne
-
+	mvneqs	ip, r5, asr #21
+	movne	yh, xh
+	movne	yl, xl
 	orrs	r4, xl, xh, lsl #12
-	orreqs	r4, yl, yh, lsl #12
+	orreqs	r5, yl, yh, lsl #12
 	teqeq	xh, yh
-	orrne	xh, r5, #0x00080000
-	movne	xl, #0
+	orrne	xh, xh, #0x00080000	@ quiet NAN
 	RETLDM	"r4, r5"
 
 	FUNC_END aeabi_dsub
@@ -389,14 +383,17 @@
 
 ARM_FUNC_START floatunsidf
 ARM_FUNC_ALIAS aeabi_ui2d floatunsidf
+
 	teq	r0, #0
 	moveq	r1, #0
 	RETc(eq)
 	stmfd	sp!, {r4, r5, lr}
-	mov	r4, #(0x400 << 20)	@ initial exponent
-	add	r4, r4, #((52-1) << 20)
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
 	mov	r5, #0			@ sign bit is 0
+	.ifnc	xl, r0
 	mov	xl, r0
+	.endif
 	mov	xh, #0
 	b	LSYM(Lad_l)
 
@@ -405,15 +402,18 @@
 
 ARM_FUNC_START floatsidf
 ARM_FUNC_ALIAS aeabi_i2d floatsidf
+
 	teq	r0, #0
 	moveq	r1, #0
 	RETc(eq)
 	stmfd	sp!, {r4, r5, lr}
-	mov	r4, #(0x400 << 20)	@ initial exponent
-	add	r4, r4, #((52-1) << 20)
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
 	ands	r5, r0, #0x80000000	@ sign bit in r5
 	rsbmi	r0, r0, #0		@ absolute value
+	.ifnc	xl, r0
 	mov	xl, r0
+	.endif
 	mov	xh, #0
 	b	LSYM(Lad_l)
 
@@ -422,26 +422,23 @@
 
 ARM_FUNC_START extendsfdf2
 ARM_FUNC_ALIAS aeabi_f2d extendsfdf2
-	
-	movs	r2, r0, lsl #1
-	beq	1f			@ value is 0.0 or -0.0
+
+	movs	r2, r0, lsl #1		@ toss sign bit
 	mov	xh, r2, asr #3		@ stretch exponent
 	mov	xh, xh, rrx		@ retrieve sign bit
 	mov	xl, r2, lsl #28		@ retrieve remaining bits
-	ands	r2, r2, #0xff000000	@ isolate exponent
-	beq	2f			@ exponent was 0 but not mantissa
-	teq	r2, #0xff000000		@ check if INF or NAN
+	andnes	r3, r2, #0xff000000	@ isolate exponent
+	teqne	r3, #0xff000000		@ if not 0, check if INF or NAN
 	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
-	RET
+	RETc(ne)			@ and return it.
 
-1:	mov	xh, r0
-	mov	xl, #0
-	RET
+	teq	r2, #0			@ if actually 0
+	teqne	r3, #0xff000000		@ or INF or NAN
+	RETc(eq)			@ we are done already.
 
-2:	@ value was denormalized.  We can normalize it now.
+	@ value was denormalized.  We can normalize it now.
 	stmfd	sp!, {r4, r5, lr}
-	mov	r4, #(0x380 << 20)	@ setup corresponding exponent
-	add	r4, r4, #(1 << 20)
+	mov	r4, #0x380		@ setup corresponding exponent
 	and	r5, xh, #0x80000000	@ move sign bit in r5
 	bic	xh, xh, #0x80000000
 	b	LSYM(Lad_l)
@@ -451,76 +448,90 @@
 
 ARM_FUNC_START floatundidf
 ARM_FUNC_ALIAS aeabi_ul2d floatundidf
-	
+
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	mvfeqd	f0, #0.0
 #endif
 	RETc(eq)
+
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	@ For hard FPA code we want to return via the tail below so that
 	@ we can return the result in f0 as well as in r0/r1 for backwards
 	@ compatibility.
-	adr	ip, 1f
+	adr	ip, LSYM(f0_ret)
 	stmfd	sp!, {r4, r5, ip, lr}
 #else
 	stmfd	sp!, {r4, r5, lr}
 #endif
+
 	mov	r5, #0
 	b	2f
 
 ARM_FUNC_START floatdidf
 ARM_FUNC_ALIAS aeabi_l2d floatdidf
+
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	mvfeqd	f0, #0.0
 #endif
 	RETc(eq)
+
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	@ For hard FPA code we want to return via the tail below so that
 	@ we can return the result in f0 as well as in r0/r1 for backwards
 	@ compatibility.
-	adr	ip, 1f
+	adr	ip, LSYM(f0_ret)
 	stmfd	sp!, {r4, r5, ip, lr}
 #else
 	stmfd	sp!, {r4, r5, lr}
 #endif
+
 	ands	r5, ah, #0x80000000	@ sign bit in r5
 	bpl	2f
 	rsbs	al, al, #0
 	rsc	ah, ah, #0
 2:
-	mov	r4, #(0x400 << 20)	@ initial exponent
-	add	r4, r4, #((52 - 1) << 20)
-#if !defined (__VFP_FP__) && !defined(__ARMEB__)
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
+
 	@ FPA little-endian: must swap the word order.
+	.ifnc	xh, ah
 	mov	ip, al
 	mov	xh, ah
 	mov	xl, ip
-#endif
-	movs	ip, xh, lsr #23
+	.endif
+
+	movs	ip, xh, lsr #22
 	beq	LSYM(Lad_p)
-	@ The value's too big.  Scale it down a bit...
+
+	@ The value is too big.  Scale it down a bit...
 	mov	r2, #3
 	movs	ip, ip, lsr #3
 	addne	r2, r2, #3
 	movs	ip, ip, lsr #3
 	addne	r2, r2, #3
+	add	r2, r2, ip
+
 	rsb	r3, r2, #32
 	mov	ip, xl, lsl r3
 	mov	xl, xl, lsr r2
 	orr	xl, xl, xh, lsl r3
 	mov	xh, xh, lsr r2
-	add	r4, r4, r2, lsl #20
+	add	r4, r4, r2
 	b	LSYM(Lad_p)
+
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
-1:
+
 	@ Legacy code expects the result to be returned in f0.  Copy it
 	@ there as well.
+LSYM(f0_ret):
 	stmfd	sp!, {r0, r1}
 	ldfd	f0, [sp], #8
 	RETLDM
+
 #endif
+
 	FUNC_END floatdidf
 	FUNC_END aeabi_l2d
 	FUNC_END floatundidf
@@ -534,46 +545,38 @@
 ARM_FUNC_ALIAS aeabi_dmul muldf3
 	stmfd	sp!, {r4, r5, r6, lr}
 
-	@ Mask out exponents.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r4, xh, ip
-	and	r5, yh, ip
-
-	@ Trap any INF/NAN.
-	teq	r4, ip
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	orr	ip, ip, #0x700
+	ands	r4, ip, xh, lsr #20
+	andnes	r5, ip, yh, lsr #20
+	teqne	r4, ip
 	teqne	r5, ip
-	beq	LSYM(Lml_s)
-
-	@ Trap any multiplication by 0.
-	orrs	r6, xl, xh, lsl #1
-	orrnes	r6, yl, yh, lsl #1
-	beq	LSYM(Lml_z)
+	bleq	LSYM(Lml_s)
 
-	@ Shift exponents right one bit to make room for overflow bit.
-	@ If either of them is 0, scale denormalized arguments off line.
-	@ Then add both exponents together.
-	movs	r4, r4, lsr #1
-	teqne	r5, #0
-	beq	LSYM(Lml_d)
-LSYM(Lml_x):
-	add	r4, r4, r5, asr #1
+	@ Add exponents together
+	add	r4, r4, r5
 
-	@ Preserve final sign in r4 along with exponent for now.
-	teq	xh, yh
-	orrmi	r4, r4, #0x8000
+	@ Determine final sign.
+	eor	r6, xh, yh
 
 	@ Convert mantissa to unsigned integer.
-	bic	xh, xh, ip, lsl #1
-	bic	yh, yh, ip, lsl #1
+	@ If power of two, branch to a separate path.
+	bic	xh, xh, ip, lsl #21
+	bic	yh, yh, ip, lsl #21
+	orrs	r5, xl, xh, lsl #12
+	orrnes	r5, yl, yh, lsl #12
 	orr	xh, xh, #0x00100000
 	orr	yh, yh, #0x00100000
+	beq	LSYM(Lml_1)
 
 #if __ARM_ARCH__ < 4
 
+	@ Put sign bit in r6, which will be restored in yl later.
+	and   r6, r6, #0x80000000
+
 	@ Well, no way to make it shorter without the umull instruction.
-	@ We must perform that 53 x 53 bit multiplication by hand.
-	stmfd	sp!, {r7, r8, r9, sl, fp}
+	stmfd	sp!, {r6, r7, r8, r9, sl, fp}
 	mov	r7, xl, lsr #16
 	mov	r8, yl, lsr #16
 	mov	r9, xh, lsr #16
@@ -625,92 +628,83 @@
 	mul	fp, xh, yh
 	adcs	r5, r5, fp
 	adc	r6, r6, #0
-	ldmfd	sp!, {r7, r8, r9, sl, fp}
+	ldmfd	sp!, {yl, r7, r8, r9, sl, fp}
 
 #else
 
-	@ Here is the actual multiplication: 53 bits * 53 bits -> 106 bits.
+	@ Here is the actual multiplication.
 	umull	ip, lr, xl, yl
 	mov	r5, #0
-	umlal	lr, r5, xl, yh
 	umlal	lr, r5, xh, yl
+	and	yl, r6, #0x80000000
+	umlal	lr, r5, xl, yh
 	mov	r6, #0
 	umlal	r5, r6, xh, yh
 
 #endif
 
 	@ The LSBs in ip are only significant for the final rounding.
-	@ Fold them into one bit of lr.
+	@ Fold them into lr.
 	teq	ip, #0
 	orrne	lr, lr, #1
 
-	@ Put final sign in xh.
-	mov	xh, r4, lsl #16
-	bic	r4, r4, #0x8000
-
-	@ Adjust result if one extra MSB appeared (one of four times).
-	tst	r6, #(1 << 9)
-	beq	1f
-	add	r4, r4, #(1 << 19)
-	movs	r6, r6, lsr #1
-	movs	r5, r5, rrx
-	movs	lr, lr, rrx
-	orrcs	lr, lr, #1
-1:
-	@ Scale back to 53 bits.
-	@ xh contains sign bit already.
-	orr	xh, xh, r6, lsl #12
-	orr	xh, xh, r5, lsr #20
-	mov	xl, r5, lsl #12
-	orr	xl, xl, lr, lsr #20
-
-	@ Apply exponent bias, check range for underflow.
-	sub	r4, r4, #0x00f80000
-	subs	r4, r4, #0x1f000000
-	ble	LSYM(Lml_u)
-
-	@ Round the result.
-	movs	lr, lr, lsl #12
-	bpl	1f
-	adds	xl, xl, #1
-	adc	xh, xh, #0
-	teq	lr, #0x80000000
-	biceq	xl, xl, #1
-
-	@ Rounding may have produced an extra MSB here.
-	@ The extra bit is cleared before merging the exponent below.
-	tst	xh, #0x00200000
-	addne	r4, r4, #(1 << 19)
+	@ Adjust result upon the MSB position.
+	sub	r4, r4, #0xff
+	cmp	r6, #(1 << (20-11))
+	sbc	r4, r4, #0x300
+	bcs	1f
+	movs	lr, lr, lsl #1
+	adcs	r5, r5, r5
+	adc	r6, r6, r6
 1:
-	@ Check exponent for overflow.
-	adds	ip, r4, #(1 << 19)
-	tst	ip, #(1 << 30)
-	bne	LSYM(Lml_o)
-
-	@ Add final exponent.
-	bic	xh, xh, #0x00300000
-	orr	xh, xh, r4, lsl #1
+	@ Shift to final position, add sign to result.
+	orr	xh, yl, r6, lsl #11
+	orr	xh, xh, r5, lsr #21
+	mov	xl, r5, lsl #11
+	orr	xl, xl, lr, lsr #21
+	mov	lr, lr, lsl #11
+
+	@ Check exponent range for under/overflow.
+	subs	ip, r4, #(254 - 1)
+	cmphi	ip, #0x700
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	cmp	lr, #0x80000000
+	moveqs	lr, xl, lsr #1
+	adcs	xl, xl, #0
+	adc	xh, xh, r4, lsl #20
 	RETLDM	"r4, r5, r6"
 
-	@ Result is 0, but determine sign anyway.
-LSYM(Lml_z):
+	@ Multiplication by 0x1p*: let's shortcut a lot of code.
+LSYM(Lml_1):
+	and	r6, r6, #0x80000000
+	orr	xh, r6, xh
+	orr	xl, xl, yl
 	eor	xh, xh, yh
-LSYM(Ldv_z):
-	bic	xh, xh, #0x7fffffff
-	mov	xl, #0
-	RETLDM	"r4, r5, r6"
+	subs	r4, r4, ip, lsr #1
+	rsbgts	r5, r4, ip
+	orrgt	xh, xh, r4, lsl #20
+	RETLDM	"r4, r5, r6" gt
+
+	@ Under/overflow: fix things up for the code below.
+	orr	xh, xh, #0x00100000
+	mov	lr, #0
+	subs	r4, r4, #1
 
-	@ Check if denormalized result is possible, otherwise return signed 0.
 LSYM(Lml_u):
-	cmn	r4, #(53 << 19)
+	@ Overflow?
+	bgt	LSYM(Lml_o)
+
+	@ Check if denormalized result is possible, otherwise return signed 0.
+	cmn	r4, #(53 + 1)
 	movle	xl, #0
 	bicle	xh, xh, #0x7fffffff
 	RETLDM	"r4, r5, r6" le
 
 	@ Find out proper shift value.
-LSYM(Lml_r):
-	mvn	r4, r4, asr #19
-	subs	r4, r4, #30
+	rsb	r4, r4, #0
+	subs	r4, r4, #32
 	bge	2f
 	adds	r4, r4, #12
 	bgt	1f
@@ -721,14 +715,12 @@
 	mov	r3, xl, lsl r5
 	mov	xl, xl, lsr r4
 	orr	xl, xl, xh, lsl r5
-	movs	xh, xh, lsl #1
-	mov	xh, xh, lsr r4
-	mov	xh, xh, rrx
+	and	r2, xh, #0x80000000
+	bic	xh, xh, #0x80000000
 	adds	xl, xl, r3, lsr #31
-	adc	xh, xh, #0
-	teq	lr, #0
-	teqeq	r3, #0x80000000
-	biceq	xl, xl, #1
+	adc	xh, r2, xh, lsr r4
+	orrs	lr, lr, r3, lsl #1
+	biceq	xl, xl, r3, lsr #31
 	RETLDM	"r4, r5, r6"
 
 	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
@@ -741,54 +733,71 @@
 	bic	xh, xh, #0x7fffffff
 	adds	xl, xl, r3, lsr #31
 	adc	xh, xh, #0
-	teq	lr, #0
-	teqeq	r3, #0x80000000
-	biceq	xl, xl, #1
+	orrs	lr, lr, r3, lsl #1
+	biceq	xl, xl, r3, lsr #31
 	RETLDM	"r4, r5, r6"
 
 	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
 	@ from xh to xl.  Leftover bits are in r3-r6-lr for rounding.
 2:	rsb	r5, r4, #32
-	mov	r6, xl, lsl r5
+	orr	lr, lr, xl, lsl r5
 	mov	r3, xl, lsr r4
 	orr	r3, r3, xh, lsl r5
 	mov	xl, xh, lsr r4
 	bic	xh, xh, #0x7fffffff
 	bic	xl, xl, xh, lsr r4
 	add	xl, xl, r3, lsr #31
-	orrs	r6, r6, lr
-	teqeq	r3, #0x80000000
-	biceq	xl, xl, #1
+	orrs	lr, lr, r3, lsl #1
+	biceq	xl, xl, r3, lsr #31
 	RETLDM	"r4, r5, r6"
 
 	@ One or both arguments are denormalized.
 	@ Scale them leftwards and preserve sign bit.
 LSYM(Lml_d):
-	mov	lr, #0
 	teq	r4, #0
 	bne	2f
 	and	r6, xh, #0x80000000
 1:	movs	xl, xl, lsl #1
-	adc	xh, lr, xh, lsl #1
+	adc	xh, xh, xh
 	tst	xh, #0x00100000
-	subeq	r4, r4, #(1 << 19)
+	subeq	r4, r4, #1
 	beq	1b
 	orr	xh, xh, r6
 	teq	r5, #0
-	bne	LSYM(Lml_x)
+	movne	pc, lr
 2:	and	r6, yh, #0x80000000
 3:	movs	yl, yl, lsl #1
-	adc	yh, lr, yh, lsl #1
+	adc	yh, yh, yh
 	tst	yh, #0x00100000
-	subeq	r5, r5, #(1 << 20)
+	subeq	r5, r5, #1
 	beq	3b
 	orr	yh, yh, r6
-	b	LSYM(Lml_x)
+	mov	pc, lr
 
-	@ One or both args are INF or NAN.
 LSYM(Lml_s):
+	@ Isolate the INF and NAN cases away
+	teq	r4, ip
+	and	r5, ip, yh, lsr #20
+	teqne	r5, ip
+	beq	1f
+
+	@ Here, one or more arguments are either denormalized or zero.
 	orrs	r6, xl, xh, lsl #1
 	orrnes	r6, yl, yh, lsl #1
+	bne	LSYM(Lml_d)
+
+	@ Result is 0, but determine sign anyway.
+LSYM(Lml_z):
+	eor	xh, xh, yh
+	bic	xh, xh, #0x7fffffff
+	mov	xl, #0
+	RETLDM	"r4, r5, r6"
+
+1:	@ One or both args are INF or NAN.
+	orrs	r6, xl, xh, lsl #1
+	moveq	xl, yl
+	moveq	xh, yh
+	orrnes	r6, yl, yh, lsl #1
 	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
 	teq	r4, ip
 	bne	1f
@@ -797,6 +806,8 @@
 1:	teq	r5, ip
 	bne	LSYM(Lml_i)
 	orrs	r6, yl, yh, lsl #12
+	movne	xl, yl
+	movne	xh, yh
 	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
 
 	@ Result is INF, but we need to determine its sign.
@@ -811,9 +822,9 @@
 	mov	xl, #0
 	RETLDM	"r4, r5, r6"
 
-	@ Return NAN.
+	@ Return a quiet NAN.
 LSYM(Lml_n):
-	mov	xh, #0x7f000000
+	orr	xh, xh, #0x7f000000
 	orr	xh, xh, #0x00f80000
 	RETLDM	"r4, r5, r6"
 
@@ -825,41 +836,31 @@
 	
 	stmfd	sp!, {r4, r5, r6, lr}
 
-	@ Mask out exponents.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r4, xh, ip
-	and	r5, yh, ip
-
-	@ Trap any INF/NAN or zeroes.
-	teq	r4, ip
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	orr	ip, ip, #0x700
+	ands	r4, ip, xh, lsr #20
+	andnes	r5, ip, yh, lsr #20
+	teqne	r4, ip
 	teqne	r5, ip
-	orrnes	r6, xl, xh, lsl #1
-	orrnes	r6, yl, yh, lsl #1
-	beq	LSYM(Ldv_s)
+	bleq	LSYM(Ldv_s)
 
-	@ Shift exponents right one bit to make room for overflow bit.
-	@ If either of them is 0, scale denormalized arguments off line.
-	@ Then substract divisor exponent from dividend's.
-	movs	r4, r4, lsr #1
-	teqne	r5, #0
-	beq	LSYM(Ldv_d)
-LSYM(Ldv_x):
-	sub	r4, r4, r5, asr #1
+	@ Substract divisor exponent from dividend's.
+	sub	r4, r4, r5
 
 	@ Preserve final sign into lr.
 	eor	lr, xh, yh
 
 	@ Convert mantissa to unsigned integer.
 	@ Dividend -> r5-r6, divisor -> yh-yl.
-	mov	r5, #0x10000000
+	orrs	r5, yl, yh, lsl #12
+	mov	xh, xh, lsl #12
+	beq	LSYM(Ldv_1)
 	mov	yh, yh, lsl #12
+	mov	r5, #0x10000000
 	orr	yh, r5, yh, lsr #4
 	orr	yh, yh, yl, lsr #24
-	movs	yl, yl, lsl #8
-	mov	xh, xh, lsl #12
-	teqeq	yh, r5
-	beq	LSYM(Ldv_1)
+	mov	yl, yl, lsl #8
 	orr	r5, r5, xh, lsr #4
 	orr	r5, r5, xl, lsr #24
 	mov	r6, xl, lsl #8
@@ -868,21 +869,15 @@
 	and	xh, lr, #0x80000000
 
 	@ Ensure result will land to known bit position.
+	@ Apply exponent bias accordingly.
 	cmp	r5, yh
 	cmpeq	r6, yl
+	adc	r4, r4, #(255 - 2)
+	add	r4, r4, #0x300
 	bcs	1f
-	sub	r4, r4, #(1 << 19)
 	movs	yh, yh, lsr #1
 	mov	yl, yl, rrx
 1:
-	@ Apply exponent bias, check range for over/underflow.
-	add	r4, r4, #0x1f000000
-	add	r4, r4, #0x00f80000
-	cmn	r4, #(53 << 19)
-	ble	LSYM(Ldv_z)
-	cmp	r4, ip, lsr #1
-	bge	LSYM(Lml_o)
-
 	@ Perform first substraction to align result to a nibble.
 	subs	r6, r6, yl
 	sbc	r5, r5, yh
@@ -944,73 +939,42 @@
 	orreq	xh, xh, xl
 	moveq	xl, #0
 3:
-	@ Check if denormalized result is needed.
-	cmp	r4, #0
-	ble	LSYM(Ldv_u)
+	@ Check exponent range for under/overflow.
+	subs	ip, r4, #(254 - 1)
+	cmphi	ip, #0x700
+	bhi	LSYM(Lml_u)
 
-	@ Apply proper rounding.
+	@ Round the result, merge final exponent.
 	subs	ip, r5, yh
 	subeqs	ip, r6, yl
+	moveqs	ip, xl, lsr #1
 	adcs	xl, xl, #0
-	adc	xh, xh, #0
-	teq	ip, #0
-	biceq	xl, xl, #1
-
-	@ Add exponent to result.
-	bic	xh, xh, #0x00100000
-	orr	xh, xh, r4, lsl #1
+	adc	xh, xh, r4, lsl #20
 	RETLDM	"r4, r5, r6"
 
 	@ Division by 0x1p*: shortcut a lot of code.
 LSYM(Ldv_1):
 	and	lr, lr, #0x80000000
 	orr	xh, lr, xh, lsr #12
-	add	r4, r4, #0x1f000000
-	add	r4, r4, #0x00f80000
-	cmp	r4, ip, lsr #1
-	bge	LSYM(Lml_o)
-	cmp	r4, #0
-	orrgt	xh, xh, r4, lsl #1
+	adds	r4, r4, ip, lsr #1
+	rsbgts	r5, r4, ip
+	orrgt	xh, xh, r4, lsl #20
 	RETLDM	"r4, r5, r6" gt
 
-	cmn	r4, #(53 << 19)
-	ble	LSYM(Ldv_z)
 	orr	xh, xh, #0x00100000
 	mov	lr, #0
-	b	LSYM(Lml_r)
+	subs	r4, r4, #1
+	b	LSYM(Lml_u)
 
-	@ Result must be denormalized: put remainder in lr for
-	@ rounding considerations.
+	@ Result might need to be denormalized: put remainder bits
+	@ in lr for rounding considerations.
 LSYM(Ldv_u):
 	orr	lr, r5, r6
-	b	LSYM(Lml_r)
-
-	@ One or both arguments are denormalized.
-	@ Scale them leftwards and preserve sign bit.
-LSYM(Ldv_d):
-	mov	lr, #0
-	teq	r4, #0
-	bne	2f
-	and	r6, xh, #0x80000000
-1:	movs	xl, xl, lsl #1
-	adc	xh, lr, xh, lsl #1
-	tst	xh, #0x00100000
-	subeq	r4, r4, #(1 << 19)
-	beq	1b
-	orr	xh, xh, r6
-	teq	r5, #0
-	bne	LSYM(Ldv_x)
-2:	and	r6, yh, #0x80000000
-3:	movs	yl, yl, lsl #1
-	adc	yh, lr, yh, lsl #1
-	tst	yh, #0x00100000
-	subeq	r5, r5, #(1 << 20)
-	beq	3b
-	orr	yh, yh, r6
-	b	LSYM(Ldv_x)
+	b	LSYM(Lml_u)
 
 	@ One or both arguments is either INF, NAN or zero.
 LSYM(Ldv_s):
+	and	r5, ip, yh, lsr #20
 	teq	r4, ip
 	teqeq	r5, ip
 	beq	LSYM(Lml_n)		@ INF/NAN / INF/NAN -> NAN
@@ -1018,13 +982,23 @@
 	bne	1f
 	orrs	r4, xl, xh, lsl #12
 	bne	LSYM(Lml_n)		@ NAN / <anything> -> NAN
-	b	LSYM(Lml_i)		@ INF / <anything> -> INF
+	teq	r5, ip
+	bne	LSYM(Lml_i)		@ INF / <anything> -> INF
+	mov	xl, yl
+	mov	xh, yh
+	b	LSYM(Lml_n)		@ INF / (INF or NAN) -> NAN
 1:	teq	r5, ip
 	bne	2f
 	orrs	r5, yl, yh, lsl #12
-	bne	LSYM(Lml_n)		@ <anything> / NAN -> NAN
-	b	LSYM(Lml_z)		@ <anything> / INF -> 0
-2:	@ One or both arguments are 0.
+	beq	LSYM(Lml_z)		@ <anything> / INF -> 0
+	mov	xl, yl
+	mov	xh, yh
+	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
+2:	@ If both are non-zero, we need to normalize and resume above.
+	orrs	r6, xl, xh, lsl #1
+	orrnes	r6, yl, yh, lsl #1
+	bne	LSYM(Lml_d)
+	@ One or both arguments are 0.
 	orrs	r4, xl, xh, lsl #1
 	bne	LSYM(Lml_i)		@ <non_zero> / 0 -> INF
 	orrs	r5, yl, yh, lsl #1
@@ -1038,6 +1012,8 @@
 
 #ifdef L_cmpdf2
 
+@ Note: only r0 (return value) and ip are clobbered here.
+
 ARM_FUNC_START gtdf2
 ARM_FUNC_ALIAS gedf2 gtdf2
 	mov	ip, #-1
@@ -1053,15 +1029,13 @@
 ARM_FUNC_ALIAS eqdf2 cmpdf2
 	mov	ip, #1			@ how should we specify unordered here?
 
-1:	stmfd	sp!, {r4, r5, lr}
+1:	str	ip, [sp, #-4]
 
 	@ Trap any INF/NAN first.
-	mov	lr, #0x7f000000
-	orr	lr, lr, #0x00f00000
-	and	r4, xh, lr
-	and	r5, yh, lr
-	teq	r4, lr
-	teqne	r5, lr
+	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
+	mov	ip, yh, lsl #1
+	mvnnes	ip, ip, asr #21
 	beq	3f
 
 	@ Test for equality.
@@ -1071,37 +1045,37 @@
 	teqne	xh, yh			@ or xh == yh
 	teqeq	xl, yl			@ and xl == yl
 	moveq	r0, #0			@ then equal.
-	RETLDM	"r4, r5" eq
+	RETc(eq)
+
+	@ Clear C flag
+	cmn	r0, #0
 
-	@ Check for sign difference.
+	@ Compare sign, 
 	teq	xh, yh
-	movmi	r0, xh, asr #31
-	orrmi	r0, r0, #1
-	RETLDM	"r4, r5" mi
-
-	@ Compare exponents.
-	cmp	r4, r5
-
-	@ Compare mantissa if exponents are equal.
-	moveq	xh, xh, lsl #12
-	cmpeq	xh, yh, lsl #12
+
+	@ Compare values if same sign
+	cmppl	xh, yh
 	cmpeq	xl, yl
+
+	@ Result:
 	movcs	r0, yh, asr #31
 	mvncc	r0, yh, asr #31
 	orr	r0, r0, #1
-	RETLDM	"r4, r5"
+	RET
 
 	@ Look for a NAN.
-3:	teq	r4, lr
+3:	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
 	bne	4f
-	orrs	xl, xl, xh, lsl #12
+	orrs	ip, xl, xh, lsl #12
 	bne	5f			@ x is NAN
-4:	teq	r5, lr
+4:	mov	ip, yh, lsl #1
+	mvns	ip, ip, asr #21
 	bne	2b
-	orrs	yl, yl, yh, lsl #12
+	orrs	ip, yl, yh, lsl #12
 	beq	2b			@ y is not NAN
-5:	mov	r0, ip			@ return unordered code from ip
-	RETLDM	"r4, r5"
+5:	ldr	r0, [sp, #-4]		@ unordered return code
+	RET
 
 	FUNC_END gedf2
 	FUNC_END gtdf2
@@ -1112,6 +1086,7 @@
 	FUNC_END cmpdf2
 
 ARM_FUNC_START aeabi_cdrcmple
+
 	mov	ip, r0
 	mov	r0, r2
 	mov	r2, ip
@@ -1122,85 +1097,95 @@
 	
 ARM_FUNC_START aeabi_cdcmpeq
 ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq
+
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
-6:	stmfd	sp!, {r0, r1, r2, r3, lr}
+6:	stmfd	sp!, {r0, lr}
 	ARM_CALL cmpdf2
 	@ Set the Z flag correctly, and the C flag unconditionally.
 	cmp	 r0, #0
 	@ Clear the C flag if the return value was -1, indicating
 	@ that the first operand was smaller than the second.
 	cmnmi	 r0, #0
-	RETLDM   "r0, r1, r2, r3"
+	RETLDM   "r0"
+
 	FUNC_END aeabi_cdcmple
 	FUNC_END aeabi_cdcmpeq
+	FUNC_END aeabi_cdrcmple
 	
 ARM_FUNC_START	aeabi_dcmpeq
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cdcmple
 	moveq	r0, #1	@ Equal to.
 	movne	r0, #0	@ Less than, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_dcmpeq
 
 ARM_FUNC_START	aeabi_dcmplt
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cdcmple
 	movcc	r0, #1	@ Less than.
 	movcs	r0, #0	@ Equal to, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_dcmplt
 
 ARM_FUNC_START	aeabi_dcmple
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cdcmple
 	movls	r0, #1  @ Less than or equal to.
 	movhi	r0, #0	@ Greater than or unordered.
 	RETLDM
+
 	FUNC_END aeabi_dcmple
 
 ARM_FUNC_START	aeabi_dcmpge
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cdrcmple
 	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_dcmpge
 
 ARM_FUNC_START	aeabi_dcmpgt
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cdrcmple
 	movcc	r0, #1	@ Operand 2 is less than operand 1.
 	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
 			@ or they are unordered.
 	RETLDM
+
 	FUNC_END aeabi_dcmpgt
-		
+
 #endif /* L_cmpdf2 */
 
 #ifdef L_unorddf2
 
 ARM_FUNC_START unorddf2
 ARM_FUNC_ALIAS aeabi_dcmpun unorddf2
-	
-	str	lr, [sp, #-4]!
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	lr, xh, ip
-	teq	lr, ip
+
+	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
 	bne	1f
-	orrs	xl, xl, xh, lsl #12
+	orrs	ip, xl, xh, lsl #12
 	bne	3f			@ x is NAN
-1:	and	lr, yh, ip
-	teq	lr, ip
+1:	mov	ip, yh, lsl #1
+	mvns	ip, ip, asr #21
 	bne	2f
-	orrs	yl, yl, yh, lsl #12
+	orrs	ip, yl, yh, lsl #12
 	bne	3f			@ y is NAN
 2:	mov	r0, #0			@ arguments are ordered.
-	RETLDM
+	RET
 
 3:	mov	r0, #1			@ arguments are unordered.
-	RETLDM
+	RET
 
 	FUNC_END aeabi_dcmpun
 	FUNC_END unorddf2
@@ -1211,31 +1196,22 @@
 
 ARM_FUNC_START fixdfsi
 ARM_FUNC_ALIAS aeabi_d2iz fixdfsi
-	orrs	ip, xl, xh, lsl #1
-	beq	1f			@ value is 0.
-
-	mov	r3, r3, rrx		@ preserve C flag (the actual sign)
 
 	@ check exponent range.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r2, xh, ip
-	teq	r2, ip
-	beq	2f			@ value is INF or NAN
-	bic	ip, ip, #0x40000000
-	cmp	r2, ip
-	bcc	1f			@ value is too small
-	add	ip, ip, #(31 << 20)
-	cmp	r2, ip
-	bcs	3f			@ value is too large
-
-	rsb	r2, r2, ip
-	mov	ip, xh, lsl #11
-	orr	ip, ip, #0x80000000
-	orr	ip, ip, xl, lsr #21
-	mov	r2, r2, lsr #20
-	tst	r3, #0x80000000		@ the sign bit
-	mov	r0, ip, lsr r2
+	mov	r2, xh, lsl #1
+	adds	r2, r2, #(1 << 21)
+	bcs	2f			@ value is INF or NAN
+	bpl	1f			@ value is too small
+	mov	r3, #(0xfffffc00 + 31)
+	subs	r2, r3, r2, asr #21
+	bls	3f			@ value is too large
+
+	@ scale value
+	mov	r3, xh, lsl #11
+	orr	r3, r3, #0x80000000
+	orr	r3, r3, xl, lsr #21
+	tst	xh, #0x80000000		@ the sign bit
+	mov	r0, r3, lsr r2
 	rsbne	r0, r0, #0
 	RET
 
@@ -1243,8 +1219,8 @@
 	RET
 
 2:	orrs	xl, xl, xh, lsl #12
-	bne	4f			@ r0 is NAN.
-3:	ands	r0, r3, #0x80000000	@ the sign bit
+	bne	4f			@ x is NAN.
+3:	ands	r0, xh, #0x80000000	@ the sign bit
 	moveq	r0, #0x7fffffff		@ maximum signed positive si
 	RET
 
@@ -1260,29 +1236,22 @@
 
 ARM_FUNC_START fixunsdfsi
 ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi
-	orrs	ip, xl, xh, lsl #1
-	movcss	r0, #0			@ value is negative
-	RETc(eq)			@ or 0 (xl, xh overlap r0)
 
 	@ check exponent range.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r2, xh, ip
-	teq	r2, ip
-	beq	2f			@ value is INF or NAN
-	bic	ip, ip, #0x40000000
-	cmp	r2, ip
-	bcc	1f			@ value is too small
-	add	ip, ip, #(31 << 20)
-	cmp	r2, ip
-	bhi	3f			@ value is too large
-
-	rsb	r2, r2, ip
-	mov	ip, xh, lsl #11
-	orr	ip, ip, #0x80000000
-	orr	ip, ip, xl, lsr #21
-	mov	r2, r2, lsr #20
-	mov	r0, ip, lsr r2
+	movs	r2, xh, lsl #1
+	bcs	1f			@ value is negative
+	adds	r2, r2, #(1 << 21)
+	bcs	2f			@ value is INF or NAN
+	bpl	1f			@ value is too small
+	mov	r3, #(0xfffffc00 + 31)
+	subs	r2, r3, r2, asr #21
+	bmi	3f			@ value is too large
+
+	@ scale value
+	mov	r3, xh, lsl #11
+	orr	r3, r3, #0x80000000
+	orr	r3, r3, xl, lsr #21
+	mov	r0, r3, lsr r2
 	RET
 
 1:	mov	r0, #0
@@ -1305,91 +1274,61 @@
 
 ARM_FUNC_START truncdfsf2
 ARM_FUNC_ALIAS aeabi_d2f truncdfsf2
-	orrs	r2, xl, xh, lsl #1
-	moveq	r0, r2, rrx
-	RETc(eq)			@ value is 0.0 or -0.0
-	
+
 	@ check exponent range.
-	mov	ip, #0x7f000000
-	orr	ip, ip, #0x00f00000
-	and	r2, ip, xh
-	teq	r2, ip
-	beq	2f			@ value is INF or NAN
-	bic	xh, xh, ip
-	cmp	r2, #(0x380 << 20)
-	bls	4f			@ value is too small
-
-	@ shift and round mantissa
-1:	movs	r3, xl, lsr #29
-	adc	r3, r3, xh, lsl #3
-
-	@ if halfway between two numbers, round towards LSB = 0.
-	mov	xl, xl, lsl #3
-	teq	xl, #0x80000000
-	biceq	r3, r3, #1
-
-	@ rounding might have created an extra MSB.  If so adjust exponent.
-	tst	r3, #0x00800000
-	addne	r2, r2, #(1 << 20)
-	bicne	r3, r3, #0x00800000
-
-	@ check exponent for overflow
-	mov	ip, #(0x400 << 20)
-	orr	ip, ip, #(0x07f << 20)
-	cmp	r2, ip
-	bcs	3f			@ overflow
-
-	@ adjust exponent, merge with sign bit and mantissa.
-	movs	xh, xh, lsl #1
-	mov	r2, r2, lsl #4
-	orr	r0, r3, r2, rrx
-	eor	r0, r0, #0x40000000
+	mov	r2, xh, lsl #1
+	subs	r3, r2, #((1023 - 127) << 21)
+	subcss	ip, r3, #(1 << 21)
+	rsbcss	ip, ip, #(254 << 21)
+	bls	2f			@ value is out of range
+
+1:	@ shift and round mantissa
+	and	ip, xh, #0x80000000
+	mov	r2, xl, lsl #3
+	orr	xl, ip, xl, lsr #29
+	cmp	r2, #0x80000000
+	adc	r0, xl, r3, lsl #2
+	biceq	r0, r0, #1
 	RET
 
-2:	@ chech for NAN
-	orrs	xl, xl, xh, lsl #12
+2:	@ either overflow or underflow
+	tst	xh, #0x40000000
+	bne	3f			@ overflow
+
+	@ check if denormalized value is possible
+	adds	r2, r3, #(23 << 21)
+	andlt	r0, xh, #0x80000000	@ too small, return signed 0.
+	RETc(lt)
+
+	@ denormalize value so we can resume with the code above afterwards.
+	orr	xh, xh, #0x00100000
+	mov	r2, r2, lsr #21
+	rsb	r2, r2, #24
+	rsb	ip, r2, #32
+	movs	r3, xl, lsl ip
+	mov	xl, xl, lsr r2
+	orrne	xl, xl, #1		@ fold r3 for rounding considerations. 
+	mov	r3, xh, lsl #11
+	mov	r3, r3, lsr #11
+	orr	xl, xl, r3, lsl ip
+	mov	r3, r3, lsr r2
+	mov	r3, r3, lsl #1
+	b	1b
+
+3:	@ check for NAN
+	mvns	r3, r2, asr #21
+	bne	5f			@ simple overflow
+	orrs	r3, xl, xh, lsl #12
 	movne	r0, #0x7f000000
 	orrne	r0, r0, #0x00c00000
 	RETc(ne)			@ return NAN
 
-3:	@ return INF with sign
+5:	@ return INF with sign
 	and	r0, xh, #0x80000000
 	orr	r0, r0, #0x7f000000
 	orr	r0, r0, #0x00800000
 	RET
 
-4:	@ check if denormalized value is possible
-	subs	r2, r2, #((0x380 - 24) << 20)
-	andle	r0, xh, #0x80000000	@ too small, return signed 0.
-	RETc(le)
-	
-	@ denormalize value so we can resume with the code above afterwards.
-	orr	xh, xh, #0x00100000
-	mov	r2, r2, lsr #20
-	rsb	r2, r2, #25
-	cmp	r2, #20
-	bgt	6f
-
-	rsb	ip, r2, #32
-	mov	r3, xl, lsl ip
-	mov	xl, xl, lsr r2
-	orr	xl, xl, xh, lsl ip
-	movs	xh, xh, lsl #1
-	mov	xh, xh, lsr r2
-	mov	xh, xh, rrx
-5:	teq	r3, #0			@ fold r3 bits into the LSB
-	orrne	xl, xl, #1		@ for rounding considerations. 
-	mov	r2, #(0x380 << 20)	@ equivalent to the 0 float exponent
-	b	1b
-
-6:	rsb	r2, r2, #(12 + 20)
-	rsb	ip, r2, #32
-	mov	r3, xl, lsl r2
-	mov	xl, xl, lsr ip
-	orr	xl, xl, xh, lsl r2
-	and	xh, xh, #0x80000000
-	b	5b
-
 	FUNC_END aeabi_d2f
 	FUNC_END truncdfsf2
 
Index: gcc/config/arm/ieee754-sf.S
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/arm/ieee754-sf.S,v
retrieving revision 1.6
diff -u -r1.6 ieee754-sf.S
--- gcc/config/arm/ieee754-sf.S	1 Sep 2004 11:14:20 -0000	1.6
+++ gcc/config/arm/ieee754-sf.S	12 Oct 2004 16:04:31 -0000
@@ -42,7 +42,7 @@
 	
 ARM_FUNC_START negsf2
 ARM_FUNC_ALIAS aeabi_fneg negsf2
-	
+
 	eor	r0, r0, #0x80000000	@ flip sign bit
 	RET
 
@@ -56,11 +56,11 @@
 ARM_FUNC_START aeabi_frsub
 
 	eor	r0, r0, #0x80000000	@ flip sign bit of first arg
-	b	1f	
-	
+	b	1f
+
 ARM_FUNC_START subsf3
 ARM_FUNC_ALIAS aeabi_fsub subsf3
-	
+
 	eor	r1, r1, #0x80000000	@ flip sign bit of second arg
 #if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
 	b	1f			@ Skip Thumb-code prologue
@@ -68,32 +68,19 @@
 
 ARM_FUNC_START addsf3
 ARM_FUNC_ALIAS aeabi_fadd addsf3
-	
-1:	@ Compare both args, return zero if equal but the sign.
-	eor	r2, r0, r1
-	teq	r2, #0x80000000
-	beq	LSYM(Lad_z)
 
-	@ If first arg is 0 or -0, return second arg.
-	@ If second arg is 0 or -0, return first arg.
-	bics	r2, r0, #0x80000000
-	moveq	r0, r1
-	bicnes	r2, r1, #0x80000000
-	RETc(eq)
-
-	@ Mask out exponents.
-	mov	ip, #0xff000000
-	and	r2, r0, ip, lsr #1
-	and	r3, r1, ip, lsr #1
-
-	@ If either of them is 255, result will be INF or NAN
-	teq	r2, ip, lsr #1
-	teqne	r3, ip, lsr #1
-	beq	LSYM(Lad_i)
+1:	@ Look for zeroes, equal values, INF, or NAN.
+	movs	r2, r0, lsl #1
+	movnes	r3, r1, lsl #1
+	teqne	r2, r3
+	mvnnes	ip, r2, asr #24
+	mvnnes	ip, r3, asr #24
+	beq	LSYM(Lad_s)
 
 	@ Compute exponent difference.  Make largest exponent in r2,
 	@ corresponding arg in r0, and positive exponent difference in r3.
-	subs	r3, r3, r2
+	mov	r2, r2, lsr #24
+	rsbs	r3, r2, r3, lsr #24
 	addgt	r2, r2, r3
 	eorgt	r1, r0, r1
 	eorgt	r0, r1, r0
@@ -103,7 +90,7 @@
 	@ If exponent difference is too large, return largest argument
 	@ already in r0.  We need up to 25 bit to handle proper rounding
 	@ of 0x1p25 - 1.1.
-	cmp	r3, #(25 << 23)
+	cmp	r3, #25
 	RETc(hi)
 
 	@ Convert mantissa to signed integer.
@@ -122,25 +109,17 @@
 	beq	LSYM(Lad_d)
 LSYM(Lad_x):
 
-	@ Scale down second arg with exponent difference.
-	@ Apply shift one bit left to first arg and the rest to second arg
-	@ to simplify things later, but only if exponent does not become 0.
-	movs	r3, r3, lsr #23
-	teqne	r2, #(1 << 23)
-	movne	r0, r0, lsl #1
-	subne	r2, r2, #(1 << 23)
-	subne	r3, r3, #1
+	@ Compensate for the exponent overlapping the mantissa MSB added later
+	sub	r2, r2, #1
 
-	@ Shift second arg into ip, keep leftover bits into r1.
-	mov	ip, r1, asr r3
+	@ Shift and add second arg to first arg in r0.
+	@ Keep leftover bits into r1.
+	adds	r0, r0, r1, asr r3
 	rsb	r3, r3, #32
 	mov	r1, r1, lsl r3
 
-	add	r0, r0, ip		@ the actual addition
-
-	@ We now have a 64 bit result in r0-r1.
-	@ Keep absolute value in r0-r1, sign in r3.
-	ands	r3, r0, #0x80000000
+	@ Keep absolute value in r0-r1, sign in r3 (the n bit was set above)
+	and	r3, r0, #0x80000000
 	bpl	LSYM(Lad_p)
 	rsbs	r1, r1, #0
 	rsc	r0, r0, #0
@@ -148,103 +127,117 @@
 	@ Determine how to normalize the result.
 LSYM(Lad_p):
 	cmp	r0, #0x00800000
-	bcc	LSYM(Lad_l)
+	bcc	LSYM(Lad_a)
 	cmp	r0, #0x01000000
-	bcc	LSYM(Lad_r0)
-	cmp	r0, #0x02000000
-	bcc	LSYM(Lad_r1)
+	bcc	LSYM(Lad_e)
 
 	@ Result needs to be shifted right.
 	movs	r0, r0, lsr #1
 	mov	r1, r1, rrx
-	add	r2, r2, #(1 << 23)
-LSYM(Lad_r1):
-	movs	r0, r0, lsr #1
-	mov	r1, r1, rrx
-	add	r2, r2, #(1 << 23)
-
-	@ Our result is now properly aligned into r0, remaining bits in r1.
-	@ Round with MSB of r1. If halfway between two numbers, round towards
-	@ LSB of r0 = 0. 
-LSYM(Lad_r0):
-	add	r0, r0, r1, lsr #31
-	teq	r1, #0x80000000
-	biceq	r0, r0, #1
-
-	@ Rounding may have added a new MSB.  Adjust exponent.
-	@ That MSB will be cleared when exponent is merged below.
-	tst	r0, #0x01000000
-	addne	r2, r2, #(1 << 23)
+	add	r2, r2, #1
 
 	@ Make sure we did not bust our exponent.
-	cmp	r2, #(254 << 23)
-	bhi	LSYM(Lad_o)
+	cmp	r2, #254
+	bhs	LSYM(Lad_o)
 
+	@ Our result is now properly aligned into r0, remaining bits in r1.
 	@ Pack final result together.
+	@ Round with MSB of r1. If halfway between two numbers, round towards
+	@ LSB of r0 = 0. 
 LSYM(Lad_e):
-	bic	r0, r0, #0x01800000
-	orr	r0, r0, r2
+	cmp	r1, #0x80000000
+	adc	r0, r0, r2, lsl #23
+	biceq	r0, r0, #1
 	orr	r0, r0, r3
 	RET
 
-	@ Result must be shifted left.
-	@ No rounding necessary since r1 will always be 0.
+	@ Result must be shifted left and exponent adjusted.
+LSYM(Lad_a):
+	movs	r1, r1, lsl #1
+	adc	r0, r0, r0
+	tst	r0, #0x00800000
+	sub	r2, r2, #1
+	bne	LSYM(Lad_e)
+	
+	@ No rounding necessary since r1 will always be 0 at this point.
 LSYM(Lad_l):
 
 #if __ARM_ARCH__ < 5
 
 	movs	ip, r0, lsr #12
 	moveq	r0, r0, lsl #12
-	subeq	r2, r2, #(12 << 23)
+	subeq	r2, r2, #12
 	tst	r0, #0x00ff0000
 	moveq	r0, r0, lsl #8
-	subeq	r2, r2, #(8 << 23)
+	subeq	r2, r2, #8
 	tst	r0, #0x00f00000
 	moveq	r0, r0, lsl #4
-	subeq	r2, r2, #(4 << 23)
+	subeq	r2, r2, #4
 	tst	r0, #0x00c00000
 	moveq	r0, r0, lsl #2
-	subeq	r2, r2, #(2 << 23)
-	tst	r0, #0x00800000
-	moveq	r0, r0, lsl #1
-	subeq	r2, r2, #(1 << 23)
-	cmp	r2, #0
-	bgt	LSYM(Lad_e)
+	subeq	r2, r2, #2
+	cmp	r0, #0x00800000
+	movcc	r0, r0, lsl #1
+	sbcs	r2, r2, #0
 
 #else
 
 	clz	ip, r0
 	sub	ip, ip, #8
+	subs	r2, r2, ip
 	mov	r0, r0, lsl ip
-	subs	r2, r2, ip, lsl #23
-	bgt	LSYM(Lad_e)
 
 #endif
 
-	@ Exponent too small, denormalize result.
-	mvn	r2, r2, asr #23
-	add	r2, r2, #2
-	orr	r0, r3, r0, lsr r2
+	@ Final result with sign
+	@ If exponent negative, denormalize result.
+	addge	r0, r0, r2, lsl #23
+	rsblt	r2, r2, #0
+	orrge	r0, r0, r3
+	orrlt	r0, r3, r0, lsr r2
 	RET
 
 	@ Fixup and adjust bit position for denormalized arguments.
 	@ Note that r2 must not remain equal to 0.
 LSYM(Lad_d):
 	teq	r2, #0
-	eoreq	r0, r0, #0x00800000
-	addeq	r2, r2, #(1 << 23)
 	eor	r1, r1, #0x00800000
-	subne	r3, r3, #(1 << 23)
+	eoreq	r0, r0, #0x00800000
+	addeq	r2, r2, #1
+	subne	r3, r3, #1
 	b	LSYM(Lad_x)
 
-	@ Result is x - x = 0, unless x is INF or NAN.
-LSYM(Lad_z):
-	mov	ip, #0xff000000
-	and	r2, r0, ip, lsr #1
-	teq	r2, ip, lsr #1
-	moveq	r0, ip, asr #2
+LSYM(Lad_s):
+	mov	r3, r1, lsl #1
+
+	mvns	ip, r2, asr #24
+	mvnnes	ip, r3, asr #24
+	beq	LSYM(Lad_i)
+
+	teq	r2, r3
+	beq	1f
+
+	@ Result is x + 0.0 = x or 0.0 + y = y.
+	teq	r2, #0
+	moveq	r0, r1
+	RET
+
+1:	teq	r0, r1
+
+	@ Result is x - x = 0.
 	movne	r0, #0
+	RETc(ne)
+
+	@ Result is x + x = 2x.
+	tst	r2, #0xff000000
+	bne	2f
+	movs	r0, r0, lsl #1
+	orrcs	r0, r0, #0x80000000
 	RET
+2:	adds	r2, r2, #(2 << 24)
+	addcc	r0, r0, #(1 << 23)
+	RETc(cc)
+	and	r3, r0, #0x80000000
 
 	@ Overflow: return INF.
 LSYM(Lad_o):
@@ -257,16 +250,16 @@
 	@   if r1 != INF/NAN: return r0 (which is INF/NAN)
 	@   if r0 or r1 is NAN: return NAN
 	@   if opposite sign: return NAN
-	@   return r0 (which is INF or -INF)
+	@   otherwise return r0 (which is INF or -INF)
 LSYM(Lad_i):
-	teq	r2, ip, lsr #1
+	mvns	r2, r2, asr #24
 	movne	r0, r1
-	teqeq	r3, ip, lsr #1
-	RETc(ne)
+	mvneqs	r3, r3, asr #24
+	movne	r1, r0
 	movs	r2, r0, lsl #9
-	moveqs	r2, r1, lsl #9
+	moveqs	r3, r1, lsl #9
 	teqeq	r0, r1
-	orrne	r0, r3, #0x00400000	@ NAN
+	orrne	r0, r0, #0x00400000	@ quiet NAN
 	RET
 
 	FUNC_END aeabi_frsub
@@ -287,28 +280,17 @@
 	ands	r3, r0, #0x80000000
 	rsbmi	r0, r0, #0
 
-1:	teq	r0, #0
+1:	movs	ip, r0
 	RETc(eq)
 
-3:
-	mov	r1, #0
-	mov	r2, #((127 + 23) << 23)
-	tst	r0, #0xfc000000
-	beq	LSYM(Lad_p)
-
-	@ We need to scale the value a little before branching to code above.
-	tst	r0, #0xf0000000
-4:
-	orrne	r1, r1, r0, lsl #28
-	movne	r0, r0, lsr #4
-	addne	r2, r2, #(4 << 23)
-	tst	r0, #0x0c000000
-	beq	LSYM(Lad_p)
-	mov	r1, r1, lsr #2
-	orr	r1, r1, r0, lsl #30
-	mov	r0, r0, lsr #2
-	add	r2, r2, #(2 << 23)
-	b	LSYM(Lad_p)
+	@ Add initial exponent to sign
+	orr	r3, r3, #((127 + 23) << 23)
+
+	.ifnc	ah, r0
+	mov	ah, r0
+	.endif
+	mov	al, #0
+	b	2f
 
 	FUNC_END aeabi_i2f
 	FUNC_END floatsisf
@@ -317,22 +299,15 @@
 
 ARM_FUNC_START floatundisf
 ARM_FUNC_ALIAS aeabi_ul2f floatundisf
+
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	mvfeqs	f0, #0.0
 #endif
 	RETc(eq)
-	
-#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
-	@ For hard FPA code we want to return via the tail below so that
-	@ we can return the result in f0 as well as in r0 for backwards
-	@ compatibility.
-	str	lr, [sp, #-4]!
-	adr	lr, 4f
-#endif
 
 	mov	r3, #0
-	b	2f
+	b	1f
 
 ARM_FUNC_START floatdisf
 ARM_FUNC_ALIAS aeabi_l2f floatdisf
@@ -342,78 +317,80 @@
 	mvfeqs	f0, #0.0
 #endif
 	RETc(eq)
-	
+
+	ands	r3, ah, #0x80000000	@ sign bit in r3
+	bpl	1f
+	rsbs	al, al, #0
+	rsc	ah, ah, #0
+1:
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	@ For hard FPA code we want to return via the tail below so that
 	@ we can return the result in f0 as well as in r0 for backwards
 	@ compatibility.
 	str	lr, [sp, #-4]!
-	adr	lr, 4f
+	adr	lr, LSYM(f0_ret)
 #endif
-	ands	r3, ah, #0x80000000	@ sign bit in r3
-	bpl	2f
-	rsbs	al, al, #0
-	rsc	ah, ah, #0
-2:
+
 	movs	ip, ah
-#ifdef __ARMEB__
-	moveq	r0, al
-#endif
-	beq	3b
-	mov	r2, #((127 + 23 + 32) << 23)	@ initial exponent
-#ifndef __ARMEB__
-	mov	r1, al
-	mov	r0, ip
-#endif
-	tst	r0, #0xfc000000
-	bne	3f
+	moveq	ip, al
+
+	@ Add initial exponent to sign
+	orr	r3, r3, #((127 + 23 + 32) << 23)
+	subeq	r3, r3, #(32 << 23)
+2:	sub	r3, r3, #(1 << 23)
 
 #if __ARM_ARCH__ < 5
-	cmp	r0, #(1 << 13)
-	movlo	ip, #13
-	movlo	r0, r0, lsl #13
-	movhs	ip, #0
-	tst	r0, #0x03fc0000
-	addeq	ip, ip, #8
-	moveq	r0, r0, lsl #8
-	tst	r0, #0x03c00000
-	addeq	ip, ip, #4
-	moveq	r0, r0, lsl #4
-	tst	r0, #0x03000000
-	addeq	ip, ip, #2
-	moveq	r0, r0, lsl #2
+
+	mov	r2, #23
+	cmp	ip, #(1 << 16)
+	movhs	ip, ip, lsr #16
+	subhs	r2, r2, #16
+	cmp	ip, #(1 << 8)
+	movhs	ip, ip, lsr #8
+	subhs	r2, r2, #8
+	cmp	ip, #(1 << 4)
+	movhs	ip, ip, lsr #4
+	subhs	r2, r2, #4
+	cmp	ip, #(1 << 2)
+	subhs	r2, r2, #2
+	sublo	r2, r2, ip, lsr #1
+	subs	r2, r2, ip, lsr #3
+
 #else
-	clz	ip, r0
-	sub	ip, ip, #6
-	mov	r0, r0, lsl ip
+
+	clz	r2, ip
+	subs	r2, r2, #8
+
 #endif
-	sub	r2, r2, ip, lsl #23
-	rsb	ip, ip, #32
-	orr	r0, r0, r1, lsr ip
-	rsb	ip, ip, #32
-	mov	r1, r1, asl ip
-	@ At this point we no-longer care about the precise value in r1, only
-	@ whether only the top bit is set, or if the top bit and some others
-	@ are set.
-	and	ip, r1, #0xff
-	orr	r1, r1, ip, lsl #8
-	b	LSYM(Lad_p)
-3:
-	@ We need to scale the value a little before branching to code above.
-	@ At this point we no-longer care about the precise value in r1, only
-	@ whether only the top bit is set, or if the top bit and some others
-	@ are set.
-	and	ip, r1, #0xff
-	orr	r1, r1, ip, lsl #8
-	tst	r0, #0xf0000000
-	movne	r1, r1, lsr #4
-	b	4b
+
+	sub	r3, r3, r2, lsl #23
+	blt	3f
+
+	add	r3, r3, ah, lsl r2
+	mov	ip, al, lsl r2
+	rsb	r2, r2, #32
+	cmp	ip, #0x80000000
+	adc	r0, r3, al, lsr r2
+	biceq	r0, r0, #1
+	RET
+
+3:	add	r2, r2, #32
+	mov	ip, ah, lsl r2
+	rsb	r2, r2, #32
+	orrs	al, al, ip, lsl #1
+	adc	r0, r3, ah, lsr r2
+	biceq	r0, r0, ip, lsr #31
+	RET
+
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
-4:
+
+LSYM(f0_ret)
 	str	r0, [sp, #-4]!
 	ldfs	f0, [sp], #4
 	RETLDM
+
 #endif
+
 	FUNC_END floatdisf
 	FUNC_END aeabi_l2f
 	FUNC_END floatundisf
@@ -425,139 +402,117 @@
 
 ARM_FUNC_START mulsf3
 ARM_FUNC_ALIAS aeabi_fmul mulsf3
-	
-	@ Mask out exponents.
-	mov	ip, #0xff000000
-	and	r2, r0, ip, lsr #1
-	and	r3, r1, ip, lsr #1
-
-	@ Trap any INF/NAN.
-	teq	r2, ip, lsr #1
-	teqne	r3, ip, lsr #1
-	beq	LSYM(Lml_s)
-
-	@ Trap any multiplication by 0.
-	bics	ip, r0, #0x80000000
-	bicnes	ip, r1, #0x80000000
-	beq	LSYM(Lml_z)
 
-	@ Shift exponents right one bit to make room for overflow bit.
-	@ If either of them is 0, scale denormalized arguments off line.
-	@ Then add both exponents together.
-	movs	r2, r2, lsr #1
-	teqne	r3, #0
-	beq	LSYM(Lml_d)
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	ands	r2, ip, r0, lsr #23
+	andnes	r3, ip, r1, lsr #23
+	teqne	r2, ip
+	teqne	r3, ip
+	beq	LSYM(Lml_s)
 LSYM(Lml_x):
-	add	r2, r2, r3, asr #1
 
-	@ Preserve final sign in r2 along with exponent for now.
-	teq	r0, r1
-	orrmi	r2, r2, #0x8000
+	@ Add exponents together
+	add	r2, r2, r3
+
+	@ Determine final sign.
+	eor	ip, r0, r1
 
 	@ Convert mantissa to unsigned integer.
-	bic	r0, r0, #0xff000000
-	bic	r1, r1, #0xff000000
-	orr	r0, r0, #0x00800000
-	orr	r1, r1, #0x00800000
+	@ If power of two, branch to a separate path.
+	@ Make up for final alignment.
+	movs	r0, r0, lsl #9
+	movnes	r1, r1, lsl #9
+	beq	LSYM(Lml_1)
+	mov	r3, #0x08000000
+	orr	r0, r3, r0, lsr #5
+	orr	r1, r3, r1, lsr #5
 
 #if __ARM_ARCH__ < 4
 
+	@ Put sign bit in r3, which will be restored into r0 later.
+	and	r3, ip, #0x80000000
+
 	@ Well, no way to make it shorter without the umull instruction.
-	@ We must perform that 24 x 24 -> 48 bit multiplication by hand.
-	stmfd	sp!, {r4, r5}
+	stmfd	sp!, {r3, r4, r5}
 	mov	r4, r0, lsr #16
 	mov	r5, r1, lsr #16
-	bic	r0, r0, #0x00ff0000
-	bic	r1, r1, #0x00ff0000
+	bic	r0, r0, r4, lsl #16
+	bic	r1, r1, r5, lsl #16
 	mul	ip, r4, r5
 	mul	r3, r0, r1
 	mul	r0, r5, r0
 	mla	r0, r4, r1, r0
 	adds	r3, r3, r0, lsl #16
-	adc	ip, ip, r0, lsr #16
-	ldmfd	sp!, {r4, r5}
+	adc	r1, ip, r0, lsr #16
+	ldmfd	sp!, {r0, r4, r5}
 
 #else
 
-	umull	r3, ip, r0, r1		@ The actual multiplication.
+	@ The actual multiplication.
+	umull	r3, r1, r0, r1
+
+	@ Put final sign in r0.
+	and	r0, ip, #0x80000000
 
 #endif
 
-	@ Put final sign in r0.
-	mov	r0, r2, lsl #16
-	bic	r2, r2, #0x8000
+	@ Adjust result upon the MSB position.
+	cmp	r1, #(1 << 23)
+	movcc	r1, r1, lsl #1
+	orrcc	r1, r1, r3, lsr #31
+	movcc	r3, r3, lsl #1
 
-	@ Adjust result if one extra MSB appeared.
-	@ The LSB may be lost but this never changes the result in this case.
-	tst	ip, #(1 << 15)
-	addne	r2, r2, #(1 << 22)
-	movnes	ip, ip, lsr #1
-	movne	r3, r3, rrx
-
-	@ Apply exponent bias, check range for underflow.
-	subs	r2, r2, #(127 << 22)
-	ble	LSYM(Lml_u)
-
-	@ Scale back to 24 bits with rounding.
-	@ r0 contains sign bit already.
-	orrs	r0, r0, r3, lsr #23
-	adc	r0, r0, ip, lsl #9
-
-	@ If halfway between two numbers, rounding should be towards LSB = 0.
-	mov	r3, r3, lsl #9
-	teq	r3, #0x80000000
-	biceq	r0, r0, #1
+	@ Add sign to result.
+	orr	r0, r0, r1
 
-	@ Note: rounding may have produced an extra MSB here.
-	@ The extra bit is cleared before merging the exponent below.
-	tst	r0, #0x01000000
-	addne	r2, r2, #(1 << 22)
-
-	@ Check for exponent overflow
-	cmp	r2, #(255 << 22)
-	bge	LSYM(Lml_o)
-
-	@ Add final exponent.
-	bic	r0, r0, #0x01800000
-	orr	r0, r0, r2, lsl #1
+	@ Apply exponent bias, check for under/overflow.
+	sbc	r2, r2, #127
+	cmp	r2, #(254 - 1)
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	cmp	r3, #0x80000000
+	adc	r0, r0, r2, lsl #23
+	biceq	r0, r0, #1
 	RET
 
-	@ Result is 0, but determine sign anyway.
-LSYM(Lml_z):
-	eor	r0, r0, r1
-	bic	r0, r0, #0x7fffffff
-	RET
+	@ Multiplication by 0x1p*: let's shortcut a lot of code.
+LSYM(Lml_1):
+	teq	r0, #0
+	and	ip, ip, #0x80000000
+	moveq	r1, r1, lsl #9
+	orr	r0, ip, r0, lsr #9
+	orr	r0, r0, r1, lsr #9
+	subs	r2, r2, #127
+	rsbgts	r3, r2, #255
+	orrgt	r0, r0, r2, lsl #23
+	RETc(gt)
+
+	@ Under/overflow: fix things up for the code below.
+	orr	r0, r0, #0x00800000
+	mov	r3, #0
+	subs	r2, r2, #1
 
-	@ Check if denormalized result is possible, otherwise return signed 0.
 LSYM(Lml_u):
-	cmn	r2, #(24 << 22)
-	RETc(le)
+	@ Overflow?
+	bgt	LSYM(Lml_o)
 
-	@ Find out proper shift value.
-	mvn	r1, r2, asr #22
-	subs	r1, r1, #7
-	bgt	LSYM(Lml_ur)
-
-	@ Shift value left, round, etc.
-	add	r1, r1, #32
-	orrs	r0, r0, r3, lsr r1
-	rsb	r1, r1, #32
-	adc	r0, r0, ip, lsl r1
-	mov	ip, r3, lsl r1
-	teq	ip, #0x80000000
-	biceq	r0, r0, #1
-	RET
+	@ Check if denormalized result is possible, otherwise return signed 0.
+	cmn	r2, #(24 + 1)
+	bicle	r0, r0, #0x7fffffff
+	RETc(le)
 
 	@ Shift value right, round, etc.
-	@ Note: r1 must not be 0 otherwise carry does not get set.
-LSYM(Lml_ur):
-	orrs	r0, r0, ip, lsr r1
+	rsb	r2, r2, #0
+	movs	r1, r0, lsl #1
+	mov	r1, r1, lsr r2
+	rsb	r2, r2, #32
+	mov	ip, r0, lsl r2
+	movs	r0, r1, rrx
 	adc	r0, r0, #0
-	rsb	r1, r1, #32
-	mov	ip, ip, lsl r1
-	teq	r3, #0
-	teqeq	ip, #0x80000000
-	biceq	r0, r0, #1
+	orrs	r3, r3, ip, lsl #1
+	biceq	r0, r0, ip, lsr #31
 	RET
 
 	@ One or both arguments are denormalized.
@@ -567,32 +522,51 @@
 	and	ip, r0, #0x80000000
 1:	moveq	r0, r0, lsl #1
 	tsteq	r0, #0x00800000
-	subeq	r2, r2, #(1 << 22)
+	subeq	r2, r2, #1
 	beq	1b
 	orr	r0, r0, ip
 	teq	r3, #0
 	and	ip, r1, #0x80000000
 2:	moveq	r1, r1, lsl #1
 	tsteq	r1, #0x00800000
-	subeq	r3, r3, #(1 << 23)
+	subeq	r3, r3, #1
 	beq	2b
 	orr	r1, r1, ip
 	b	LSYM(Lml_x)
 
-	@ One or both args are INF or NAN.
 LSYM(Lml_s):
+	@ Isolate the INF and NAN cases away
+	and	r3, ip, r1, lsr #23
+	teq	r2, ip
+	teqne	r3, ip
+	beq	1f
+
+	@ Here, one or more arguments are either denormalized or zero.
+	bics	ip, r0, #0x80000000
+	bicnes	ip, r1, #0x80000000
+	bne	LSYM(Lml_d)
+
+	@ Result is 0, but determine sign anyway.
+LSYM(Lml_z):
+	eor	r0, r0, r1
+	bic	r0, r0, #0x7fffffff
+	RET
+
+1:	@ One or both args are INF or NAN.
 	teq	r0, #0x0
-	teqne	r1, #0x0
 	teqne	r0, #0x80000000
+	moveq	r0, r1
+	teqne	r1, #0x0
 	teqne	r1, #0x80000000
 	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
-	teq	r2, ip, lsr #1
+	teq	r2, ip
 	bne	1f
 	movs	r2, r0, lsl #9
 	bne	LSYM(Lml_n)		@ NAN * <anything> -> NAN
-1:	teq	r3, ip, lsr #1
+1:	teq	r3, ip
 	bne	LSYM(Lml_i)
 	movs	r3, r1, lsl #9
+	movne	r0, r1
 	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
 
 	@ Result is INF, but we need to determine its sign.
@@ -606,9 +580,9 @@
 	orr	r0, r0, #0x00800000
 	RET
 
-	@ Return NAN.
+	@ Return a quiet NAN.
 LSYM(Lml_n):
-	mov	r0, #0x7f000000
+	orr	r0, r0, #0x7f000000
 	orr	r0, r0, #0x00c00000
 	RET
 
@@ -617,37 +591,28 @@
 
 ARM_FUNC_START divsf3
 ARM_FUNC_ALIAS aeabi_fdiv divsf3
-	
-	@ Mask out exponents.
-	mov	ip, #0xff000000
-	and	r2, r0, ip, lsr #1
-	and	r3, r1, ip, lsr #1
-
-	@ Trap any INF/NAN or zeroes.
-	teq	r2, ip, lsr #1
-	teqne	r3, ip, lsr #1
-	bicnes	ip, r0, #0x80000000
-	bicnes	ip, r1, #0x80000000
-	beq	LSYM(Ldv_s)
 
-	@ Shift exponents right one bit to make room for overflow bit.
-	@ If either of them is 0, scale denormalized arguments off line.
-	@ Then substract divisor exponent from dividend's.
-	movs	r2, r2, lsr #1
-	teqne	r3, #0
-	beq	LSYM(Ldv_d)
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	ands	r2, ip, r0, lsr #23
+	andnes	r3, ip, r1, lsr #23
+	teqne	r2, ip
+	teqne	r3, ip
+	beq	LSYM(Ldv_s)
 LSYM(Ldv_x):
-	sub	r2, r2, r3, asr #1
+
+	@ Substract divisor exponent from dividend's
+	sub	r2, r2, r3
 
 	@ Preserve final sign into ip.
 	eor	ip, r0, r1
 
 	@ Convert mantissa to unsigned integer.
 	@ Dividend -> r3, divisor -> r1.
-	mov	r3, #0x10000000
 	movs	r1, r1, lsl #9
 	mov	r0, r0, lsl #9
 	beq	LSYM(Ldv_1)
+	mov	r3, #0x10000000
 	orr	r1, r3, r1, lsr #4
 	orr	r3, r3, r0, lsr #4
 
@@ -655,16 +620,10 @@
 	and	r0, ip, #0x80000000
 
 	@ Ensure result will land to known bit position.
+	@ Apply exponent bias accordingly.
 	cmp	r3, r1
-	subcc	r2, r2, #(1 << 22)
 	movcc	r3, r3, lsl #1
-
-	@ Apply exponent bias, check range for over/underflow.
-	add	r2, r2, #(127 << 22)
-	cmn	r2, #(24 << 22)
-	RETc(le)
-	cmp	r2, #(255 << 22)
-	bge	LSYM(Lml_o)
+	adc	r2, r2, #(127 - 2)
 
 	@ The actual division loop.
 	mov	ip, #0x00800000
@@ -684,44 +643,29 @@
 	movnes	ip, ip, lsr #4
 	bne	1b
 
-	@ Check if denormalized result is needed.
-	cmp	r2, #0
-	ble	LSYM(Ldv_u)
+	@ Check exponent for under/overflow.
+	cmp	r2, #(254 - 1)
+	bhi	LSYM(Lml_u)
 
-	@ Apply proper rounding.
+	@ Round the result, merge final exponent.
 	cmp	r3, r1
-	addcs	r0, r0, #1
+	adc	r0, r0, r2, lsl #23
 	biceq	r0, r0, #1
-
-	@ Add exponent to result.
-	bic	r0, r0, #0x00800000
-	orr	r0, r0, r2, lsl #1
 	RET
 
 	@ Division by 0x1p*: let's shortcut a lot of code.
 LSYM(Ldv_1):
 	and	ip, ip, #0x80000000
 	orr	r0, ip, r0, lsr #9
-	add	r2, r2, #(127 << 22)
-	cmp	r2, #(255 << 22)
-	bge	LSYM(Lml_o)
-	cmp	r2, #0
-	orrgt	r0, r0, r2, lsl #1
+	adds	r2, r2, #127
+	rsbgts	r3, r2, #255
+	orrgt	r0, r0, r2, lsl #23
 	RETc(gt)
-	cmn	r2, #(24 << 22)
-	movle	r0, ip
-	RETc(le)
+
 	orr	r0, r0, #0x00800000
 	mov	r3, #0
-
-	@ Result must be denormalized: prepare parameters to use code above.
-	@ r3 already contains remainder for rounding considerations.
-LSYM(Ldv_u):
-	bic	ip, r0, #0x80000000
-	and	r0, r0, #0x80000000
-	mvn	r1, r2, asr #22
-	add	r1, r1, #2
-	b	LSYM(Lml_ur)
+	subs	r2, r2, #1
+	b	LSYM(Lml_u)
 
 	@ One or both arguments are denormalized.
 	@ Scale them leftwards and preserve sign bit.
@@ -730,35 +674,40 @@
 	and	ip, r0, #0x80000000
 1:	moveq	r0, r0, lsl #1
 	tsteq	r0, #0x00800000
-	subeq	r2, r2, #(1 << 22)
+	subeq	r2, r2, #1
 	beq	1b
 	orr	r0, r0, ip
 	teq	r3, #0
 	and	ip, r1, #0x80000000
 2:	moveq	r1, r1, lsl #1
 	tsteq	r1, #0x00800000
-	subeq	r3, r3, #(1 << 23)
+	subeq	r3, r3, #1
 	beq	2b
 	orr	r1, r1, ip
 	b	LSYM(Ldv_x)
 
-	@ One or both arguments is either INF, NAN or zero.
+	@ One or both arguments are either INF, NAN, zero or denormalized.
 LSYM(Ldv_s):
-	mov	ip, #0xff000000
-	teq	r2, ip, lsr #1
-	teqeq	r3, ip, lsr #1
-	beq	LSYM(Lml_n)		@ INF/NAN / INF/NAN -> NAN
-	teq	r2, ip, lsr #1
+	and	r3, ip, r1, lsr #23
+	teq	r2, ip
 	bne	1f
 	movs	r2, r0, lsl #9
 	bne	LSYM(Lml_n)		@ NAN / <anything> -> NAN
-	b	LSYM(Lml_i)		@ INF / <anything> -> INF
-1:	teq	r3, ip, lsr #1
+	teq	r3, ip
+	bne	LSYM(Lml_i)		@ INF / <anything> -> INF
+	mov	r0, r1
+	b	LSYM(Lml_n)		@ INF / (INF or NAN) -> NAN
+1:	teq	r3, ip
 	bne	2f
 	movs	r3, r1, lsl #9
-	bne	LSYM(Lml_n)		@ <anything> / NAN -> NAN
-	b	LSYM(Lml_z)		@ <anything> / INF -> 0
-2:	@ One or both arguments are 0.
+	beq	LSYM(Lml_z)		@ <anything> / INF -> 0
+	mov	r0, r1
+	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
+2:	@ If both are non-zero, we need to normalize and resume above.
+	bics	ip, r0, #0x80000000
+	bicnes	ip, r1, #0x80000000
+	bne	LSYM(Ldv_d)
+	@ One or both arguments are zero.
 	bics	r2, r0, #0x80000000
 	bne	LSYM(Lml_i)		@ <non_zero> / 0 -> INF
 	bics	r3, r1, #0x80000000
@@ -789,85 +738,50 @@
 
 ARM_FUNC_START gtsf2
 ARM_FUNC_ALIAS gesf2 gtsf2
-	mov	r3, #-1
+	mov	ip, #-1
 	b	1f
 
 ARM_FUNC_START ltsf2
 ARM_FUNC_ALIAS lesf2 ltsf2
-	mov	r3, #1
+	mov	ip, #1
 	b	1f
 
 ARM_FUNC_START cmpsf2
 ARM_FUNC_ALIAS nesf2 cmpsf2
 ARM_FUNC_ALIAS eqsf2 cmpsf2
-	mov	r3, #1			@ how should we specify unordered here?
+	mov	ip, #1			@ how should we specify unordered here?
+
+1:	str	ip, [sp, #-4]
 
-	@ Both Inf and NaN have an exponent of 255.  Therefore, we
-	@ compute (r1 & 0x8f80000) || (r2 & 0x8f8000).
-1:	mov	ip, #0xff000000
-	and	r2, r1, ip, lsr #1
-	teq	r2, ip, lsr #1
-	and	r2, r0, ip, lsr #1
-	teqne	r2, ip, lsr #1
+	@ Trap any INF/NAN first.
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
+	mvnnes	ip, r3, asr #24
 	beq	3f
 
-	@ Test for equality.  The representations of +0.0 and -0.0
-	@ have all bits set to zero, except for the sign bit.  Since
-	@ 0.0 is equal to -0.0, we begin by testing 
-	@ ((r0 | r1) & ~0x8000000).
-2:	orr	r3, r0, r1
-	@ If the result of the bitwise and is zero, then the Z flag
-	@ will be set.  In any case, the C flag will be set.
-	bics	r3, r3, #0x80000000	@ either 0.0 or -0.0
-	teqne	r0, r1			@ or both the same
-	@ If the Z flag is set, the two operands were equal.  Return zero.
-	moveq	r0, #0
-	RETc(eq)
+	@ Compare values.
+	@ Note that 0.0 is equal to -0.0.
+2:	orrs	ip, r2, r3, lsr #1	@ test if both are 0, clear C flag
+	teqne	r0, r1			@ if not 0 compare sign
+	subpls	r0, r2, r3		@ if same sign compare values, set r0
+
+	@ Result:
+	movhi	r0, r1, asr #31
+	mvnlo	r0, r1, asr #31
+	orrne	r0, r0, #1
+	RET
 
-	@ Check for sign difference.  The N flag is set (due to the
-	@ use of teq above) if the sign bit is set on exactly one
-	@ of the operands.  Return the sign of the first operand.
-	movmi	r0, r0, asr #31
-	orrmi	r0, r0, #1
-	RETc(mi)
-
-	@ Compare exponents.
-	and	r3, r1, ip, lsr #1
-	cmp	r2, r3
-
-	@ Compare mantissa if exponents are equal
-	moveq	r0, r0, lsl #9
-	cmpeq	r0, r1, lsl #9
-
-	@ We know the operands cannot be equal at this point, so the
-	@ Z flag is clear.  The C flag is set if the first operand has
-	@ the greater exponent, or the exponents are equal and the 
-	@ first operand has the greater mantissa.  Therefore, if the C
-	@ flag is set, the first operand is greater iff the sign is
-	@ positive.  These next two instructions will put zero in
-	@ r0 if the first operand is greater, and -1 if the second
-	@ operand is greater.
-	movcs	r0, r1, asr #31
-	mvncc	r0, r1, asr #31
-	@ If r0 is 0, the first operand is greater, so return 1.  Leave
-	@ -1 unchanged.
-	orr	r0, r0, #1
-	RET
-
-	@ We know that at least one argument is either Inf or NaN.
-	@ Look for a NaN. 
-3:	and	r2, r1, ip, lsr #1
-	teq	r2, ip, lsr #1
+	@ Look for a NAN. 
+3:	mvns	ip, r2, asr #24
 	bne	4f
-	movs	r2, r1, lsl #9
-	bne	5f			@ r1 is NAN
-4:	and	r2, r0, ip, lsr #1
-	teq	r2, ip, lsr #1
-	bne	2b
 	movs	ip, r0, lsl #9
-	beq	2b			@ r0 is not NAN
-5:	@ The Z flag is clear at this point.
-	mov	r0, r3			@ return unordered code from r3.
+	bne	5f			@ r0 is NAN
+4:	mvns	ip, r3, asr #24
+	bne	2b
+	movs	ip, r1, lsl #9
+	beq	2b			@ r1 is not NAN
+5:	ldr	r0, [sp, #-4]		@ return unordered code.
 	RET
 
 	FUNC_END gesf2
@@ -879,13 +793,15 @@
 	FUNC_END cmpsf2
 
 ARM_FUNC_START aeabi_cfrcmple
+
 	mov	ip, r0
 	mov	r0, r1
 	mov	r1, ip
 	b	6f
-	
+
 ARM_FUNC_START aeabi_cfcmpeq
 ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
 6:	stmfd	sp!, {r0, r1, r2, r3, lr}
@@ -896,68 +812,79 @@
 	@ that the first operand was smaller than the second.
 	cmnmi	 r0, #0
 	RETLDM  "r0, r1, r2, r3"
+
 	FUNC_END aeabi_cfcmple
 	FUNC_END aeabi_cfcmpeq
-	
+	FUNC_END aeabi_cfrcmple
+
 ARM_FUNC_START	aeabi_fcmpeq
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	moveq	r0, #1	@ Equal to.
 	movne	r0, #0	@ Less than, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpeq
 
 ARM_FUNC_START	aeabi_fcmplt
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movcc	r0, #1	@ Less than.
 	movcs	r0, #0	@ Equal to, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmplt
 
 ARM_FUNC_START	aeabi_fcmple
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movls	r0, #1  @ Less than or equal to.
 	movhi	r0, #0	@ Greater than or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmple
 
 ARM_FUNC_START	aeabi_fcmpge
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpge
 
 ARM_FUNC_START	aeabi_fcmpgt
+
 	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movcc	r0, #1	@ Operand 2 is less than operand 1.
 	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
 			@ or they are unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpgt
-		
+
 #endif /* L_cmpsf2 */
 
 #ifdef L_unordsf2
 
 ARM_FUNC_START unordsf2
 ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
-	
-	mov	ip, #0xff000000
-	and	r2, r1, ip, lsr #1
-	teq	r2, ip, lsr #1
+
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
 	bne	1f
-	movs	r2, r1, lsl #9
-	bne	3f			@ r1 is NAN
-1:	and	r2, r0, ip, lsr #1
-	teq	r2, ip, lsr #1
-	bne	2f
-	movs	r2, r0, lsl #9
+	movs	ip, r0, lsl #9
 	bne	3f			@ r0 is NAN
+1:	mvns	ip, r3, asr #24
+	bne	2f
+	movs	ip, r1, lsl #9
+	bne	3f			@ r1 is NAN
 2:	mov	r0, #0			@ arguments are ordered.
 	RET
 3:	mov	r0, #1			@ arguments are unordered.
@@ -972,37 +899,35 @@
 
 ARM_FUNC_START fixsfsi
 ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
-	movs	r0, r0, lsl #1
-	RETc(eq)			@ value is 0.
-
-	mov	r1, r1, rrx		@ preserve C flag (the actual sign)
 
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	mov	r2, r0, lsl #1
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 31) << 24)
-	bcs	1f			@ value is too large
-
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	tst	r1, #0x80000000		@ the sign bit
-	mov	r0, r0, lsr r2
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bls	2f			@ value is too large
+
+	@ scale value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	tst	r0, #0x80000000		@ the sign bit
+	mov	r0, r3, lsr r2
 	rsbne	r0, r0, #0
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	ands	r0, r1, #0x80000000	@ the sign bit
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	ands	r0, r0, #0x80000000	@ the sign bit
 	moveq	r0, #0x7fffffff		@ the maximum signed positive si
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2iz
@@ -1014,34 +939,33 @@
 
 ARM_FUNC_START fixunssfsi
 ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi
-	movs	r0, r0, lsl #1
-	movcss	r0, #0			@ value is negative...
-	RETc(eq)			@ ... or 0.
-
 
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	movs	r2, r0, lsl #1
+	bcs	1f			@ value is negative
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 32) << 24)
-	bcs	1f			@ value is too large
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bmi	2f			@ value is too large
 
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	mov	r0, r0, lsr r2
+	@ scale the value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	mov	r0, r3, lsr r2
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	mov	r0, #0xffffffff		@ maximum unsigned si
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	mov	r0, #0xffffffff		@ maximum unsigned si
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2uiz
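
For reviewers who prefer reading C: below are a few illustrative sketches of the
algorithms used above.  They are not part of the patch and all names are made up.

First, the denormal scaling loops (the blocks branching back to 1b/2b before
resuming at Lml_x/Ldv_x): a zero exponent field means the implicit leading 1 is
missing, so the mantissa is shifted left until bit 23 is set while the exponent
is decremented, after which the regular multiply/divide path is reused.  A
minimal sketch, assuming the all-zero case has already been filtered out (as
Lml_z does):

#include <stdint.h>

/* mant holds the 23 stored mantissa bits of a denormal (exponent field 0),
   with the sign bit already masked off; mant must be non-zero.  */
static void normalize_denormal(uint32_t *mant, int *exp)
{
	while ((*mant & 0x00800000u) == 0) {
		*mant <<= 1;	/* shift mantissa up one bit */
		*exp -= 1;	/* ...and compensate in the exponent */
	}
}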

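The rewritten cmpsf2 path works entirely on the bit patterns: NANs are
unordered, +0.0 equals -0.0, and every other pair can be ordered by treating
the words as sign-magnitude integers.  A hypothetical C equivalent, where the
unordered_code argument plays the role of the value parked at [sp, #-4]:

#include <stdint.h>
#include <string.h>

static int cmpsf_sketch(float a, float b, int unordered_code)
{
	uint32_t x, y;
	memcpy(&x, &a, sizeof x);	/* get the raw bit patterns */
	memcpy(&y, &b, sizeof y);

	/* A NAN has all exponent bits set and a non-zero mantissa.  */
	if ((x << 1) > 0xff000000u || (y << 1) > 0xff000000u)
		return unordered_code;

	/* +0.0 and -0.0 are equal even though their bits differ.  */
	if (((x | y) << 1) == 0)
		return 0;

	/* Fold sign-magnitude into a monotonic unsigned key:
	   negatives are complemented, positives get the top bit set.  */
	uint32_t xk = (x & 0x80000000u) ? ~x : (x | 0x80000000u);
	uint32_t yk = (y & 0x80000000u) ? ~y : (y | 0x80000000u);

	return (xk > yk) - (xk < yk);	/* <0, 0 or >0 */
}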

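Finally, the new fixsfsi: the biased exponent selects a right shift for the
mantissa (with its implicit leading 1 restored), values below 1.0 truncate to
0, out-of-range values saturate, and a NAN is converted to 0 just like the
assembly does.  Again only a sketch under those assumptions:

#include <stdint.h>
#include <string.h>

static int32_t fixsfsi_sketch(float f)
{
	uint32_t x;
	memcpy(&x, &f, sizeof x);

	uint32_t exp = (x << 1) >> 24;		/* biased exponent */
	if (exp < 127)				/* |f| < 1.0 truncates to 0 */
		return 0;

	int shift = (127 + 31) - (int)exp;	/* same constant as the asm */
	if (shift <= 0) {			/* too large, INF or NAN */
		if (exp == 255 && (x << 9) != 0)
			return 0;		/* NAN -> 0, as above */
		return (x & 0x80000000u) ? INT32_MIN : INT32_MAX;
	}

	uint32_t mant = (x << 8) | 0x80000000u;	/* mantissa + implicit 1 */
	int32_t val = (int32_t)(mant >> shift);
	return (x & 0x80000000u) ? -val : val;
}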