This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]

ia64 fp division tweak

To: gcc-patches at gcc dot gnu dot org
Subject: ia64 fp division tweak
From: Richard Henderson <rth at cygnus dot com>
Date: Tue, 5 Sep 2000 20:31:56 -0700

(1) The routines did not have a stop bit every two insns.  Since 
    Itanium has two F pipes, a stop every two FP insns is ideal;
    having one every three FP insns means that every other cycle
    we issue fewer insns than we can.

(2) There was a final copy to the destination register.  Previously we had

	     frcpa f10, p6 = ...
	(p6) fma f10 = ...
	     mov f8 = f10

    This final mov will stall (in the normal case) for the result
    of the fma.  We have enough nops due to bundling, however, to
    easily calculate p7 = !p6 for free.  This lets us use

	(p6) fma f8 = ...
	(p7) mov f8 = f10

    which folds the latency of the final fma into the caller.


r~


        * config/ia64/lib1funcs.asm (__divtf3): Rebundle for Itanium.
        Eliminate final copy from non-trapping case.
        (__divdf3, __divsf3): Likewise.

Index: lib1funcs.asm
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/lib1funcs.asm,v
retrieving revision 1.8
diff -c -p -d -r1.8 lib1funcs.asm
*** lib1funcs.asm	2000/09/06 01:46:04	1.8
--- lib1funcs.asm	2000/09/06 03:20:38
***************
*** 11,40 ****
  	.global __divtf3
  	.proc __divtf3
  __divtf3:
! 	frcpa f10, p6 = farg0, farg1
  	;;
  (p6)	fnma.s1 f11 = farg1, f10, f1
! 	;;
! (p6)	fma.s1 f12 = f11, f10, f10
! (p6)	fma.s1 f11 = f11, f11, f0
! 	;;
! (p6)	fma.s1 f11 = f11, f12, f12
  	;;
! (p6)	fnma.s1 f12 = farg1, f11, f1
! (p6)	fma.s1 f10 = farg0, f10, f0
  	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fnma.s1 f12 = farg1, f10, farg0
  	;;
! (p6)	fma.s1 f10 = f12, f11, f10
! (p6)	fnma.s1 f12 = farg1, f11, f1
  	;;
! (p6)	fnma.s1 f8 = farg1, f10, farg0
! (p6)	fma.s1 f9 = f12, f11, f11
  	;;
! (p6)	fma f10 = f8, f9, f10
  	;;
! 	mov fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divtf3
--- 11,41 ----
  	.global __divtf3
  	.proc __divtf3
  __divtf3:
! 	cmp.eq p7, p0 = r0, r0
! 	frcpa.s0 f10, p6 = farg0, farg1
  	;;
+ (p6)	cmp.ne p7, p0 = r0, r0
+ 	.pred.rel.mutex p6, p7
  (p6)	fnma.s1 f11 = farg1, f10, f1
! (p6)	fmpy.s1 f12 = farg0, f10
  	;;
! (p6)	fmpy.s1 f13 = f11, f11
! (p6)	fma.s1 f14 = f11, f11, f11
  	;;
! (p6)	fma.s1 f11 = f13, f13, f11
! (p6)	fma.s1 f13 = f14, f10, f10
  	;;
! (p6)	fma.s1 f10 = f13, f11, f10
! (p6)	fnma.s1 f12 = farg1, f12, farg0
  	;;
! (p6)	fma.s1 f11 = f11, f10, f12
! (p6)	fnma.s1 f13 = farg1, f10, f1
  	;;
! (p6)	fma.s1 f10 = f12, f10, f10
! (p6)	fnma.s1 f12 = farg1, f11, farg0
  	;;
! (p6)	fma fret0 = f12, f10, f11
! (p7)	mov fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divtf3
*************** __divtf3:
*** 53,79 ****
  	.global __divdf3
  	.proc __divdf3
  __divdf3:
! 	frcpa f10, p6 = farg0, farg1
  	;;
! (p6)	fma.s1 f11 = farg0, f10, f0
  (p6)	fnma.s1 f12 = farg1, f10, f1
  	;;
  (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f13 = f12, f12, f0
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
  (p6)	fma.s1 f11 = f13, f11, f11
! (p6)	fma.s1 f12 = f13, f13, f0
  (p6)	fma.s1 f10 = f13, f10, f10
  	;;
  (p6)	fma.d.s1 f11 = f12, f11, f11
  (p6)	fma.s1 f10 = f12, f10, f10
  	;;
  (p6)	fnma.d.s1 f8 = farg1, f11, farg0
- 	;;
- (p6)	fma.d f10 = f8, f10, f11
  	;;
! 	mov fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divdf3
--- 54,83 ----
  	.global __divdf3
  	.proc __divdf3
  __divdf3:
! 	cmp.eq p7, p0 = r0, r0
! 	frcpa.s0 f10, p6 = farg0, farg1
  	;;
! (p6)	cmp.ne p7, p0 = r0, r0
! 	.pred.rel.mutex p6, p7
! (p6)	fmpy.s1 f11 = farg0, f10
  (p6)	fnma.s1 f12 = farg1, f10, f1
  	;;
  (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fmpy.s1 f13 = f12, f12
  	;;
+ (p6)	fma.s1 f10 = f12, f10, f10
  (p6)	fma.s1 f11 = f13, f11, f11
! 	;;
! (p6)	fmpy.s1 f12 = f13, f13
  (p6)	fma.s1 f10 = f13, f10, f10
  	;;
  (p6)	fma.d.s1 f11 = f12, f11, f11
  (p6)	fma.s1 f10 = f12, f10, f10
  	;;
  (p6)	fnma.d.s1 f8 = farg1, f11, farg0
  	;;
! (p6)	fma.d fret0 = f8, f10, f11
! (p7)	mov fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divdf3
*************** __divdf3:
*** 92,113 ****
  	.global __divsf3
  	.proc __divsf3
  __divsf3:
! 	frcpa f10, p6 = farg0, farg1
  	;;
! (p6)	fma.s1 f8 = farg0, f10, f0
  (p6)	fnma.s1 f9 = farg1, f10, f1
  	;;
  (p6)	fma.s1 f8 = f9, f8, f8
! (p6)	fma.s1 f9 = f9, f9, f0
  	;;
  (p6)	fma.s1 f8 = f9, f8, f8
! (p6)	fma.s1 f9 = f9, f9, f0
! 	;;
! (p6)	fma.d.s1 f8 = f9, f8, f8
  	;;
! (p6)	fma.s f10 = f8, f1, f0
  	;;
! 	mov fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divsf3
--- 96,115 ----
  	.global __divsf3
  	.proc __divsf3
  __divsf3:
! 	frcpa.s0 f10, p6 = farg0, farg1
  	;;
! (p6)	fmpy.s1 f8 = farg0, f10
  (p6)	fnma.s1 f9 = farg1, f10, f1
  	;;
  (p6)	fma.s1 f8 = f9, f8, f8
! (p6)	fmpy.s1 f9 = f9, f9
  	;;
  (p6)	fma.s1 f8 = f9, f8, f8
! (p6)	fmpy.s1 f9 = f9, f9
  	;;
! (p6)	fma.d.s1 f10 = f9, f8, f8
  	;;
! 	fnorm.s.s0 fret0 = f10
  	br.ret.sptk rp
  	;;
  	.endp __divsf3

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]