This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
ia64 fp division tweak
- To: gcc-patches at gcc dot gnu dot org
- Subject: ia64 fp division tweak
- From: Richard Henderson <rth at cygnus dot com>
- Date: Tue, 5 Sep 2000 20:31:56 -0700
(1) The routines did not have a stop bit every two insns. Since
Itanium has two F pipes, a stop every two FP insns is ideal;
having one every three FP insns means that every other cycle
we issue fewer insns than we can.
(2) There was a final copy to the destination register. We had
frcpa f10, p6 = ...
(p6) fma f10 = ...
mov f8 = f10
This final mov will stall (in the normal case) for the result
of the fma. We have enough nops due to bundling, however, to
easily calculate p7 = !p6 for free. This lets us use
(p6) fma f8 = ...
(p7) mov f8 = f10
which folds the latency of the final fma into the caller.
r~
* config/ia64/lib1funcs.asm (__divtf3): Rebundle for Itanium.
Eliminate final copy from non-trapping case.
(__divdf3, __divsf3): Likewise.
Index: lib1funcs.asm
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/lib1funcs.asm,v
retrieving revision 1.8
diff -c -p -d -r1.8 lib1funcs.asm
*** lib1funcs.asm 2000/09/06 01:46:04 1.8
--- lib1funcs.asm 2000/09/06 03:20:38
***************
*** 11,40 ****
.global __divtf3
.proc __divtf3
__divtf3:
! frcpa f10, p6 = farg0, farg1
;;
(p6) fnma.s1 f11 = farg1, f10, f1
! ;;
! (p6) fma.s1 f12 = f11, f10, f10
! (p6) fma.s1 f11 = f11, f11, f0
! ;;
! (p6) fma.s1 f11 = f11, f12, f12
;;
! (p6) fnma.s1 f12 = farg1, f11, f1
! (p6) fma.s1 f10 = farg0, f10, f0
;;
! (p6) fma.s1 f11 = f12, f11, f11
! (p6) fnma.s1 f12 = farg1, f10, farg0
;;
! (p6) fma.s1 f10 = f12, f11, f10
! (p6) fnma.s1 f12 = farg1, f11, f1
;;
! (p6) fnma.s1 f8 = farg1, f10, farg0
! (p6) fma.s1 f9 = f12, f11, f11
;;
! (p6) fma f10 = f8, f9, f10
;;
! mov fret0 = f10
br.ret.sptk rp
;;
.endp __divtf3
--- 11,41 ----
.global __divtf3
.proc __divtf3
__divtf3:
! cmp.eq p7, p0 = r0, r0
! frcpa.s0 f10, p6 = farg0, farg1
;;
+ (p6) cmp.ne p7, p0 = r0, r0
+ .pred.rel.mutex p6, p7
(p6) fnma.s1 f11 = farg1, f10, f1
! (p6) fmpy.s1 f12 = farg0, f10
;;
! (p6) fmpy.s1 f13 = f11, f11
! (p6) fma.s1 f14 = f11, f11, f11
;;
! (p6) fma.s1 f11 = f13, f13, f11
! (p6) fma.s1 f13 = f14, f10, f10
;;
! (p6) fma.s1 f10 = f13, f11, f10
! (p6) fnma.s1 f12 = farg1, f12, farg0
;;
! (p6) fma.s1 f11 = f11, f10, f12
! (p6) fnma.s1 f13 = farg1, f10, f1
;;
! (p6) fma.s1 f10 = f12, f10, f10
! (p6) fnma.s1 f12 = farg1, f11, farg0
;;
! (p6) fma fret0 = f12, f10, f11
! (p7) mov fret0 = f10
br.ret.sptk rp
;;
.endp __divtf3
*************** __divtf3:
*** 53,79 ****
.global __divdf3
.proc __divdf3
__divdf3:
! frcpa f10, p6 = farg0, farg1
;;
! (p6) fma.s1 f11 = farg0, f10, f0
(p6) fnma.s1 f12 = farg1, f10, f1
;;
(p6) fma.s1 f11 = f12, f11, f11
! (p6) fma.s1 f13 = f12, f12, f0
! (p6) fma.s1 f10 = f12, f10, f10
;;
(p6) fma.s1 f11 = f13, f11, f11
! (p6) fma.s1 f12 = f13, f13, f0
(p6) fma.s1 f10 = f13, f10, f10
;;
(p6) fma.d.s1 f11 = f12, f11, f11
(p6) fma.s1 f10 = f12, f10, f10
;;
(p6) fnma.d.s1 f8 = farg1, f11, farg0
- ;;
- (p6) fma.d f10 = f8, f10, f11
;;
! mov fret0 = f10
br.ret.sptk rp
;;
.endp __divdf3
--- 54,83 ----
.global __divdf3
.proc __divdf3
__divdf3:
! cmp.eq p7, p0 = r0, r0
! frcpa.s0 f10, p6 = farg0, farg1
;;
! (p6) cmp.ne p7, p0 = r0, r0
! .pred.rel.mutex p6, p7
! (p6) fmpy.s1 f11 = farg0, f10
(p6) fnma.s1 f12 = farg1, f10, f1
;;
(p6) fma.s1 f11 = f12, f11, f11
! (p6) fmpy.s1 f13 = f12, f12
;;
+ (p6) fma.s1 f10 = f12, f10, f10
(p6) fma.s1 f11 = f13, f11, f11
! ;;
! (p6) fmpy.s1 f12 = f13, f13
(p6) fma.s1 f10 = f13, f10, f10
;;
(p6) fma.d.s1 f11 = f12, f11, f11
(p6) fma.s1 f10 = f12, f10, f10
;;
(p6) fnma.d.s1 f8 = farg1, f11, farg0
;;
! (p6) fma.d fret0 = f8, f10, f11
! (p7) mov fret0 = f10
br.ret.sptk rp
;;
.endp __divdf3
*************** __divdf3:
*** 92,113 ****
.global __divsf3
.proc __divsf3
__divsf3:
! frcpa f10, p6 = farg0, farg1
;;
! (p6) fma.s1 f8 = farg0, f10, f0
(p6) fnma.s1 f9 = farg1, f10, f1
;;
(p6) fma.s1 f8 = f9, f8, f8
! (p6) fma.s1 f9 = f9, f9, f0
;;
(p6) fma.s1 f8 = f9, f8, f8
! (p6) fma.s1 f9 = f9, f9, f0
! ;;
! (p6) fma.d.s1 f8 = f9, f8, f8
;;
! (p6) fma.s f10 = f8, f1, f0
;;
! mov fret0 = f10
br.ret.sptk rp
;;
.endp __divsf3
--- 96,115 ----
.global __divsf3
.proc __divsf3
__divsf3:
! frcpa.s0 f10, p6 = farg0, farg1
;;
! (p6) fmpy.s1 f8 = farg0, f10
(p6) fnma.s1 f9 = farg1, f10, f1
;;
(p6) fma.s1 f8 = f9, f8, f8
! (p6) fmpy.s1 f9 = f9, f9
;;
(p6) fma.s1 f8 = f9, f8, f8
! (p6) fmpy.s1 f9 = f9, f9
;;
! (p6) fma.d.s1 f10 = f9, f8, f8
;;
! fnorm.s.s0 fret0 = f10
br.ret.sptk rp
;;
.endp __divsf3