This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

ia64 division update


The Intel IA-64 Optimization Guide provides some marginally
quicker implementations of integer division than the ones we
were using.  Also re-enable the 32-bit division routines, since
32-bit division can be done more quickly than 64-bit division.


r~


        * config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove.
        * config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64
        Optimization Guide, minimum latency alternative.
        (__moddi3, __udivdi3, __umoddi3): Likewise.
        (__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise.

Index: ia64.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/ia64.h,v
retrieving revision 1.41
diff -c -p -d -r1.41 ia64.h
*** ia64.h	2000/09/01 22:22:54	1.41
--- ia64.h	2000/09/05 22:52:46
*************** do {									\
*** 1694,1711 ****
     for lib1funcs.asm modules, e.g. __divdi3 vs _divdi3.  Since lib1funcs.asm
     goes into libgcc.a first, the linker will find it first.  */
  
- /* Define this macro as a C statement that declares additional library routines
-    renames existing ones.  */
- 
- /* ??? Disable the SImode divide routines for now.  */
- #define INIT_TARGET_OPTABS \
- do {									\
-   sdiv_optab->handlers[(int) SImode].libfunc = 0;			\
-   udiv_optab->handlers[(int) SImode].libfunc = 0;			\
-   smod_optab->handlers[(int) SImode].libfunc = 0;			\
-   umod_optab->handlers[(int) SImode].libfunc = 0;			\
- } while (0)
- 
  /* Define this macro if GNU CC should generate calls to the System V (and ANSI
     C) library functions `memcpy' and `memset' rather than the BSD functions
     `bcopy' and `bzero'.  */
--- 1694,1699 ----
Index: lib1funcs.asm
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/lib1funcs.asm,v
retrieving revision 1.6
diff -c -p -d -r1.6 lib1funcs.asm
*** lib1funcs.asm	2000/08/14 21:01:24	1.6
--- lib1funcs.asm	2000/09/05 22:52:46
*************** __divsf3:
*** 116,131 ****
  #ifdef L__divdi3
  // Compute a 64-bit integer quotient.
  //
! // Use reciprocal approximation and Newton-Raphson iteration to compute the
! // quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
! // to get more than the 64 bits of precision that we need for DImode.
! //
! // Must use max precision for the reciprocal computations to get 64 bits of
! // precision.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 116,125 ----
  #ifdef L__divdi3
  // Compute a 64-bit integer quotient.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __divdi3:
*** 143,173 ****
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fma.s1 f11 = farg0, f10, f0
! (p6)	fnma.s1 f12 = farg1, f10, f1
! 	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f13 = f12, f12, f0
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fma.s1 f11 = f13, f11, f11
! (p6)	fma.s1 f12 = f13, f13, f0
! (p6)	fma.s1 f10 = f13, f10, f10
  	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fnma.s1 f8 = f9, f11, f8
  	;;
! (p6)	fma.s1 f10 = f8, f10, f11
  	;;
  	// Round quotient to an integer.
! 	fcvt.fx.trunc.s1 f8 = f10
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __divdi3
--- 137,163 ----
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fnma.s1 f11 = f9, f10, f1	// e0 = 1 - b*y0  (a = f8, b = f9, y0 = frcpa result)
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
  	;;
! (p6)	fmpy.s1 f13 = f11, f11	// e1 = e0*e0
! (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e0*q0
  	;;
! (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e0*y0
! (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e1*q1
  	;;
! (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e1*y1
! (p6)	fnma.s1 f12 = f9, f11, f8	// r2 = a - b*q2 (residual)
  	;;
! (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r2*y2 (final quotient estimate)
  	;;
  	// Round quotient to an integer.
! 	fcvt.fx.trunc.s1 f10 = f10	// truncate toward zero, in place
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f10	// integer quotient is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __divdi3
*************** __divdi3:
*** 176,191 ****
  #ifdef L__moddi3
  // Compute a 64-bit integer modulus.
  //
! // Use reciprocal approximation and Newton-Raphson iteration to compute the
! // quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
! // to get more than the 64 bits of precision that we need for DImode.
! //
! // Must use max precision for the reciprocal computations to get 64 bits of
! // precision.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 165,174 ----
  #ifdef L__moddi3
  // Compute a 64-bit integer modulus.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend (a).  in1 holds the divisor (b).
  
  	.text
  	.align 16
*************** __divdi3:
*** 194,242 ****
  __moddi3:
  	.regstk 2,0,0,0
  	// Transfer inputs to FP registers.
! 	setf.sig f8 = in0
  	setf.sig f9 = in1
  	;;
  	// Convert the inputs to FP, so that they won't be treated as unsigned.
! 	fcvt.xf f8 = f8
  	fcvt.xf f9 = f9
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fma.s1 f11 = farg0, f10, f0
! (p6)	fnma.s1 f12 = farg1, f10, f1
! 	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f13 = f12, f12, f0
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fma.s1 f11 = f13, f11, f11
! (p6)	fma.s1 f12 = f13, f13, f0
! (p6)	fma.s1 f10 = f13, f10, f10
  	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
  (p6)	fnma.s1 f12 = f9, f11, f8
  	;;
  (p6)	fma.s1 f10 = f12, f10, f11
  	;;
- 	// Round quotient to an integer.
  	fcvt.fx.trunc.s1 f10 = f10
- 	;;
- 	// Renormalize.
- 	fcvt.xf f10 = f10
  	;;
! 	// Compute remainder.
! 	fnma.s1 f8 = f10, f9, f8
! 	;;
! 	// Round remainder to an integer.
! 	fcvt.fx.trunc.s1 f8 = f8
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __moddi3
--- 177,216 ----
  __moddi3:
  	.regstk 2,0,0,0
  	// Transfer inputs to FP registers.
! 	setf.sig f14 = in0	// f14 keeps the raw integer dividend a for the final xma.l
  	setf.sig f9 = in1
  	;;
  	// Convert the inputs to FP, so that they won't be treated as unsigned.
! 	fcvt.xf f8 = f14	// f8 = a converted to floating point; f14 stays an integer
  	fcvt.xf f9 = f9
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0  (y0 = frcpa result in f10)
! (p6)	fnma.s1 f11 = f9, f10, f1	// e0 = 1 - b*y0
  	;;
! (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e0*q0
! (p6)	fmpy.s1 f13 = f11, f11	// e1 = e0*e0
  	;;
! (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e0*y0
! (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e1*q1
  	;;
+ 	sub in1 = r0, in1	// in1 = -b, negated on the integer side while the FP chain runs
+ (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e1*y1
  (p6)	fnma.s1 f12 = f9, f11, f8
  	;;
+ 	setf.sig f9 = in1	// f9 = -b as an integer; the FP copy of b was last read in the previous group
  (p6)	fma.s1 f10 = f12, f10, f11
  	;;
  	fcvt.fx.trunc.s1 f10 = f10
  	;;
! 	// r = q * (-b) + a
! 	xma.l f10 = f10, f9, f14	// integer multiply-add on significands, low 64 bits kept
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f10	// remainder is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __moddi3
*************** __moddi3:
*** 244,260 ****
  
  #ifdef L__udivdi3
  // Compute a 64-bit unsigned integer quotient.
- //
- // Use reciprocal approximation and Newton-Raphson iteration to compute the
- // quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
- // to get more than the 64 bits of precision that we need for DImode.
  //
! // Must use max precision for the reciprocal computations to get 64 bits of
! // precision.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 218,228 ----
  
  #ifdef L__udivdi3
  // Compute a 64-bit unsigned integer quotient.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __udivdi3:
*** 274,302 ****
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fma.s1 f11 = farg0, f10, f0
! (p6)	fnma.s1 f12 = farg1, f10, f1
! 	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f13 = f12, f12, f0
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fma.s1 f11 = f13, f11, f11
! (p6)	fma.s1 f12 = f13, f13, f0
! (p6)	fma.s1 f10 = f13, f10, f10
  	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fnma.s1 f8 = f9, f11, f8
  	;;
! (p6)	fma.s1 f10 = f8, f10, f11
  	;;
  	// Round quotient to an unsigned integer.
! 	fcvt.fxu.trunc.s1 f8 = f10
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __udivdi3
--- 242,266 ----
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fnma.s1 f11 = f9, f10, f1	// e0 = 1 - b*y0  (a = f8, b = f9, y0 = frcpa result)
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
  	;;
! (p6)	fmpy.s1 f13 = f11, f11	// e1 = e0*e0
! (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e0*q0
  	;;
! (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e0*y0
! (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e1*q1
  	;;
! (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e1*y1
! (p6)	fnma.s1 f12 = f9, f11, f8	// r2 = a - b*q2 (residual)
  	;;
! (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r2*y2; must use the residual f12, as __divdi3 does
  	;;
  	// Round quotient to an unsigned integer.
! 	fcvt.fxu.trunc.s1 f10 = f10	// truncate toward zero, in place
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f10	// integer quotient is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __udivdi3
*************** __udivdi3:
*** 305,320 ****
  #ifdef L__umoddi3
  // Compute a 64-bit unsigned integer modulus.
  //
! // Use reciprocal approximation and Newton-Raphson iteration to compute the
! // quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
! // to get more than the 64 bits of precision that we need for DImode.
! //
! // Must use max precision for the reciprocal computations to get 64 bits of
! // precision.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 269,278 ----
  #ifdef L__umoddi3
  // Compute a 64-bit unsigned integer modulus.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend (a).  in1 holds the divisor (b).
  
  	.text
  	.align 16
*************** __udivdi3:
*** 323,371 ****
  __umoddi3:
  	.regstk 2,0,0,0
  	// Transfer inputs to FP registers.
! 	setf.sig f8 = in0
  	setf.sig f9 = in1
  	;;
  	// Convert the inputs to FP, to avoid FP software assist faults.
! 	fcvt.xuf.s1 f8 = f8
  	fcvt.xuf.s1 f9 = f9
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fma.s1 f11 = farg0, f10, f0
! (p6)	fnma.s1 f12 = farg1, f10, f1
! 	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f13 = f12, f12, f0
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
! (p6)	fma.s1 f11 = f13, f11, f11
! (p6)	fma.s1 f12 = f13, f13, f0
! (p6)	fma.s1 f10 = f13, f10, f10
  	;;
! (p6)	fma.s1 f11 = f12, f11, f11
! (p6)	fma.s1 f10 = f12, f10, f10
  	;;
  (p6)	fnma.s1 f12 = f9, f11, f8
  	;;
  (p6)	fma.s1 f10 = f12, f10, f11
  	;;
  	// Round quotient to an unsigned integer.
  	fcvt.fxu.trunc.s1 f10 = f10
- 	;;
- 	// Renormalize.
- 	fcvt.xuf.s1 f10 = f10
- 	;;
- 	// Compute remainder.
- 	fnma.s1 f8 = f10, f9, f8
  	;;
! 	// Round remainder to an integer.
! 	fcvt.fxu.trunc.s1 f8 = f8
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __umoddi3
--- 281,321 ----
  __umoddi3:
  	.regstk 2,0,0,0
  	// Transfer inputs to FP registers.
! 	setf.sig f14 = in0	// f14 keeps the raw integer dividend a for the final xma.l
  	setf.sig f9 = in1
  	;;
  	// Convert the inputs to FP, to avoid FP software assist faults.
! 	fcvt.xuf.s1 f8 = f14	// f8 = a converted (unsigned) to floating point; f14 stays an integer
  	fcvt.xuf.s1 f9 = f9
  	;;
  	// Compute the reciprocal approximation.
  	frcpa.s1 f10, p6 = f8, f9
  	;;
  	// 3 Newton-Raphson iterations.
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0  (y0 = frcpa result in f10)
! (p6)	fnma.s1 f11 = f9, f10, f1	// e0 = 1 - b*y0
  	;;
! (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e0*q0
! (p6)	fmpy.s1 f13 = f11, f11	// e1 = e0*e0
  	;;
! (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e0*y0
! (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e1*q1
  	;;
+ 	sub in1 = r0, in1	// in1 = -b, negated on the integer side while the FP chain runs
+ (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e1*y1
  (p6)	fnma.s1 f12 = f9, f11, f8
  	;;
+ 	setf.sig f9 = in1	// f9 = -b as an integer; the FP copy of b was last read in the previous group
  (p6)	fma.s1 f10 = f12, f10, f11
  	;;
  	// Round quotient to an unsigned integer.
  	fcvt.fxu.trunc.s1 f10 = f10
  	;;
! 	// r = q * (-b) + a
! 	xma.l f10 = f10, f9, f14	// integer multiply-add on significands, low 64 bits kept
  	;;
  	// Transfer result to GP registers.
! 	getf.sig ret0 = f10	// remainder is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __umoddi3
*************** __umoddi3:
*** 373,394 ****
  
  #ifdef L__divsi3
  // Compute a 32-bit integer quotient.
- //
- // Use reciprocal approximation and Newton-Raphson iteration to compute the
- // quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
- // to get more than the 32 bits of precision that we need for SImode.
- //
- // ??? This is currently not used.  It needs to be fixed to be more like the
- // above DImode routines.
- //
- // ??? Check to see if the error is less than >.5ulp error.  We may need
- // some adjustment code to get precise enough results.
  //
! // ??? Should probably use max precision for the reciprocal computations.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 323,333 ----
  
  #ifdef L__divsi3
  // Compute a 32-bit integer quotient.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __umoddi3:
*** 396,423 ****
  	.proc __divsi3
  __divsi3:
  	.regstk 2,0,0,0
  	setf.sig f8 = in0
  	setf.sig f9 = in1
  	;;
  	fcvt.xf f8 = f8
  	fcvt.xf f9 = f9
  	;;
! 	frcpa f11, p6 = f8, f9
! 	fadd f10 = f1, f1
! 	;;
! 	fnma f12 = f9, f11, f10
! 	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fnma f12 = f9, f11, f10
  	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fmpy f8 = f8, f11
  	;;
! 	fcvt.fx.trunc f8 = f8
  	;;
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __divsi3
--- 335,364 ----
  	.proc __divsi3
  __divsi3:
  	.regstk 2,0,0,0
+ 	sxt4 in0 = in0	// sign-extend the 32-bit dividend to 64 bits
+ 	sxt4 in1 = in1	// sign-extend the 32-bit divisor to 64 bits
+ 	;;
  	setf.sig f8 = in0
  	setf.sig f9 = in1
  	;;
+ 	mov r2 = 0x0ffdd	// exponent field for the correction constant; presumably 2^-34 (0xffff bias - 34)
  	fcvt.xf f8 = f8
  	fcvt.xf f9 = f9
  	;;
! 	setf.exp f11 = r2	// f11 = small constant added into the squared error term below
! 	frcpa f10, p6 = f8, f9	// y0 ~= 1/b; p6 gates the refinement steps
  	;;
! (p6)	fmpy.s1 f8 = f8, f10	// q0 = a*y0
! (p6)	fnma.s1 f9 = f9, f10, f1	// e0 = 1 - b*y0
  	;;
! (p6)	fma.s1 f8 = f9, f8, f8	// q1 = q0 + e0*q0
! (p6)	fma.s1 f9 = f9, f9, f11	// e1 = e0*e0 + f11
  	;;
! (p6)	fma.s1 f10 = f9, f8, f8	// q2 = q1 + e1*q1
  	;;
! 	fcvt.fx.trunc.s1 f10 = f10	// truncate the quotient toward zero
  	;;
! 	getf.sig ret0 = f10	// integer quotient is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __divsi3
*************** __divsi3:
*** 425,446 ****
  
  #ifdef L__modsi3
  // Compute a 32-bit integer modulus.
- //
- // Use reciprocal approximation and Newton-Raphson iteration to compute the
- // quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
- // to get more than the 32 bits of precision that we need for SImode.
  //
! // ??? This is currently not used.  It needs to be fixed to be more like the
! // above DImode routines.
! //
! // ??? Check to see if the error is less than >.5ulp error.  We may need
! // some adjustment code to get precise enough results.
! //
! // ??? Should probably use max precision for the reciprocal computations.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  
  	.text
  	.align 16
--- 366,376 ----
  
  #ifdef L__modsi3
  // Compute a 32-bit integer modulus.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __divsi3:
*** 448,481 ****
  	.proc __modsi3
  __modsi3:
  	.regstk 2,0,0,0
! 	setf.sig f8 = r32
  	setf.sig f9 = r33
  	;;
! 	fcvt.xf f8 = f8
  	fcvt.xf f9 = f9
  	;;
! 	frcpa f11, p6 = f8, f9
! 	fadd f10 = f1, f1
! 	;;
! 	fnma f12 = f9, f11, f10
! 	;;
! 	fmpy f11 = f11, f12
! 	;;
! 	fnma f12 = f9, f11, f10
! 	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fmpy f10 = f8, f11
  	;;
! 	fcvt.fx.trunc f10 = f10
  	;;
! 	fcvt.xf f10 = f10
  	;;
! 	fnma f8 = f10, f9, f8
  	;;
! 	fcvt.fx f8 = f8
  	;;
! 	getf.sig r32 = f8
  	br.ret.sptk rp
  	;;
  	.endp __modsi3
--- 378,411 ----
  	.proc __modsi3
  __modsi3:
  	.regstk 2,0,0,0
! 	mov r2 = 0x0ffdd	// exponent field for the correction constant; presumably 2^-34 (0xffff bias - 34)
! 	sxt4 in0 = in0	// sign-extend the 32-bit dividend to 64 bits
! 	sxt4 in1 = in1	// sign-extend the 32-bit divisor to 64 bits
! 	;;
! 	setf.sig f13 = r32	// r32 is in0; f13 keeps the integer dividend a for the final xma.l
  	setf.sig f9 = r33
  	;;
! 	sub in1 = r0, in1	// in1 = -b; the setf.sig above already captured b
! 	fcvt.xf f8 = f13	// f8 = a converted to floating point
  	fcvt.xf f9 = f9
  	;;
! 	setf.exp f11 = r2	// f11 = small constant added into the squared error term below
! 	frcpa f10, p6 = f8, f9	// y0 ~= 1/b; p6 gates the refinement steps
  	;;
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
! (p6)	fnma.s1 f10 = f9, f10, f1	// e0 = 1 - b*y0 (overwrites y0)
  	;;
! 	setf.sig f9 = in1	// f9 = -b as an integer; the FP copy of b was last read in the previous group
! (p6)	fma.s1 f12 = f10, f12, f12	// q1 = q0 + e0*q0
! (p6)	fma.s1 f10 = f10, f10, f11		// e1 = e0*e0 + f11
  	;;
! (p6)	fma.s1 f10 = f10, f12, f12	// q2 = q1 + e1*q1
  	;;
! 	fcvt.fx.trunc.s1 f10 = f10	// truncate the quotient toward zero
  	;;
! 	xma.l f10 = f10, f9, f13	// r = q*(-b) + a, integer multiply-add, low 64 bits kept
  	;;
! 	getf.sig ret0 = f10	// remainder is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __modsi3
*************** __modsi3:
*** 483,507 ****
  
  #ifdef L__udivsi3
  // Compute a 32-bit unsigned integer quotient.
- //
- // Use reciprocal approximation and Newton-Raphson iteration to compute the
- // quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
- // to get more than the 32 bits of precision that we need for SImode.
- //
- // ??? This is currently not used.  It needs to be fixed to be more like the
- // above DImode routines.
- //
- // ??? Check to see if the error is less than >.5ulp error.  We may need
- // some adjustment code to get precise enough results.
- //
- // ??? Should probably use max precision for the reciprocal computations.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  //
! // This is the same as divsi3, except that we don't need fcvt instructions
! // before the frcpa.
  
  	.text
  	.align 16
--- 413,423 ----
  
  #ifdef L__udivsi3
  // Compute a 32-bit unsigned integer quotient.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __modsi3:
*** 509,533 ****
  	.proc __udivsi3
  __udivsi3:
  	.regstk 2,0,0,0
! 	setf.sig f8 = r32
! 	setf.sig f9 = r33
! 	;;
! 	frcpa f11, p6 = f8, f9
! 	fadd f10 = f1, f1
  	;;
! 	fnma f12 = f9, f11, f10
  	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fnma f12 = f9, f11, f10
  	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fmpy f8 = f8, f11
  	;;
! 	fcvt.fxu.trunc f8 = f8
  	;;
! 	getf.sig ret0 = f8
  	br.ret.sptk rp
  	;;
  	.endp __udivsi3
--- 425,451 ----
  	.proc __udivsi3
  __udivsi3:
  	.regstk 2,0,0,0
! 	mov r2 = 0x0ffdd	// exponent field for the correction constant; presumably 2^-34 (0xffff bias - 34)
! 	zxt4 in0 = in0	// zero-extend the 32-bit dividend to 64 bits
! 	zxt4 in1 = in1	// zero-extend the 32-bit divisor to 64 bits
  	;;
! 	setf.sig f8 = in0	// f8 = a
! 	setf.sig f9 = in1	// f9 = b
  	;;
! 	setf.exp f11 = r2	// f11 = small constant added into the squared error term below
! 	frcpa f10, p6 = f8, f9	// y0 ~= 1/b; p6 gates the refinement steps
  	;;
! (p6)	fmpy.s1 f8 = f8, f10	// q0 = a*y0
! (p6)	fnma.s1 f9 = f9, f10, f1	// e0 = 1 - b*y0
  	;;
! (p6)	fma.s1 f8 = f9, f8, f8	// q1 = q0 + e0*q0
! (p6)	fma.s1 f9 = f9, f9, f11	// e1 = e0*e0 + f11; destination is separated by '=', same as __divsi3
  	;;
! (p6)	fma.s1 f10 = f9, f8, f8	// q2 = q1 + e1*q1
  	;;
! 	fcvt.fxu.trunc.s1 f10 = f10	// truncate the quotient toward zero (unsigned)
  	;;
! 	getf.sig ret0 = f10	// integer quotient is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __udivsi3
*************** __udivsi3:
*** 535,559 ****
  
  #ifdef L__umodsi3
  // Compute a 32-bit unsigned integer modulus.
- //
- // Use reciprocal approximation and Newton-Raphson iteration to compute the
- // quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
- // to get more than the 32 bits of precision that we need for SImode.
- //
- // ??? This is currently not used.  It needs to be fixed to be more like the
- // above DImode routines.
- //
- // ??? Check to see if the error is less than >.5ulp error.  We may need
- // some adjustment code to get precise enough results.
- //
- // ??? Should probably use max precision for the reciprocal computations.
  //
! // r32/f8 holds the dividend.  r33/f9 holds the divisor.
! // f10 holds the value 2.0.  f11 holds the reciprocal approximation.
! // f12 is a temporary.
  //
! // This is the same as modsi3, except that we don't need fcvt instructions
! // before the frcpa.
  
  	.text
  	.align 16
--- 453,463 ----
  
  #ifdef L__umodsi3
  // Compute a 32-bit unsigned integer modulus.
  //
! // From the Intel IA-64 Optimization Guide, choose the minimum latency
! // alternative.
  //
! // in0 holds the dividend.  in1 holds the divisor.
  
  	.text
  	.align 16
*************** __udivsi3:
*** 561,591 ****
  	.proc __umodsi3
  __umodsi3:
  	.regstk 2,0,0,0
! 	setf.sig f8 = r32
! 	setf.sig f9 = r33
! 	;;
! 	frcpa f11, p6 = f8, f9
! 	fadd f10 = f1, f1
! 	;;
! 	fnma f12 = f9, f11, f10
  	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fnma f12 = f9, f11, f10
  	;;
! 	fmpy f11 = f11, f12
  	;;
! 	fmpy f10 = f8, f11
  	;;
! 	fcvt.fxu.trunc f10 = f10
  	;;
! 	fcvt.xuf f10 = f10
  	;;
! 	fnma f8 = f10, f9, f8
  	;;
! 	fcvt.fxu f8 = f8
  	;;
! 	getf.sig r32 = f8
  	br.ret.sptk rp
  	;;
  	.endp __umodsi3
--- 465,498 ----
  	.proc __umodsi3
  __umodsi3:
  	.regstk 2,0,0,0
! 	mov r2 = 0x0ffdd	// exponent field for the correction constant; presumably 2^-34 (0xffff bias - 34)
! 	zxt4 in0 = in0	// zero-extend the 32-bit dividend to 64 bits
! 	zxt4 in1 = in1	// zero-extend the 32-bit divisor to 64 bits
  	;;
! 	setf.sig f13 = in0	// f13 keeps the integer dividend a for the final xma.l
! 	setf.sig f9 = in1	// f9 = b (integer form, converted below)
  	;;
! 	sub in1 = r0, in1	// in1 = -b; the setf.sig above already captured b
! 	fcvt.xf f8 = f13	// f8 = a converted to floating point
! 	fcvt.xf f9 = f9	// f9 = b converted to floating point
  	;;
! 	setf.exp f11 = r2	// f11 = small constant added into the squared error term below
! 	frcpa f10, p6 = f8, f9	// y0 ~= 1/b; p6 gates the refinement steps
  	;;
! (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
! (p6)	fnma.s1 f10 = f9, f10, f1	// e0 = 1 - b*y0 (overwrites y0)
  	;;
! 	setf.sig f9 = in1	// f9 = -b as an integer, needed by xma.l; the FP copy of b was last read in the previous group
! (p6)	fma.s1 f12 = f10, f12, f12	// q1 = q0 + e0*q0
! (p6)	fma.s1 f10 = f10, f10, f11	// e1 = e0*e0 + f11
  	;;
! (p6)	fma.s1 f10 = f10, f12, f12	// q2 = q1 + e1*q1
  	;;
! 	fcvt.fxu.trunc.s1 f10 = f10	// truncate the quotient toward zero (unsigned)
  	;;
! 	xma.l f10 = f10, f9, f13	// r = q*(-b) + a, integer multiply-add, low 64 bits kept
  	;;
! 	getf.sig ret0 = f10	// remainder is the significand of f10
  	br.ret.sptk rp
  	;;
  	.endp __umodsi3

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]