This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Patch to change IA64 division code


The current implementation of floating point division on IA64 is
sub-optimal in that it is done with a post-reload split that doesn't
allow for very good instruction scheduling.  This patch allows for the
expansion of the division sequence earlier and thus allows for better
instruction scheduling.  I did some testing with SPEC2000 fp and got
some improvements with this change.  Ignoring art, which had a lot of
variation (both good and bad) when I ran it, I got a less than 1%
slowdown in 3 tests and a less than 1% speed up in 7 tests.  173.applu
sped up by a little over 1%, 200.sixtrack sped up by a little less than
4% and 301.apsi sped up by a little more than 4%.  I got similar results
on HP-UX in LP64 mode and I got slightly better results on HP-UX in
ILP32 mode.

I did one SPEC2006 fp run on HP-UX in LP64 mode and had 3 tests with
less than 1% slowdown and 6 with less than 1% speed up. bwaves sped up
6%, lbm by 4%, and zeusmp, gromacs, wrf, and sphinx3 sped up by 1 to
2%.

This patch only converts the maximum throughput versions of single and
double precision floating point division.  If it is approved, I will
commit to converting the minimum latency versions to this same setup as
well, but I would like to get this patch approved as an intermediate step
and to make sure the overall approach is acceptable.

Tested on IA64 HP-UX and Linux with no regressions.

OK to checkin?

Steve Ellcey
sje@cup.hp.com


2007-02-21  Steve Ellcey  <sje@cup.hp.com>

	* config/ia64/ia64.h (HARD_REGNO_NREGS): Handle RFmode.
	(HARD_REGNO_MODE_OK): Ditto.
	(MODES_TIEABLE_P): Ditto.
	(HARD_REGNO_CALLER_SAVE_MODE): Ditto.
	(CLASS_MAX_NREGS): Ditto.
	* config/ia64/ia64.c (ia64_print_operand): Add R format.
	(rtx_needs_barrier): Add UNSPEC_NOP_CONVERT case.
	* config/ia64/ia64.md (UNSPEC_NOP_CONVERT): New.
	(divsf3_internal_thr): Removed.
	(divdf3_internal_thr): Removed.
	* config/ia64/div.md: New file.

Index: config/ia64/ia64.h
===================================================================
--- config/ia64/ia64.h	(revision 122189)
+++ config/ia64/ia64.h	(working copy)
@@ -1,5 +1,5 @@
 /* Definitions of target machine GNU compiler.  IA-64 version.
-   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
    Free Software Foundation, Inc.
    Contributed by James E. Wilson <wilson@cygnus.com> and
    		  David Mosberger <davidm@hpl.hp.com>.
@@ -642,6 +642,7 @@ while (0)
    : PR_REGNO_P (REGNO) && (MODE) == BImode ? 2				\
    : PR_REGNO_P (REGNO) && (MODE) == CCImode ? 1			\
    : FR_REGNO_P (REGNO) && (MODE) == XFmode ? 1				\
+   : FR_REGNO_P (REGNO) && (MODE) == RFmode ? 1				\
    : FR_REGNO_P (REGNO) && (MODE) == XCmode ? 2				\
    : (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
 
@@ -657,7 +658,7 @@ while (0)
    : PR_REGNO_P (REGNO) ?					\
      (MODE) == BImode || GET_MODE_CLASS (MODE) == MODE_CC	\
    : GR_REGNO_P (REGNO) ?					\
-     (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode	\
+     (MODE) != CCImode && (MODE) != XFmode && (MODE) != XCmode	&& (MODE) != RFmode \
    : AR_REGNO_P (REGNO) ? (MODE) == DImode			\
    : BR_REGNO_P (REGNO) ? (MODE) == DImode			\
    : 0)
@@ -674,15 +675,15 @@ while (0)
    we can't tie it with any other modes.  */
 #define MODES_TIEABLE_P(MODE1, MODE2)			\
   (GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2)	\
-   && ((((MODE1) == XFmode) || ((MODE1) == XCmode))	\
-       == (((MODE2) == XFmode) || ((MODE2) == XCmode)))	\
+   && ((((MODE1) == XFmode) || ((MODE1) == XCmode) || ((MODE1) == RFmode))     \
+       == (((MODE2) == XFmode) || ((MODE2) == XCmode) || ((MODE2) == RFmode))) \
    && (((MODE1) == BImode) == ((MODE2) == BImode)))
 
 /* Specify the modes required to caller save a given hard regno.
    We need to ensure floating pt regs are not saved as DImode.  */
 
 #define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
-  ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? XFmode        \
+  ((FR_REGNO_P (REGNO) && (NREGS) == 1) ? RFmode        \
    : choose_hard_reg_mode ((REGNO), (NREGS), false))
 
 /* Handling Leaf Functions */
@@ -896,6 +897,7 @@ enum reg_class
 #define CLASS_MAX_NREGS(CLASS, MODE) \
   ((MODE) == BImode && (CLASS) == PR_REGS ? 2			\
    : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XFmode) ? 1 \
+   : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == RFmode) ? 1 \
    : (((CLASS) == FR_REGS || (CLASS) == FP_REGS) && (MODE) == XCmode) ? 2 \
    : (GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
 
Index: config/ia64/ia64.c
===================================================================
--- config/ia64/ia64.c	(revision 122189)
+++ config/ia64/ia64.c	(working copy)
@@ -1,5 +1,5 @@
 /* Definitions of target machine for GNU compiler.
-   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
    Free Software Foundation, Inc.
    Contributed by James E. Wilson <wilson@cygnus.com> and
 		  David Mosberger <davidm@hpl.hp.com>.
@@ -4508,6 +4508,7 @@ ia64_print_operand_address (FILE * strea
    O	Append .acq for volatile load.
    P	Postincrement of a MEM.
    Q	Append .rel for volatile store.
+   R    Print .s .d or nothing for a single, double or no truncation.
    S	Shift amount for shladd instruction.
    T	Print an 8-bit sign extended number (K) as a 32-bit unsigned number
 	for Intel assembler.
@@ -4648,6 +4649,17 @@ ia64_print_operand (FILE * file, rtx x, 
 	fputs(".rel", file);
       return;
 
+    case 'R':
+      if (x == CONST0_RTX (GET_MODE (x)))
+	fputs(".s", file);
+      else if (x == CONST1_RTX (GET_MODE (x)))
+	fputs(".d", file);
+      else if (x == CONST2_RTX (GET_MODE (x)))
+        ;
+      else
+        output_operand_lossage ("invalid %%R value");
+      return;
+
     case 'S':
       fprintf (file, "%d", exact_log2 (INTVAL (x)));
       return;
@@ -5793,6 +5805,7 @@ rtx_needs_barrier (rtx x, struct reg_fla
 	case UNSPEC_LDSA:
 	case UNSPEC_CHKACLR:
         case UNSPEC_CHKS:
+	case UNSPEC_NOP_CONVERT:
 	  need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
 	  break;
 
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md	(revision 122189)
+++ config/ia64/ia64.md	(working copy)
@@ -81,6 +81,7 @@ (define_constants
    (UNSPEC_SHRP			29)
    (UNSPEC_COPYSIGN		30)
    (UNSPEC_VECT_EXTR		31)
+   (UNSPEC_NOP_CONVERT		32)
    (UNSPEC_LDA                  40)
    (UNSPEC_LDS                  41)
    (UNSPEC_LDSA                 42)
@@ -3108,60 +3109,6 @@ (define_insn_and_split "divsf3_internal_
 }
   [(set_attr "predicable" "no")])
 
-(define_insn_and_split "divsf3_internal_thr"
-  [(set (match_operand:SF 0 "fr_register_operand" "=&f")
-	(div:SF (match_operand:SF 1 "fr_register_operand" "f")
-		(match_operand:SF 2 "fr_register_operand" "f")))
-   (clobber (match_scratch:XF 3 "=&f"))
-   (clobber (match_scratch:XF 4 "=f"))
-   (clobber (match_scratch:BI 5 "=c"))]
-  "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
-  "#"
-  "&& reload_completed"
-  [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
-	      (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
-					    UNSPEC_FR_RECIP_APPROX))
-	      (use (const_int 0))])
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 3)
-		     (minus:XF (match_dup 10)
-			       (mult:XF (match_dup 8) (match_dup 6))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 3)
-		     (plus:XF (mult:XF (match_dup 3) (match_dup 3))
-			      (match_dup 3)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 6)
-		     (plus:XF (mult:XF (match_dup 3) (match_dup 6))
-			      (match_dup 6)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 9)
-		     (float_truncate:SF
-		       (mult:XF (match_dup 7) (match_dup 6))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 4)
-		     (minus:XF (match_dup 7)
-			       (mult:XF (match_dup 8) (match_dup 3))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (set (match_dup 0)
-	  (float_truncate:SF
-	    (plus:XF (mult:XF (match_dup 4) (match_dup 6))
-			      (match_dup 3)))))
-  ] 
-{
-  operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
-  operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
-  operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
-  operands[9] = gen_rtx_REG (SFmode, REGNO (operands[3]));
-  operands[10] = CONST1_RTX (XFmode);
-}
-  [(set_attr "predicable" "no")])
-
 ;; Inline square root.
 
 (define_insn "*sqrt_approx"
@@ -3614,72 +3561,6 @@ (define_insn_and_split "divdf3_internal_
 }
   [(set_attr "predicable" "no")])
 
-(define_insn_and_split "divdf3_internal_thr"
-  [(set (match_operand:DF 0 "fr_register_operand" "=&f")
-	(div:DF (match_operand:DF 1 "fr_register_operand" "f")
-		(match_operand:DF 2 "fr_register_operand" "f")))
-   (clobber (match_scratch:XF 3 "=&f"))
-   (clobber (match_scratch:DF 4 "=f"))
-   (clobber (match_scratch:BI 5 "=c"))]
-  "TARGET_INLINE_FLOAT_DIV == INL_MAX_THR"
-  "#"
-  "&& reload_completed"
-  [(parallel [(set (match_dup 6) (div:XF (const_int 1) (match_dup 8)))
-	      (set (match_dup 5) (unspec:BI [(match_dup 7) (match_dup 8)]
-					    UNSPEC_FR_RECIP_APPROX))
-	      (use (const_int 0))])
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 3)
-		     (minus:XF (match_dup 10)
-			       (mult:XF (match_dup 8) (match_dup 6))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 6)
-		     (plus:XF (mult:XF (match_dup 3) (match_dup 6))
-			      (match_dup 6)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 3)
-		     (mult:XF (match_dup 3) (match_dup 3)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 6)
-		     (plus:XF (mult:XF (match_dup 3) (match_dup 6))
-			      (match_dup 6)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 3)
-		     (mult:XF (match_dup 3) (match_dup 3)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 6)
-		     (plus:XF (mult:XF (match_dup 3) (match_dup 6))
-			      (match_dup 6)))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 9)
-		     (float_truncate:DF
-		       (mult:XF (match_dup 7) (match_dup 6))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (parallel [(set (match_dup 4)
-		     (minus:DF (match_dup 1)
-			       (mult:DF (match_dup 2) (match_dup 9))))
-		(use (const_int 1))]))
-   (cond_exec (ne (match_dup 5) (const_int 0))
-     (set (match_dup 0)
-	  (plus:DF (mult:DF (match_dup 4) (match_dup 0))
-			    (match_dup 9))))
-  ] 
-{
-  operands[6] = gen_rtx_REG (XFmode, REGNO (operands[0]));
-  operands[7] = gen_rtx_REG (XFmode, REGNO (operands[1]));
-  operands[8] = gen_rtx_REG (XFmode, REGNO (operands[2]));
-  operands[9] = gen_rtx_REG (DFmode, REGNO (operands[3]));
-  operands[10] = CONST1_RTX (XFmode);
-}
-  [(set_attr "predicable" "no")])
-
 ;; Inline square root.
 
 (define_expand "sqrtdf2"
@@ -6540,3 +6421,5 @@ (define_insn "ip_value"
 (include "vect.md")
 ;; Atomic operations
 (include "sync.md")
+;; New division operations
+(include "div.md")
Index: config/ia64/div.md
===================================================================
--- config/ia64/div.md	(revision 0)
+++ config/ia64/div.md	(revision 0)
@@ -0,0 +1,575 @@
+;; IA-64 machine description for inline division operations.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+
+;; For the internal conditional math routines:
+
+;; _a versions are when we want output to be op 0 if predicate is false.
+;; _b versions are when we don't care about output if predicate is false.
+
+;; operand 0 is always the result
+;; operand 1 is always the predicate
+;; operand 2, 3, and sometimes 4 are the input values.
+;; the next-to-last operand (4, 5, or 6) is the floating point status register to use.
+;; the last operand (5, 6, or 7) is the rounding to do. (0 = single, 1 = double, 2 = none)
+;;
+;; addrf3_cond_[ab]   - F0 = F2 + F3
+;; subrf3_cond_[ab]   - F0 = F2 - F3
+;; mulrf3_cond_[ab]   - F0 = F2 * F3
+;; nmulrf3_cond_[ab]  - F0 = - (F2 * F3)
+;; m1addrf4_cond_[ab] - F0 = (F2 * F3) + F4
+;; m1subrf4_cond_[ab] - F0 = (F2 * F3) - F4
+;; m2addrf4_cond_[ab] - F0 = F2 + (F3 * F4)
+;; m2subrf4_cond_[ab] - F0 = F2 - (F3 * F4)
+
+;; Basic plus/minus/mult operations
+
+(define_insn "*addrf3_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (match_operand:RF 4 "fr_register_operand" "0")))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fadd%R6.s%5 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*addrf3_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (const_int 0)))
+   (use (match_operand:SI 4 "const_int_operand" ""))
+   (use (match_operand:SI 5 "const_int_operand" ""))]
+  ""
+  "(%1) fadd%R5.s%4 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (match_operand:RF 4 "fr_register_operand" "0")))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fsub%R6.s%5 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*subrf3_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (const_int 0)))
+   (use (match_operand:SI 4 "const_int_operand" ""))
+   (use (match_operand:SI 5 "const_int_operand" ""))]
+  ""
+  "(%1) fsub%R5.s%4 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (mult:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (match_operand:RF 4 "fr_register_operand" "0")))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fmpy%R6.s%5 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*mulrf3_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (mult:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+          (const_int 0)))
+   (use (match_operand:SI 4 "const_int_operand" ""))
+   (use (match_operand:SI 5 "const_int_operand" ""))]
+  ""
+  "(%1) fmpy%R5.s%4 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+;; neg-mult operations
+
+(define_insn "*nmulrf3_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (neg:RF (mult:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+          (match_operand:RF 4 "fr_register_operand" "0")))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fnmpy%R6.s%5 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*nmulrf3_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (neg:RF (mult:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")))
+          (const_int 0)))
+   (use (match_operand:SI 4 "const_int_operand" ""))
+   (use (match_operand:SI 5 "const_int_operand" ""))]
+  ""
+  "(%1) fnmpy%R5.s%4 %0 = %F2, %F3"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op1)
+
+(define_insn "*m1addrf4_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (mult:RF
+              (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+            (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+          (match_operand:RF 5 "fr_register_operand" "0")))
+   (use (match_operand:SI 6 "const_int_operand" ""))
+   (use (match_operand:SI 7 "const_int_operand" ""))]
+  ""
+  "(%1) fma%R7.s%6 %0 = %F2, %F3, %F4"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m1addrf4_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (mult:RF
+              (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+            (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+          (const_int 0)))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fma%R6.s%5 %0 = %F2, %F3, %F4"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (mult:RF
+              (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+            (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+          (match_operand:RF 5 "fr_register_operand" "0")))
+   (use (match_operand:SI 6 "const_int_operand" ""))
+   (use (match_operand:SI 7 "const_int_operand" ""))]
+  ""
+  "(%1) fms%R7.s%6 %0 = %F2, %F3, %F4"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m1subrf4_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (mult:RF
+              (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG"))
+            (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG"))
+          (const_int 0)))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fms%R6.s%5 %0 = %F2, %F3, %F4"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+;; add-mult/sub-mult operations (mult as op2)
+
+(define_insn "*m2addrf4_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (mult:RF
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+          (match_operand:RF 5 "fr_register_operand" "0")))
+   (use (match_operand:SI 6 "const_int_operand" ""))
+   (use (match_operand:SI 7 "const_int_operand" ""))]
+  ""
+  "(%1) fma%R7.s%6 %0 = %F3, %F4, %F2"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m2addrf4_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (plus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (mult:RF
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+          (const_int 0)))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fma%R6.s%5 %0 = %F3, %F4, %F2"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_a"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (mult:RF
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+          (match_operand:RF 5 "fr_register_operand" "0")))
+   (use (match_operand:SI 6 "const_int_operand" ""))
+   (use (match_operand:SI 7 "const_int_operand" ""))]
+  ""
+  "(%1) fnma%R7.s%6 %0 = %F3, %F4, %F2"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+(define_insn "*m2subrf4_cond_b"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (if_then_else:RF (ne:RF (match_operand:BI 1 "register_operand"  "c")
+                                (const_int 0))
+          (minus:RF
+            (match_operand:RF 2 "fr_reg_or_fp01_operand" "fG")
+            (mult:RF
+              (match_operand:RF 3 "fr_reg_or_fp01_operand" "fG")
+              (match_operand:RF 4 "fr_reg_or_fp01_operand" "fG")))
+          (const_int 0)))
+   (use (match_operand:SI 5 "const_int_operand" ""))
+   (use (match_operand:SI 6 "const_int_operand" ""))]
+  ""
+  "(%1) fnma%R6.s%5 %0 = %F3, %F4, %F2"
+  [(set_attr "itanium_class" "fmac")
+   (set_attr "predicable" "no")])
+
+;; Conversions to/from RF and SF/DF/XF
+
+(define_mode_macro SDX_F [SF DF XF])
+
+(define_insn "*mov_trunc<mode>rf"
+  [(set (match_operand:SDX_F 0 "fr_register_operand" "=f")
+        (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "f")]
+                      UNSPEC_NOP_CONVERT))]
+  ""
+  "#"
+  [(set_attr "itanium_class" "fmisc")
+   (set_attr "predicable" "yes")])
+
+
+(define_insn "*mov_extendrf<mode>"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "f")]
+                   UNSPEC_NOP_CONVERT))]
+  ""
+  "#"
+  [(set_attr "itanium_class" "fmisc")
+   (set_attr "predicable" "yes")])
+
+(define_split
+  [(set (match_operand:SDX_F 0 "fr_register_operand" "")
+        (unspec:SDX_F [(match_operand:RF 1 "fr_register_operand" "")]
+                      UNSPEC_NOP_CONVERT))]
+   "reload_completed"
+   [(set (match_dup 0) (match_dup 2))]
+{
+   operands[2] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+})
+
+(define_split
+  [(set (match_operand:RF 0 "fr_register_operand" "")
+        (unspec:RF [(match_operand:SDX_F 1 "fr_register_operand" "")]
+                      UNSPEC_NOP_CONVERT))]
+   "reload_completed"
+   [(set (match_dup 0) (match_dup 2))]
+{
+   operands[2] = gen_rtx_REG (RFmode, REGNO (operands[1]));
+})
+
+;; Reciprocal approximation
+
+(define_insn "*recip_approx_rf"
+  [(set (match_operand:RF 0 "fr_register_operand" "=f")
+        (div:RF (match_operand:RF 2 "fr_register_operand" "f")
+                (match_operand:RF 3 "fr_register_operand" "f")))
+   (set (match_operand:BI 1 "register_operand" "=c")
+        (unspec:BI [(match_dup 2) (match_dup 3)] UNSPEC_FR_RECIP_APPROX))
+   (use (match_operand:SI 4 "const_int_operand" ""))]
+  ""
+  "frcpa.s%4 %0, %1 = %2, %3"
+  [(set_attr "itanium_class" "fmisc")
+   (set_attr "predicable" "no")])
+
+;; Single precision floating point division (high throughput)
+;; The algorithm:
+;;	y = 1 / b			OP3 = 1 / OP5
+;;	e = 1 - (b * y)			OP6 = OP15 - (OP5 * OP3)
+;;	y1 = y + (y * e)		OP7 = OP3 + (OP3 * OP6)
+;;	y2 = y + (y1 * e)		OP8 = OP3 + (OP7 * OP6)
+;;	q = single(a * y2)		OP9 = single(OP4 * OP8)
+;;	r = a - (q * b)        		OP10 = OP4 - (OP9 * OP5)
+;;	Q = single (q + (r * y2))	OP3 = single (OP9 + (OP10 * OP8))
+
+(define_expand "divsf3_internal_thr"
+  [
+
+;; Empty conversions to put inputs into RFmode
+
+   (set (match_dup 4)
+	(unspec:RF [(match_operand:SF 1 "fr_register_operand" "f")]
+		UNSPEC_NOP_CONVERT))
+   (set (match_dup 5)
+	(unspec:RF [(match_operand:SF 2 "fr_register_operand" "f")]
+		UNSPEC_NOP_CONVERT))
+
+;; y = 1 / b		OP3 = 1 / OP5
+   (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+              (set (match_dup 11) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+              (use (match_dup 12))])
+
+;; e = 1 - (b * y)		OP6 = OP15 - (OP5 * OP3)
+   (parallel [(set (match_dup 6)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (minus:RF (match_dup 15)
+                               (mult:RF (match_dup 5) (match_dup 3)))
+                     (const_int 0)))
+              (use (match_dup 13)) (use (match_dup 14))])
+;; y1 = y + (y * e)		OP7 = OP3 + (OP3 * OP6)
+   (parallel [(set (match_dup 7)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (plus:RF (match_dup 3)
+                              (mult:RF (match_dup 3) (match_dup 6)))
+                     (const_int 0)))
+              (use (match_dup 13)) (use (match_dup 14))])
+;; y2 = y + (y1 * e)		OP8 = OP3 + (OP7 * OP6)
+   (parallel [(set (match_dup 8)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (plus:RF (match_dup 3)
+                              (mult:RF (match_dup 7) (match_dup 6)))
+                     (const_int 0)))
+              (use (match_dup 13)) (use (match_dup 14))])
+;; q = single(a * y2)		OP9 = single(OP4 * OP8)
+   (parallel [(set (match_dup 9)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (mult:RF (match_dup 4) (match_dup 8))
+                     (const_int 0)))
+              (use (match_dup 13)) (use (match_dup 12))])
+;; r = a - (q * b)		OP10 = OP4 - (OP9 * OP5)
+   (parallel [(set (match_dup 10)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (minus:RF (match_dup 4)
+                               (mult:RF (match_dup 9) (match_dup 5)))
+                     (const_int 0)))
+              (use (match_dup 13)) (use (match_dup 14))])
+;; Q = single (q + (r * y2))	OP3 = single (OP9 + (OP10 * OP8))
+   (parallel [(set (match_dup 3)
+                   (if_then_else:RF (ne:RF (match_dup 11) (const_int 0))
+                     (plus:RF (match_dup 9)
+                              (mult:RF (match_dup 10) (match_dup 8)))
+                     (match_dup 3)))
+              (use (match_dup 12)) (use (match_dup 12))])
+
+   (set (match_operand:SF 0 "fr_register_operand" "=f")
+	(unspec:SF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+  ]
+  ""
+{
+  operands[3]  = gen_reg_rtx (RFmode);
+  operands[4]  = gen_reg_rtx (RFmode);
+  operands[5]  = gen_reg_rtx (RFmode);
+  operands[6]  = gen_reg_rtx (RFmode);
+  operands[7]  = gen_reg_rtx (RFmode);
+  operands[8]  = gen_reg_rtx (RFmode);
+  operands[9]  = gen_reg_rtx (RFmode);
+  operands[10] = gen_reg_rtx (RFmode);
+  operands[11] = gen_reg_rtx (BImode);
+  operands[12] = CONST0_RTX (SImode);
+  operands[13] = CONST1_RTX (SImode);
+  operands[14] = CONST2_RTX (SImode);
+  operands[15] = CONST1_RTX (RFmode);
+})
+
+
+;; Double precision floating point division (high throughput)
+;; The algorithm:
+;;	y  = 1 / b			OP3  = 1 / OP5
+;;	e  = 1 - (b * y)		OP6  = OP18 - (OP5 * OP3)
+;;	y1 = y + (y * e)		OP7  = OP3 + (OP3 * OP6)
+;;	e1 = e * e			OP8  = OP6 * OP6
+;;	y2 = y1 + (y1 * e1)		OP9  = OP7 + (OP7 * OP8)
+;;	e2 = e1 * e1			OP10 = OP8 * OP8
+;;	y3 = y2 + (y2 * e2)		OP11 = OP9 + (OP9 * OP10)
+;;	q  = double (a * y3)		OP12 = double (OP4 * OP11)
+;;	r  = a - (b * q)		OP13 = OP4 - (OP5 * OP12)
+;;	Q  = double (q + (r * y3))	OP3  = double (OP12 + (OP13 * OP11))
+
+(define_expand "divdf3_internal_thr"
+  [
+
+;; Empty conversions to put inputs into RFmode
+
+   (set (match_dup 4)
+	(unspec:RF [(match_operand:DF 1 "fr_register_operand" "f")]
+		UNSPEC_NOP_CONVERT))
+   (set (match_dup 5)
+	(unspec:RF [(match_operand:DF 2 "fr_register_operand" "f")]
+		UNSPEC_NOP_CONVERT))
+
+;;	y  = 1 / b			OP3  = 1 / OP5
+   (parallel [(set (match_dup 3) (div:RF (match_dup 4) (match_dup 5)))
+              (set (match_dup 14) (unspec:BI [(match_dup 4) (match_dup 5)] UNSPEC_FR_RECIP_APPROX))
+              (use (match_dup 15))])
+
+;;	e  = 1 - (b * y)		OP6  = OP18 - (OP5 * OP3)
+   (parallel [(set (match_dup 6)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (minus:RF (match_dup 18)
+                               (mult:RF (match_dup 5) (match_dup 3)))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	y1 = y + (y * e)		OP7  = OP3 + (OP3 * OP6)
+   (parallel [(set (match_dup 7)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (plus:RF (match_dup 3)
+                              (mult:RF (match_dup 3) (match_dup 6)))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	e1 = e * e			OP8  = OP6 * OP6
+   (parallel [(set (match_dup 8)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (mult:RF (match_dup 6) (match_dup 6))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	y2 = y1 + (y1 * e1)		OP9  = OP7 + (OP7 * OP8)
+   (parallel [(set (match_dup 9)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (plus:RF (match_dup 7)
+                              (mult:RF (match_dup 7) (match_dup 8)))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	e2 = e1 * e1			OP10 = OP8 * OP8
+   (parallel [(set (match_dup 10)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (mult:RF (match_dup 8) (match_dup 8))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	y3 = y2 + (y2 * e2)		OP11 = OP9 + (OP9 * OP10)
+   (parallel [(set (match_dup 11)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (plus:RF (match_dup 9)
+                              (mult:RF (match_dup 9) (match_dup 10)))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	q  = double (a * y3)		OP12 = double (OP4 * OP11)
+   (parallel [(set (match_dup 12)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (mult:RF (match_dup 4) (match_dup 11))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 16))])
+
+;;	r  = a - (b * q)		OP13 = OP4 - (OP5 * OP12)
+   (parallel [(set (match_dup 13)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (minus:RF (match_dup 4)
+                               (mult:RF (match_dup 5) (match_dup 12)))
+                     (const_int 0)))
+              (use (match_dup 16)) (use (match_dup 17))])
+
+;;	Q  = double (q + (r * y3))	OP3  = double (OP12 + (OP13 * OP11))
+   (parallel [(set (match_dup 3)
+                   (if_then_else:RF (ne:RF (match_dup 14) (const_int 0))
+                     (plus:RF (match_dup 12)
+                              (mult:RF (match_dup 13) (match_dup 11)))
+                     (match_dup 3)))
+              (use (match_dup 15)) (use (match_dup 16))])
+
+;; Do an 'empty' conversion back to DFmode
+
+   (set (match_operand:DF 0 "fr_register_operand" "=f")
+	(unspec:DF [(match_dup 3)] UNSPEC_NOP_CONVERT))
+  ]
+""
+{
+  operands[3]  = gen_reg_rtx (RFmode);
+  operands[4]  = gen_reg_rtx (RFmode);
+  operands[5]  = gen_reg_rtx (RFmode);
+  operands[6]  = gen_reg_rtx (RFmode);
+  operands[7]  = gen_reg_rtx (RFmode);
+  operands[8]  = gen_reg_rtx (RFmode);
+  operands[9]  = gen_reg_rtx (RFmode);
+  operands[10] = gen_reg_rtx (RFmode);
+  operands[11] = gen_reg_rtx (RFmode);
+  operands[12] = gen_reg_rtx (RFmode);
+  operands[13] = gen_reg_rtx (RFmode);
+  operands[14] = gen_reg_rtx (BImode);
+  operands[15] = CONST0_RTX (SImode);
+  operands[16] = CONST1_RTX (SImode);
+  operands[17] = CONST2_RTX (SImode);
+  operands[18] = CONST1_RTX (RFmode);
+})


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]