This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Athlon tuning...


Hi
This is a patch to tune some parameters for the AMD Athlon; basically it
corrects the values I guessed in my first patch, since they weren't publicly
available at that time.

The most interesting bits are the changes in the HI/QImode move patterns.  The
movw and movb instructions have an extra dependency on the destination (since
the CPU has to merge values); by changing them to full-sized moves we give the
out-of-order unit many more choices.  This brings instant speedups to almost
any code dealing with these types.

Also present are code to avoid partial memory stalls and code to avoid
integer moves in DFmode, since they are slower on Athlon, as well as various
fixes here and there (not necessarily Athlon specific).  In particular, there
is a fix in movhi/movqi to avoid partial register dependencies in the promoted
code (I sent a patch for this separately, but it has vanished into the ether).

Mon Apr 10 22:08:31 CEST 2000  Jan Hubicka  <jh@suse.cz>
	* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
	(x86_integer_DFmode_moves, x86_partial_reg_dependency,
	 x86_memory_mismatch_stall): New global variables.
	(ix86_adjust_cost): Handle MEMORY_BOTH in places where only MEMORY_LOAD
	was allowed; fix load penalties for Athlon.
	* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
	x86_memory_mismatch_stall): Declare.
	(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
	 TARGET_MEMORY_MISMATCH_STALL): New.
	* i386.md (athlon scheduling parameters): Fix latencies according
	to Athlon Optimization Manual.
	(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode
	to vector.
	(fsqrt instruction patterns): Set athlon_decode to direct.
	(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
	PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
	(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
	and TARGET_PARTIAL_REGISTER_STALL machines.
	(pushdf_nointeger): New pattern.
	(pushdf_integer): Rename from pushdf.
	(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
	(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.

*** /usr/src/egcs-20000306.orig1/gcc/config/i386/i386.c	Sun Apr  9 11:41:08 2000
--- i386.c	Mon Apr 10 21:08:22 2000
*************** struct processor_costs k6_cost = {
*** 177,188 ****
  
  struct processor_costs athlon_cost = {
    1,					/* cost of an add instruction */
!   1,					/* cost of a lea instruction */
    1,					/* variable shift costs */
    1,					/* constant shift costs */
    5,					/* cost of starting a multiply */
    0,					/* cost of multiply per each bit set */
!   19,					/* cost of a divide/mod */
    8,					/* "large" insn */
    9,					/* MOVE_RATIO */
    4,					/* cost for loading QImode using movzbl */
--- 177,188 ----
  
  struct processor_costs athlon_cost = {
    1,					/* cost of an add instruction */
!   2,					/* cost of a lea instruction */
    1,					/* variable shift costs */
    1,					/* constant shift costs */
    5,					/* cost of starting a multiply */
    0,					/* cost of multiply per each bit set */
!   42,					/* cost of a divide/mod */
    8,					/* "large" insn */
    9,					/* MOVE_RATIO */
    4,					/* cost for loading QImode using movzbl */
*************** struct processor_costs athlon_cost = {
*** 191,199 ****
  					   Relative to reg-reg move (2). */
    {2, 3, 2},				/* cost of storing integer registers */
    4,					/* cost of reg,reg fld/fst */
!   {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4},				/* cost of loading integer registers */
    64,					/* size of prefetch block */
    6					/* number of prefetches doable in
  					   parallel */
--- 191,199 ----
  					   Relative to reg-reg move (2). */
    {2, 3, 2},				/* cost of storing integer registers */
    4,					/* cost of reg,reg fld/fst */
!   {6, 6, 20},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 16},				/* cost of loading integer registers */
    64,					/* size of prefetch block */
    6					/* number of prefetches doable in
  					   parallel */
*************** const int x86_himode_math = ~(m_PPRO);
*** 237,242 ****
--- 237,245 ----
  const int x86_promote_hi_regs = m_PPRO;
  const int x86_3dNOW = m_ATHLON | m_K6;
  const int x86_SSE = m_ATHLON | m_PPRO;
+ const int x86_integer_DFmode_moves = ~m_ATHLON;
+ const int x86_partial_reg_dependency = m_ATHLON;
+ const int x86_memory_mismatch_stall = m_ATHLON;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 6338,6343 ****
--- 6359,6365 ----
       int cost;
  {
    enum attr_type insn_type, dep_insn_type;
+   enum attr_memory memory;
    rtx set, set2;
    int dep_insn_code_number;
  
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 6376,6382 ****
        /* Floating point stores require value to be ready one cycle ealier.  */
        if (insn_type == TYPE_FMOV 
  	  && get_attr_memory (insn) == MEMORY_STORE
! 	  && !ix86_agi_dependant (insn, dep_insn, insn_type))
  	cost += 1;
        break;
  
--- 6399,6405 ----
        /* Floating point stores require value to be ready one cycle ealier.  */
        if (insn_type == TYPE_FMOV 
  	  && get_attr_memory (insn) == MEMORY_STORE
! 	  && !ix86_agi_dependant (insn, dep_insn, insn_type, 0))
  	cost += 1;
        break;
  
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 6385,6391 ****
  	 increase the cost here for non-imov insns.  */
        if (dep_insn_type != TYPE_IMOV
  	  && dep_insn_type != TYPE_FMOV
! 	  && get_attr_memory (dep_insn) == MEMORY_LOAD)
  	cost += 1;
  
        /* INT->FP conversion is expensive.  */
--- 6408,6415 ----
  	 increase the cost here for non-imov insns.  */
        if (dep_insn_type != TYPE_IMOV
  	  && dep_insn_type != TYPE_FMOV
! 	  && ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
!               || memory == MEMORY_BOTH))
  	cost += 1;
  
        /* INT->FP conversion is expensive.  */
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 6410,6416 ****
  
        /* Since we can't represent delayed latencies of load+operation, 
  	 increase the cost here for non-imov insns.  */
!       if (get_attr_memory (dep_insn) == MEMORY_LOAD)
  	cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
  
        /* INT->FP conversion is expensive.  */
--- 6434,6441 ----
  
        /* Since we can't represent delayed latencies of load+operation, 
  	 increase the cost here for non-imov insns.  */
!       if ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
!           || memory == MEMORY_BOTH)
  	cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
  
        /* INT->FP conversion is expensive.  */
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 6419,6437 ****
        break;
  
      case PROCESSOR_ATHLON:
!       /* Address Generation Interlock cause problems on the Athlon CPU because
!          the loads and stores are done in order so once one load or store has
! 	 to wait, others must too, so penalize the AGIs slightly by one cycle.
! 	 We might experiment with this value later.  */
!       if (ix86_agi_dependant (insn, dep_insn, insn_type))
! 	cost += 1;
  
-       /* Since we can't represent delayed latencies of load+operation, 
- 	 increase the cost here for non-imov insns.  */
-       if (dep_insn_type != TYPE_IMOV
- 	  && dep_insn_type != TYPE_FMOV
- 	  && get_attr_memory (dep_insn) == MEMORY_LOAD)
- 	cost += 2;
      default:
        break;
      }
--- 6444,6458 ----
        break;
  
      case PROCESSOR_ATHLON:
!       if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
!            || memory == MEMORY_BOTH)
! 	{
! 	  if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
! 	    cost += 2;
! 	  else
! 	    cost += 3;
!         }
  
      default:
        break;
      }
*** /usr/src/egcs-20000306.orig1/gcc/config/i386/i386.h	Sun Apr  9 11:41:08 2000
--- i386.h	Mon Apr 10 20:39:30 2000
*************** extern const int x86_read_modify, x86_sp
*** 179,184 ****
  extern const int x86_promote_QImode, x86_single_stringop;
  extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
  extern const int x86_promote_hi_regs;
! extern const int x86_3dNOW, x86_SSE;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
--- 179,187 ----
  extern const int x86_promote_QImode, x86_single_stringop;
  extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
  extern const int x86_promote_hi_regs;
! extern const int x86_3dNOW, x86_SSE, x86_integer_DFmode_moves;
! extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_3dNOW, x86_SSE;
*** 207,212 ****
--- 209,217 ----
  #define TARGET_HIMODE_MATH (x86_himode_math & CPUMASK)
  #define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & CPUMASK)
  #define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & CPUMASK)
+ #define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
+ #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
+ #define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
*** /usr/src/egcs-20000306.orig1/gcc/config/i386/i386.md	Sun Apr  9 11:41:08 2000
--- i386.md	Mon Apr 10 22:22:08 2000
***************
*** 742,748 ****
  ;; communicates with all the execution units seperately instead.
  
  (define_attr "athlon_decode" "direct,vector"
!   (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str")
  	   (const_string "vector")
           (and (eq_attr "type" "push")
                (match_operand 1 "memory_operand" ""))
--- 761,767 ----
  ;; communicates with all the execution units seperately instead.
  
  (define_attr "athlon_decode" "direct,vector"
!   (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
  	   (const_string "vector")
           (and (eq_attr "type" "push")
                (match_operand 1 "memory_operand" ""))
***************
*** 770,776 ****
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld,prefetch"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
--- 789,795 ----
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop,prefetch"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
***************
*** 781,792 ****
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "imul"))
!   4 0)
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "idiv"))
!   27 0)
  
  (define_function_unit "athlon_muldiv" 1 0
    (and (eq_attr "cpu" "athlon")
--- 800,811 ----
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "imul"))
!   5 0)
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "idiv"))
!   42 0)
  
  (define_function_unit "athlon_muldiv" 1 0
    (and (eq_attr "cpu" "athlon")
***************
*** 796,851 ****
  (define_function_unit "athlon_muldiv" 1 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "idiv"))
!   27 27)
  
! (define_attr "athlon_fpunits" "none,store,mul,add,muladd,all"
    (cond [(eq_attr "type" "fop,fop1,fcmp")
  	   (const_string "add")
!          (eq_attr "type" "fmul,fdiv,fpspc,fsgn")
  	   (const_string "mul")
! 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "!none"))
  	   (const_string "store")
           (and (eq_attr "type" "fmov")
                (ior (match_operand:SI 1 "register_operand" "")
                     (match_operand 1 "immediate_operand" "")))
  	   (const_string "store")
           (eq_attr "type" "fmov")
! 	   (const_string "muladd")
!          (eq_attr "type" "fcmov")
! 	   (const_string "all")]
  	(const_string "none")))
  
! (define_function_unit "athlon_fp_mul" 1 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "mul,all"))
!   4 1)
  
! (define_function_unit "athlon_fp_add" 1 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "add,all"))
    4 1)
  
! (define_function_unit "athlon_fp_muladd" 2 0
    (and (eq_attr "cpu" "athlon")
         (and (eq_attr "type" "fmov")
!             (eq_attr "athlon_fpunits" "muladd,mul,add,all")))
    2 1)
  
  (define_function_unit "athlon_fp_muladd" 2 0
    (and (eq_attr "cpu" "athlon")
!        (and (eq_attr "type" "!fmov")
!             (eq_attr "athlon_fpunits" "muladd,mul,add,all")))
!   4 1)
  
  (define_function_unit "athlon_fp_store" 1 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "store,all"))
    1 1)
  
! (define_function_unit "athlon_agu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (and (eq_attr "memory" "!none")
!             (eq_attr "athlon_fpunits" "none")))
    1 1)
  
  
--- 815,932 ----
  (define_function_unit "athlon_muldiv" 1 0
    (and (eq_attr "cpu" "athlon")
         (eq_attr "type" "idiv"))
!   42 42)
  
! (define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
    (cond [(eq_attr "type" "fop,fop1,fcmp")
  	   (const_string "add")
!          (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
  	   (const_string "mul")
! 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
  	   (const_string "store")
+ 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
+ 	   (const_string "any")
           (and (eq_attr "type" "fmov")
                (ior (match_operand:SI 1 "register_operand" "")
                     (match_operand 1 "immediate_operand" "")))
  	   (const_string "store")
           (eq_attr "type" "fmov")
! 	   (const_string "muladd")]
  	(const_string "none")))
  
! ;; We use latencies of 1 for the unit definitions.  This is OK for modeling
! ;; collisions in execution units.  The real latencies are modeled in the "fp" pipeline.
! 
! ;; fsin, fcos: 96-192
! ;; fsincos: 107-211
! ;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
! (define_function_unit "athlon_fp" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fpspc"))
!   100 1)
  
! ;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fdiv"))
!   24 1)
! 
! (define_function_unit "athlon_fp" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fop,fop1,fmul"))
    4 1)
  
! ;; XFmode loads are slow.
! ;; XFmode store is slow too (8 cycles), but we don't need to model it, because
! ;; there are no dependent instructions.
! 
! (define_function_unit "athlon_fp" 3 0
    (and (eq_attr "cpu" "athlon")
         (and (eq_attr "type" "fmov")
! 	    (match_operand:XF 1 "memory_operand" "")))
!   10 1)
! 
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fmov,fsgn"))
    2 1)
  
+ ;; fcmp and ftst instructions
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (and (eq_attr "type" "fcmp")
+ 	    (eq_attr "athlon_decode" "direct")))
+   3 1)
+ 
+ ;; fcmpi instructions.
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (and (eq_attr "type" "fcmp")
+ 	    (eq_attr "athlon_decode" "vector")))
+   3 1)
+ 
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fcmov"))
+   7 1)
+ 
+ (define_function_unit "athlon_fp_mul" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "mul"))
+   1 1)
+ 
+ (define_function_unit "athlon_fp_add" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "add"))
+   1 1)
+ 
  (define_function_unit "athlon_fp_muladd" 2 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "muladd,mul,add"))
!   1 1)
  
  (define_function_unit "athlon_fp_store" 1 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "store"))
    1 1)
  
! ;; We don't need to model the Address Generation Unit, since we don't model
! ;; the re-order buffer yet and thus we never schedule more than three operations
! ;; at a time.  Later we may want to experiment with MD_SCHED macros modeling the
! ;; decoders independently of the functional units.
! 
! ;(define_function_unit "athlon_agu" 3 0
! ;  (and (eq_attr "cpu" "athlon")
! ;       (and (eq_attr "memory" "!none")
! ;            (eq_attr "athlon_fpunits" "none")))
! ;  1 1)
! 
! ;; Model load unit to avoid too long sequences of loads.  We don't need to
! ;; model store queue, since it is hardly going to be bottleneck.
! 
! (define_function_unit "athlon_load" 2 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "memory" "load,both"))
    1 1)
  
  
***************
*** 1259,1264 ****
--- 1340,1346 ----
    ""
    "sahf"
    [(set_attr "length" "1")
+    (set_attr "athlon_decode" "vector")
     (set_attr "ppro_uops" "one")])
  
  ;; Pentium Pro can do steps 1 through 3 in one go.
***************
*** 1376,1381 ****
--- 1458,1464 ----
    "xchg{l}\\t%1, %0"
    [(set_attr "type" "imov")
     (set_attr "pent_pair" "np")
+    (set_attr "athlon_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "movhi"
***************
*** 1423,1430 ****
  }"
    [(set (attr "type")
       (cond [(and (eq_attr "alternative" "0")
! 		 (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 		     (const_int 0)))
  	      (const_string "imov")
  	    (and (eq_attr "alternative" "1,2")
  		 (match_operand:HI 1 "aligned_operand" ""))
--- 1506,1515 ----
  }"
    [(set (attr "type")
       (cond [(and (eq_attr "alternative" "0")
! 		 (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 			  (const_int 0))
! 		      (eq (symbol_ref "TARGET_HIMODE_MATH")
! 			  (const_int 0))))
  	      (const_string "imov")
  	    (and (eq_attr "alternative" "1,2")
  		 (match_operand:HI 1 "aligned_operand" ""))
***************
*** 1442,1449 ****
  		  (match_operand:HI 1 "aligned_operand" ""))
  	       (const_string "0")
  	     (and (eq_attr "alternative" "0")
! 		  (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 		      (const_int 0)))
  	       (const_string "0")
  	    ]
  	    (const_string "1")))
--- 1527,1536 ----
  		  (match_operand:HI 1 "aligned_operand" ""))
  	       (const_string "0")
  	     (and (eq_attr "alternative" "0")
! 		  (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 			   (const_int 0))
! 		       (eq (symbol_ref "TARGET_HIMODE_MATH")
! 			   (const_int 0))))
  	       (const_string "0")
  	    ]
  	    (const_string "1")))
***************
*** 1533,1541 ****
    [(set_attr "type" "pop")
     (set_attr "length_prefix" "1")])
  
  (define_insn "*movqi_1"
!   [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q,r,?r,m")
! 	(match_operand:QI 1 "general_operand" "qn,qm,rn,qm,qn"))]
    "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
    "*
  {
--- 1620,1637 ----
    [(set_attr "type" "pop")
     (set_attr "length_prefix" "1")])
  
+ ;; The situation is quite tricky regarding when to choose a full sized (SImode) move
+ ;; over QImode moves.  For a Q_REG -> Q_REG move we use full size only for partial
+ ;; register dependency machines (such as AMD Athlon), where QImode moves introduce an
+ ;; extra dependency, and for partial register stall machines that don't use QImode
+ ;; patterns (where a QImode move causes a stall on the next instruction).
+ ;;
+ ;; For loads of Q_REG to NONQ_REG we use full sized moves, except on partial register
+ ;; stall machines that use QImode instructions, since a partial register stall can
+ ;; be caused there.  On those machines we use movzx instead.
  (define_insn "*movqi_1"
!   [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
! 	(match_operand:QI 1 "general_operand"      " q,qn,qm,q,rn,qm,qn"))]
    "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
    "*
  {
***************
*** 1546,1571 ****
  	abort ();
        return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\";
      default:
!       if (which_alternative == 2)
          return \"mov{l}\\t{%k1, %k0|%k0, %k1}\";
        else
          return \"mov{b}\\t{%1, %0|%0, %1}\";
      }
  }"
    [(set (attr "type")
!      (cond [(eq_attr "alternative" "3")
  	      (const_string "imovx")
  	    (and (ne (symbol_ref "TARGET_MOVX")
  		     (const_int 0))
! 		 (eq_attr "alternative" "1"))
  	      (const_string "imovx")
  	   ]
  	   (const_string "imov")))
      ; There's no place to override just the immediate length
      (set (attr "length")
        (cond [(and (eq_attr "type" "imov")
! 		  (and (eq_attr "alternative" "2")
! 		       (match_operand:HI 1 "immediate_operand" "")))
  	       (const_string "5")
  	    ]
  	    (const_string "*")))])
--- 1642,1691 ----
  	abort ();
        return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\";
      default:
!       if (which_alternative == 4 || which_alternative == 3
! 	  || (which_alternative == 1 && get_attr_length (insn) == 5)
! 	  || (which_alternative == 0
! 	      && ((TARGET_PARTIAL_REG_STALL && !TARGET_QIMODE_MATH)
! 		  || TARGET_PARTIAL_REG_DEPENDENCY)))
          return \"mov{l}\\t{%k1, %k0|%k0, %k1}\";
        else
          return \"mov{b}\\t{%1, %0|%0, %1}\";
      }
  }"
    [(set (attr "type")
!      (cond [(and (eq_attr "alternative" "3")
! 		 (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 			  (const_int 0))
! 		      (eq (symbol_ref "TARGET_QIMODE_MATH")
! 			  (const_int 0))))
! 	      (const_string "imov")
! 	    (eq_attr "alternative" "3,5")
  	      (const_string "imovx")
  	    (and (ne (symbol_ref "TARGET_MOVX")
  		     (const_int 0))
! 		 (eq_attr "alternative" "2"))
  	      (const_string "imovx")
  	   ]
  	   (const_string "imov")))
      ; There's no place to override just the immediate length
      (set (attr "length")
        (cond [(and (eq_attr "type" "imov")
! 		  (and (match_operand:HI 1 "immediate_operand" "")
! 		       (eq_attr "alternative" "4")))
! 	       (const_string "5")
! 	     ;; Avoid extra dependency on partial register.
! 	     (and (eq_attr "type" "imov")
! 		  (and (eq_attr "alternative" "1")
! 		       (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
! 			   (const_int 0))))
! 	       (const_string "5")
! 	     ;; Avoid partial register stalls when not using QImode arithmetic
! 	     (and (eq_attr "type" "imov")
! 		  (and (eq_attr "alternative" "1")
! 		       (and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL")
! 				(const_int 0))
! 			    (eq (symbol_ref "TARGET_QIMODE_MATH")
! 				(const_int 0)))))
  	       (const_string "5")
  	    ]
  	    (const_string "*")))])
***************
*** 1890,1899 ****
  ;; On the average, pushdf using integers can be still shorter.  Allow this
  ;; pattern for optimize_size too.
  
! (define_insn "*pushdf"
    [(set (match_operand:DF 0 "push_operand" "=<,<")
  	(match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))]
!   ""
    "*
  {
    switch (which_alternative)
--- 2010,2047 ----
  ;; On the average, pushdf using integers can be still shorter.  Allow this
  ;; pattern for optimize_size too.
  
! (define_insn "*pushdf_nointeger"
!   [(set (match_operand:DF 0 "push_operand" "=<,<,<")
! 	(match_operand:DF 1 "general_no_elim_operand" "f,Fo#f,*r#f"))]
!   "!TARGET_INTEGER_DFMODE_MOVES"
!   "*
! {
!   switch (which_alternative)
!     {
!     case 0:
!       /* %%% We loose REG_DEAD notes for controling pops if we split late.  */
!       operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx);
!       operands[2] = stack_pointer_rtx;
!       operands[3] = GEN_INT (8);
!       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
! 	return \"sub{l}\\t{%3, %2|%2, %3}\;fstp%z0\\t%y0\";
!       else
! 	return \"sub{l}\\t{%3, %2|%2, %3}\;fst%z0\\t%y0\";
! 
!     case 1:
!     case 2:
!       return \"#\";
! 
!     default:
!       abort ();
!     }
! }"
!   [(set_attr "type" "multi")])
! 
! (define_insn "*pushdf_integer"
    [(set (match_operand:DF 0 "push_operand" "=<,<")
  	(match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))]
!   "TARGET_INTEGER_DFMODE_MOVES"
    "*
  {
    switch (which_alternative)
***************
*** 1941,1947 ****
    [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o")
  	(match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))]
    "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
!    && optimize_size
     && (reload_in_progress || reload_completed
         || GET_CODE (operands[1]) != CONST_DOUBLE
         || memory_operand (operands[0], DFmode))" 
--- 2089,2095 ----
    [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o")
  	(match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))]
    "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
!    && (optimize_size || !TARGET_INTEGER_DFMODE_MOVES)
     && (reload_in_progress || reload_completed
         || GET_CODE (operands[1]) != CONST_DOUBLE
         || memory_operand (operands[0], DFmode))" 
***************
*** 1988,1994 ****
    [(set (match_operand:DF 0 "nonimmediate_operand" "=f#r,m,f#r,r#f,o")
  	(match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))]
    "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
!    && !optimize_size
     && (reload_in_progress || reload_completed
         || GET_CODE (operands[1]) != CONST_DOUBLE
         || memory_operand (operands[0], DFmode))" 
--- 2136,2142 ----
    [(set (match_operand:DF 0 "nonimmediate_operand" "=f#r,m,f#r,r#f,o")
  	(match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))]
    "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
!    && !optimize_size && TARGET_INTEGER_DFMODE_MOVES
     && (reload_in_progress || reload_completed
         || GET_CODE (operands[1]) != CONST_DOUBLE
         || memory_operand (operands[0], DFmode))" 
***************
*** 2290,2296 ****
    else
      return \"fxch\\t%0\";
  }"
!   [(set_attr "type" "fxch")])
  
  ;; Zero extension instructions
  
--- 2438,2445 ----
    else
      return \"fxch\\t%0\";
  }"
!   [(set_attr "type" "fxch")
!    (set_attr "athlon_decode" "vector")])
  
  ;; Zero extension instructions
  
***************
*** 3188,3193 ****
--- 3337,3343 ----
    "TARGET_80387"
    "fldcw\\t%0"
    [(set_attr "length_opcode" "2")
+    (set_attr "athlon_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  ;; Conversion between fixed point and floating point.
***************
*** 7677,7682 ****
--- 7828,7834 ----
    ""
    "leave"
    [(set_attr "length" "1")
+    (set_attr "athlon_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "ffssi2"
***************
*** 8109,8115 ****
  	(sqrt:SF (match_operand:SF 1 "register_operand" "0")))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "sqrtdf2"
    [(set (match_operand:DF 0 "register_operand" "=f")
--- 8261,8268 ----
  	(sqrt:SF (match_operand:SF 1 "register_operand" "0")))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "sqrtdf2"
    [(set (match_operand:DF 0 "register_operand" "=f")
***************
*** 8117,8123 ****
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387
     && (TARGET_IEEE_FP || flag_fast_math) "
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "*sqrtextendsfdf2"
    [(set (match_operand:DF 0 "register_operand" "=f")
--- 8270,8277 ----
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387
     && (TARGET_IEEE_FP || flag_fast_math) "
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "*sqrtextendsfdf2"
    [(set (match_operand:DF 0 "register_operand" "=f")
***************
*** 8125,8131 ****
  		  (match_operand:SF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "sqrtxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
--- 8279,8286 ----
  		  (match_operand:SF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "sqrtxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 8133,8139 ****
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387 
     && (TARGET_IEEE_FP || flag_fast_math) "
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "*sqrtextenddfxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
--- 8288,8295 ----
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387 
     && (TARGET_IEEE_FP || flag_fast_math) "
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "*sqrtextenddfxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 8141,8147 ****
  		  (match_operand:DF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "*sqrtextendsfxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
--- 8297,8304 ----
  		  (match_operand:DF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "*sqrtextendsfxf2"
    [(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 8149,8155 ****
  		  (match_operand:SF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")])
  
  (define_insn "sindf2"
    [(set (match_operand:DF 0 "register_operand" "=f")
--- 8306,8313 ----
  		  (match_operand:SF 1 "register_operand" "0"))))]
    "! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
    "fsqrt"
!   [(set_attr "type" "fpspc")
!    (set_attr "athlon_decode" "direct")])
  
  (define_insn "sindf2"
    [(set (match_operand:DF 0 "register_operand" "=f")

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]