This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

ia32 scheduling




Per David M's recommendation, this patch moves scheduling information for
the various processors into their own md files.

	* athlon.md, k6.md, pentium.md, ppro.md: New files.
	* i386.md: Move scheduling information into new files.

Index: athlon.md
===================================================================
RCS file: athlon.md
diff -N athlon.md
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- athlon.md	9 May 2002 23:15:01 -0000
***************
*** 0 ****
--- 1,206 ----
+ ;; AMD Athlon Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING.  If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ (define_attr "athlon_decode" "direct,vector"
+   (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
+ 	   (const_string "vector")
+          (and (eq_attr "type" "push")
+               (match_operand 1 "memory_operand" ""))
+ 	   (const_string "vector")
+          (and (eq_attr "type" "fmov")
+ 	      (and (eq_attr "memory" "load,store")
+ 		   (eq_attr "mode" "XF")))
+ 	   (const_string "vector")]
+ 	(const_string "direct")))
+ 
+ ;; The Athlon does contain three pipelined FP units, three integer units and
+ ;; three address generation units. 
+ ;;
+ ;; The predecode logic is determining boundaries of instructions in the 64
+ ;; byte cache line. So the cache line straddling problem of K6 might be issue
+ ;; here as well, but it is not noted in the documentation.
+ ;;
+ ;; Three DirectPath instructions decoders and only one VectorPath decoder
+ ;; is available. They can decode three DirectPath instructions or one VectorPath
+ ;; instruction per cycle.
+ ;; Decoded macro instructions are then passed to 72 entry instruction control
+ ;; unit, that passes
+ ;; it to the specialized integer (18 entry) and fp (36 entry) schedulers.
+ ;;
+ ;; The load/store queue unit is not attached to the schedulers but
+ ;; communicates with all the execution units separately instead.
+ 
+ (define_function_unit "athlon_vectordec" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_decode" "vector"))
+   1 1)
+ 
+ (define_function_unit "athlon_directdec" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_decode" "direct"))
+   1 1)
+ 
+ (define_function_unit "athlon_vectordec" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_decode" "direct"))
+   1 1 [(eq_attr "athlon_decode" "vector")])
+ 
+ (define_function_unit "athlon_ieu" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
+   1 1)
+ 
+ (define_function_unit "athlon_ieu" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "str"))
+   15 15)
+ 
+ (define_function_unit "athlon_ieu" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "imul"))
+   5 0)
+ 
+ (define_function_unit "athlon_ieu" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "idiv"))
+   42 0)
+ 
+ (define_function_unit "athlon_muldiv" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "imul"))
+   5 0)
+ 
+ (define_function_unit "athlon_muldiv" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "idiv"))
+   42 42)
+ 
+ (define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
+   (cond [(eq_attr "type" "fop,fcmp,fistp")
+ 	   (const_string "add")
+          (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
+ 	   (const_string "mul")
+ 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
+ 	   (const_string "store")
+ 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
+ 	   (const_string "any")
+          (and (eq_attr "type" "fmov")
+               (ior (match_operand:SI 1 "register_operand" "")
+                    (match_operand 1 "immediate_operand" "")))
+ 	   (const_string "store")
+          (eq_attr "type" "fmov")
+ 	   (const_string "muladd")]
+ 	(const_string "none")))
+ 
+ ;; We use latencies 1 for definitions.  This is OK to model collisions
+ ;; in execution units.  The real latencies are modeled in the "fp" pipeline.
+ 
+ ;; fsin, fcos: 96-192
+ ;; fsincos: 107-211
+ ;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fpspc"))
+   100 1)
+ 
+ ;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fdiv"))
+   24 1)
+ 
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fop,fmul,fistp"))
+   4 1)
+ 
+ ;; XFmode loads are slow.
+ ;; XFmode store is slow too (8 cycles), but we don't need to model it, because
+ ;; there are no dependent instructions.
+ 
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (and (eq_attr "type" "fmov")
+ 	    (and (eq_attr "memory" "load")
+ 		 (eq_attr "mode" "XF"))))
+   10 1)
+ 
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fmov,fsgn"))
+   2 1)
+ 
+ ;; fcmp and ftst instructions
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (and (eq_attr "type" "fcmp")
+ 	    (eq_attr "athlon_decode" "direct")))
+   3 1)
+ 
+ ;; fcmpi instructions.
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (and (eq_attr "type" "fcmp")
+ 	    (eq_attr "athlon_decode" "vector")))
+   3 1)
+ 
+ (define_function_unit "athlon_fp" 3 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "type" "fcmov"))
+   7 1)
+ 
+ (define_function_unit "athlon_fp_mul" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "mul"))
+   1 1)
+ 
+ (define_function_unit "athlon_fp_add" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "add"))
+   1 1)
+ 
+ (define_function_unit "athlon_fp_muladd" 2 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "muladd,mul,add"))
+   1 1)
+ 
+ (define_function_unit "athlon_fp_store" 1 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "athlon_fpunits" "store"))
+   1 1)
+ 
+ ;; We don't need to model the Address Generation Unit, since we don't model
+ ;; the re-order buffer yet and thus we never schedule more than three operations
+ ;; at time.  Later we may want to experiment with MD_SCHED macros modeling the
+ ;; decoders independently on the functional units.
+ 
+ ;(define_function_unit "athlon_agu" 3 0
+ ;  (and (eq_attr "cpu" "athlon")
+ ;       (and (eq_attr "memory" "!none")
+ ;            (eq_attr "athlon_fpunits" "none")))
+ ;  1 1)
+ 
+ ;; Model load unit to avoid too long sequences of loads.  We don't need to
+ ;; model store queue, since it is hardly going to be bottleneck.
+ 
+ (define_function_unit "athlon_load" 2 0
+   (and (eq_attr "cpu" "athlon")
+        (eq_attr "memory" "load,both"))
+   1 1)
+ 
Index: i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.355
diff -c -3 -p -r1.355 i386.md
*** i386.md	9 May 2002 17:58:09 -0000	1.355
--- i386.md	9 May 2002 23:15:26 -0000
***************
*** 322,1056 ****
    [(set_attr "length" "128")
     (set_attr "type" "multi")])
  
! ;; Pentium Scheduling
! ;;
! ;; The Pentium is an in-order core with two integer pipelines.
! 
! ;; True for insns that behave like prefixed insns on the Pentium.
! (define_attr "pent_prefix" "false,true"
!   (if_then_else (ior (eq_attr "prefix_0f" "1")
!   		     (ior (eq_attr "prefix_data16" "1")
! 			  (eq_attr "prefix_rep" "1")))
!     (const_string "true")
!     (const_string "false")))
! 
! ;; Categorize how an instruction slots.
! 
! ;; The non-MMX Pentium slots an instruction with prefixes on U pipe only,
! ;; while MMX Pentium can slot it on either U or V.  Model non-MMX Pentium
! ;; rules, because it results in noticeably better code on non-MMX Pentium
! ;; and doesn't hurt much on MMX.  (Prefixed instructions are not very
! ;; common, so the scheduler usualy has a non-prefixed insn to pair).
! 
! (define_attr "pent_pair" "uv,pu,pv,np"
!   (cond [(eq_attr "imm_disp" "true")
! 	   (const_string "np")
! 	 (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec")
! 	      (and (eq_attr "type" "pop,push")
! 		   (eq_attr "memory" "!both")))
! 	   (if_then_else (eq_attr "pent_prefix" "true")
! 	     (const_string "pu")
! 	     (const_string "uv"))
! 	 (eq_attr "type" "ibr")
! 	   (const_string "pv")
! 	 (and (eq_attr "type" "ishift")
! 	      (match_operand 2 "const_int_operand" ""))
! 	   (const_string "pu")
! 	 (and (eq_attr "type" "rotate")
! 	      (match_operand 2 "const_int_1_operand" ""))
! 	   (const_string "pu")
! 	 (and (eq_attr "type" "call")
! 	      (match_operand 0 "constant_call_address_operand" ""))
! 	   (const_string "pv")
! 	 (and (eq_attr "type" "callv")
! 	      (match_operand 1 "constant_call_address_operand" ""))
! 	   (const_string "pv")
! 	]
! 	(const_string "np")))
! 
! (define_automaton "pentium,pentium_fpu")
! 
! ;; Pentium do have U and V pipes.  Instruction to both pipes
! ;; are alwyas issued together, much like on VLIW.
! ;;
! ;;                    predecode
! ;;                   /         \
! ;;               decodeu     decodev
! ;;             /    |           |
! ;;           fpu executeu    executev
! ;;            |     |           |
! ;;           fpu  retire     retire
! ;;            |
! ;;           fpu
! ;; We add dummy "port" pipes allocated only first cycle of
! ;; instruction to specify this behaviour.
! 
! (define_cpu_unit "pentium-portu,pentium-portv" "pentium")
! (define_cpu_unit "pentium-u,pentium-v" "pentium")
! (absence_set "pentium-portu" "pentium-u,pentium-v")
! (presence_set "pentium-portv" "pentium-portu")
! 
! ;; Floating point instructions can overlap with new issue of integer
! ;; instructions.  We model only first cycle of FP pipeline, as it is
! ;; fully pipelined.
! (define_cpu_unit "pentium-fp" "pentium_fpu")
! 
! ;; There is non-pipelined multiplier unit used for complex operations.
! (define_cpu_unit "pentium-fmul" "pentium_fpu")
! 
! ;; Pentium preserves memory ordering, so when load-execute-store
! ;; instruction is executed together with other instruction loading
! ;; data, the execution of the other instruction is delayed to very
! ;; last cycle of first instruction, when data are bypassed.
! ;; We model this by allocating "memory" unit when store is pending
! ;; and using conflicting load units together.
! 
! (define_cpu_unit "pentium-memory" "pentium")
! (define_cpu_unit "pentium-load0" "pentium")
! (define_cpu_unit "pentium-load1" "pentium")
! (absence_set "pentium-load0,pentium-load1" "pentium-memory")
! 
! (define_reservation "pentium-load" "(pentium-load0 | pentium-load1)")
! (define_reservation "pentium-np" "(pentium-u + pentium-v)")
! (define_reservation "pentium-uv" "(pentium-u | pentium-v)")
! (define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)")
! (define_reservation "pentium-firstu" "(pentium-u + pentium-portu)")
! (define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)")
! (define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)")
! (define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)")
! (define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)")
! (define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv)
! 					   | (pentium-firstv,pentium-v,
! 					      (pentium-load+pentium-firstv))")
! (define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu
! 					   + pentium-memory)")
! (define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstu
! 					   + pentium-memory)")
! (define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv
! 					    + pentium-memory)
! 					   | (pentium-firstv,pentium-v,
! 					      (pentium-load+pentium-firstv))")
! 
! ;; Few common long latency instructions
! (define_insn_reservation "pent_mul" 11
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "imul"))
!   "pentium-np*11")
! 
! (define_insn_reservation "pent_str" 12
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "str"))
!   "pentium-np*12")
! 
! ;; Integer division and some other long latency instruction block all
! ;; units, including the FP pipe.  There is no value in modeling the
! ;; latency of these instructions and not modeling the latency
! ;; decreases the size of the DFA.
! (define_insn_reservation "pent_block" 1
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "idiv"))
!   "pentium-np+pentium-fp")
! 
! (define_insn_reservation "pent_cld" 2
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "cld"))
!   "pentium-np*2")
! 
! ;;  Moves usually have one cycle penalty, but there are exceptions.
! (define_insn_reservation "pent_fmov" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "type" "fmov")
! 	    (eq_attr "memory" "none,load")))
!   "(pentium-fp+pentium-np)")
! 
! (define_insn_reservation "pent_fpmovxf" 3
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "type" "fmov")
! 	    (and (eq_attr "memory" "load,store")
! 		 (eq_attr "mode" "XF"))))
!   "(pentium-fp+pentium-np)*3")
! 
! (define_insn_reservation "pent_fpstore" 2
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "type" "fmov")
! 	    (ior (match_operand 1 "immediate_operand" "")
! 		 (eq_attr "memory" "store"))))
!   "(pentium-fp+pentium-np)*2")
! 
! (define_insn_reservation "pent_imov" 1
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "imov"))
!   "pentium-firstuv")
! 
! ;; Push and pop instructions have 1 cycle latency and special
! ;; hardware bypass allows them to be paired with other push,pop
! ;; and call instructions.
! (define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call")
! (define_insn_reservation "pent_push" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "type" "push")
! 	    (eq_attr "memory" "store")))
!   "pentium-firstuv")
! 
! (define_insn_reservation "pent_pop" 1
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "pop"))
!   "pentium-firstuv")
! 
! ;; Call and branch instruction can execute in either pipe, but
! ;; they are only pairable when in the v pipe.
! (define_insn_reservation "pent_call" 10
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "call,callv"))
!   "pentium-firstv,pentium-v*9")
! 
! (define_insn_reservation "pent_branch" 1
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "ibr"))
!   "pentium-firstv")
! 
! ;; Floating point instruction dispatch in U pipe, but continue
! ;; in FP pipeline allowing other isntructions to be executed.
! (define_insn_reservation "pent_fp" 3
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "fop,fistp"))
!   "(pentium-firstu+pentium-fp),nothing,nothing")
! 
! ;; First two cycles of fmul are not pipelined.
! (define_insn_reservation "pent_fmul" 3
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "fmul"))
!   "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing")
! 
! ;; Long latency FP instructions overlap with integer instructions,
! ;; but only last 2 cycles with FP ones.
! (define_insn_reservation "pent_fdiv" 39
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "fdiv"))
!   "(pentium-np+pentium-fp+pentium-fmul),
!    (pentium-fp+pentium-fmul)*36,pentium-fmul*2")
! 
! (define_insn_reservation "pent_fpspc" 70
!   (and (eq_attr "cpu" "pentium")
!        (eq_attr "type" "fpspc"))
!   "(pentium-np+pentium-fp+pentium-fmul),
!    (pentium-fp+pentium-fmul)*67,pentium-fmul*2")
! 
! ;; Integer instructions.  Load/execute/store takes 3 cycles,
! ;; load/execute 2 cycles and execute only one cycle.
! (define_insn_reservation "pent_uv_both" 3
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "uv")
! 	    (eq_attr "memory" "both")))
!   "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv")
! 
! (define_insn_reservation "pent_u_both" 3
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pu")
! 	    (eq_attr "memory" "both")))
!   "pentium-firstuboth,pentium-u+pentium-memory,pentium-u")
! 
! (define_insn_reservation "pent_v_both" 3
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pv")
! 	    (eq_attr "memory" "both")))
!   "pentium-firstvboth,pentium-v+pentium-memory,pentium-v")
! 
! (define_insn_reservation "pent_np_both" 3
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "np")
! 	    (eq_attr "memory" "both")))
!   "pentium-np,pentium-np,pentium-np")
! 
! (define_insn_reservation "pent_uv_load" 2
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "uv")
! 	    (eq_attr "memory" "load")))
!   "pentium-firstuvload,pentium-uv")
! 
! (define_insn_reservation "pent_u_load" 2
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pu")
! 	    (eq_attr "memory" "load")))
!   "pentium-firstuload,pentium-u")
! 
! (define_insn_reservation "pent_v_load" 2
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pv")
! 	    (eq_attr "memory" "load")))
!   "pentium-firstvload,pentium-v")
! 
! (define_insn_reservation "pent_np_load" 2
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "np")
! 	    (eq_attr "memory" "load")))
!   "pentium-np,pentium-np")
! 
! (define_insn_reservation "pent_uv" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "uv")
! 	    (eq_attr "memory" "none")))
!   "pentium-firstuv")
! 
! (define_insn_reservation "pent_u" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pu")
! 	    (eq_attr "memory" "none")))
!   "pentium-firstu")
! 
! (define_insn_reservation "pent_v" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "pv")
! 	    (eq_attr "memory" "none")))
!   "pentium-firstv")
! 
! (define_insn_reservation "pent_np" 1
!   (and (eq_attr "cpu" "pentium")
!        (and (eq_attr "pent_pair" "np")
! 	    (eq_attr "memory" "none")))
!   "pentium-np")
! 
! 
! ;; Pentium Pro/PII Scheduling
! ;;
! ;; The PPro has an out-of-order core, but the instruction decoders are
! ;; naturally in-order and asymmetric.  We get best performance by scheduling
! ;; for the decoders, for in doing so we give the oo execution unit the 
! ;; most choices.
! 
! ;; Categorize how many uops an ia32 instruction evaluates to:
! ;;   one --  an instruction with 1 uop can be decoded by any of the
! ;;           three decoders.
! ;;   few --  an instruction with 1 to 4 uops can be decoded only by 
! ;;	     decoder 0.
! ;;   many -- a complex instruction may take an unspecified number of
! ;;	     cycles to decode in decoder 0.
! 
! (define_attr "ppro_uops" "one,few,many"
!   (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
! 	   (const_string "many")
! 	 (eq_attr "type" "icmov,fcmov,str,cld")
! 	   (const_string "few")
! 	 (eq_attr "type" "imov")
! 	   (if_then_else (eq_attr "memory" "store,both")
! 	     (const_string "few")
! 	     (const_string "one"))
! 	 (eq_attr "memory" "!none")
! 	   (const_string "few")
! 	]
! 	(const_string "one")))
! 
! ;; Rough readiness numbers.  Fine tuning happens in i386.c.
! ;;
! ;; p0	describes port 0.
! ;; p01	describes ports 0 and 1 as a pair; alu insns can issue to either.
! ;; p2	describes port 2 for loads.
! ;; p34	describes ports 3 and 4 for stores.
! ;; fpu	describes the fpu accessed via port 0. 
! ;;	??? It is less than clear if there are separate fadd and fmul units
! ;;	that could operate in parallel.
! ;;
! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "ishift,rotate,lea,ibr,cld"))
!   1 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "imul"))
!   4 1)
! 
! ;; ??? Does the divider lock out the pipe while it works,
! ;; or is there a disconnected unit?
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "idiv"))
!   17 17)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fop,fsgn,fistp"))
!   3 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fcmov"))
!   2 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fcmp"))
!   1 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fmov"))
!   1 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fmul"))
!   5 1)
! 
! (define_function_unit "ppro_p0" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fdiv,fpspc"))
!   56 1)
! 
! (define_function_unit "ppro_p01" 2 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "!imov,fmov"))
!   1 1)
! 
! (define_function_unit "ppro_p01" 2 0
!   (and (and (eq_attr "cpu" "pentiumpro")
!             (eq_attr "type" "imov,fmov"))
!        (eq_attr "memory" "none"))
!   1 1)
! 
! (define_function_unit "ppro_p2" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (ior (eq_attr "type" "pop")
! 	    (eq_attr "memory" "load,both")))
!   3 1)
! 
! (define_function_unit "ppro_p34" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (ior (eq_attr "type" "push")
! 	    (eq_attr "memory" "store,both")))
!   1 1)
! 
! (define_function_unit "fpu" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
!   1 1)
! 
! (define_function_unit "fpu" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fmul"))
!   5 2)
! 
! (define_function_unit "fpu" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "fdiv,fpspc"))
!   56 56)
! 
! ;; imul uses the fpu.  ??? does it have the same throughput as fmul?
! (define_function_unit "fpu" 1 0
!   (and (eq_attr "cpu" "pentiumpro")
!        (eq_attr "type" "imul"))
!   4 1)
! 
! ;; AMD K6/K6-2 Scheduling
! ;;
! ;; The K6 has similar architecture to PPro.  Important difference is, that
! ;; there are only two decoders and they seems to be much slower than execution
! ;; units.  So we have to pay much more attention to proper decoding for
! ;; schedulers.  We share most of scheduler code for PPro in i386.c
! ;;
! ;; The fp unit is not pipelined and do one operation per two cycles including
! ;; the FXCH.
! ;;
! ;; alu	  describes both ALU units (ALU-X and ALU-Y).
! ;; alux   describes X alu unit
! ;; fpu    describes FPU unit
! ;; load   describes load unit.
! ;; branch describes branch unit.
! ;; store  decsribes store unit.  This unit is not modelled completely and only
! ;;        used to model lea operation.  Otherwise it lie outside of the critical
! ;;        path.
! ;;
! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
! 
! ;; The decoder specification is in the PPro section above!
! 
! ;; Shift instructions and certain arithmetic are issued only to X pipe.
! (define_function_unit "k6_alux" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,rotate,alu1,negnot,cld"))
!   1 1)
! 
! ;; The QI mode arithmetic is issued to X pipe only.
! (define_function_unit "k6_alux" 1 0
!   (and (eq_attr "cpu" "k6")
!        (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec")
! 	    (match_operand:QI 0 "general_operand" "")))
!   1 1)
! 
! (define_function_unit "k6_alu" 2 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea"))
!   1 1)
! 
! (define_function_unit "k6_alu" 2 0
!   (and (eq_attr "cpu" "k6")
!        (and (eq_attr "type" "imov")
!        	    (eq_attr "memory" "none")))
!   1 1)
! 
! (define_function_unit "k6_branch" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "call,callv,ibr"))
!   1 1)
! 
! ;; Load unit have two cycle latency, but we take care for it in adjust_cost
! (define_function_unit "k6_load" 1 0
!   (and (eq_attr "cpu" "k6")
!        (ior (eq_attr "type" "pop")
! 	    (eq_attr "memory" "load,both")))
!   1 1)
! 
! (define_function_unit "k6_load" 1 0
!   (and (eq_attr "cpu" "k6")
!        (and (eq_attr "type" "str")
! 	    (eq_attr "memory" "load,both")))
!   10 10)
! 
! ;; Lea have two instructions, so latency is probably 2
! (define_function_unit "k6_store" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "lea"))
!   2 1)
! 
! (define_function_unit "k6_store" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "str"))
!   10 10)
! 
! (define_function_unit "k6_store" 1 0
!   (and (eq_attr "cpu" "k6")
!        (ior (eq_attr "type" "push")
! 	    (eq_attr "memory" "store,both")))
!   1 1)
! 
! (define_function_unit "k6_fpu" 1 1
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "fop,fmov,fcmp,fistp"))
!   2 2)
! 
! (define_function_unit "k6_fpu" 1 1
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "fmul"))
!   2 2)
! 
! ;; ??? Guess
! (define_function_unit "k6_fpu" 1 1
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "fdiv,fpspc"))
!   56 56)
! 
! (define_function_unit "k6_alu" 2 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "imul"))
!   2 2)
! 
! (define_function_unit "k6_alux" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "imul"))
!   2 2)
! 
! ;; ??? Guess
! (define_function_unit "k6_alu" 2 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "idiv"))
!   17 17)
! 
! (define_function_unit "k6_alux" 1 0
!   (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "idiv"))
!   17 17)
! 
! ;; AMD Athlon Scheduling
! ;;
! ;; The Athlon does contain three pipelined FP units, three integer units and
! ;; three address generation units. 
! ;;
! ;; The predecode logic is determining boundaries of instructions in the 64
! ;; byte cache line. So the cache line straddling problem of K6 might be issue
! ;; here as well, but it is not noted in the documentation.
! ;;
! ;; Three DirectPath instructions decoders and only one VectorPath decoder
! ;; is available. They can decode three DirectPath instructions or one VectorPath
! ;; instruction per cycle.
! ;; Decoded macro instructions are then passed to 72 entry instruction control
! ;; unit, that passes
! ;; it to the specialized integer (18 entry) and fp (36 entry) schedulers.
! ;;
! ;; The load/store queue unit is not attached to the schedulers but
! ;; communicates with all the execution units separately instead.
! 
! (define_attr "athlon_decode" "direct,vector"
!   (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
! 	   (const_string "vector")
!          (and (eq_attr "type" "push")
!               (match_operand 1 "memory_operand" ""))
! 	   (const_string "vector")
!          (and (eq_attr "type" "fmov")
! 	      (and (eq_attr "memory" "load,store")
! 		   (eq_attr "mode" "XF")))
! 	   (const_string "vector")]
! 	(const_string "direct")))
! 
! (define_function_unit "athlon_vectordec" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_decode" "vector"))
!   1 1)
! 
! (define_function_unit "athlon_directdec" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_decode" "direct"))
!   1 1)
! 
! (define_function_unit "athlon_vectordec" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_decode" "direct"))
!   1 1 [(eq_attr "athlon_decode" "vector")])
! 
! (define_function_unit "athlon_ieu" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
!   1 1)
! 
! (define_function_unit "athlon_ieu" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "str"))
!   15 15)
! 
! (define_function_unit "athlon_ieu" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "imul"))
!   5 0)
! 
! (define_function_unit "athlon_ieu" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "idiv"))
!   42 0)
! 
! (define_function_unit "athlon_muldiv" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "imul"))
!   5 0)
! 
! (define_function_unit "athlon_muldiv" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "idiv"))
!   42 42)
! 
! (define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
!   (cond [(eq_attr "type" "fop,fcmp,fistp")
! 	   (const_string "add")
!          (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
! 	   (const_string "mul")
! 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
! 	   (const_string "store")
! 	 (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
! 	   (const_string "any")
!          (and (eq_attr "type" "fmov")
!               (ior (match_operand:SI 1 "register_operand" "")
!                    (match_operand 1 "immediate_operand" "")))
! 	   (const_string "store")
!          (eq_attr "type" "fmov")
! 	   (const_string "muladd")]
! 	(const_string "none")))
! 
! ;; We use latencies 1 for definitions.  This is OK to model colisions
! ;; in execution units.  The real latencies are modeled in the "fp" pipeline.
! 
! ;; fsin, fcos: 96-192
! ;; fsincos: 107-211
! ;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fpspc"))
!   100 1)
! 
! ;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fdiv"))
!   24 1)
! 
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fop,fmul,fistp"))
!   4 1)
! 
! ;; XFmode loads are slow.
! ;; XFmode store is slow too (8 cycles), but we don't need to model it, because
! ;; there are no dependent instructions.
! 
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (and (eq_attr "type" "fmov")
! 	    (and (eq_attr "memory" "load")
! 		 (eq_attr "mode" "XF"))))
!   10 1)
! 
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fmov,fsgn"))
!   2 1)
! 
! ;; fcmp and ftst instructions
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (and (eq_attr "type" "fcmp")
! 	    (eq_attr "athlon_decode" "direct")))
!   3 1)
! 
! ;; fcmpi instructions.
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (and (eq_attr "type" "fcmp")
! 	    (eq_attr "athlon_decode" "vector")))
!   3 1)
! 
! (define_function_unit "athlon_fp" 3 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "fcmov"))
!   7 1)
! 
! (define_function_unit "athlon_fp_mul" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "mul"))
!   1 1)
! 
! (define_function_unit "athlon_fp_add" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "add"))
!   1 1)
! 
! (define_function_unit "athlon_fp_muladd" 2 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "muladd,mul,add"))
!   1 1)
! 
! (define_function_unit "athlon_fp_store" 1 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "athlon_fpunits" "store"))
!   1 1)
! 
! ;; We don't need to model the Address Generation Unit, since we don't model
! ;; the re-order buffer yet and thus we never schedule more than three operations
! ;; at time.  Later we may want to experiment with MD_SCHED macros modeling the
! ;; decoders independently on the functional units.
! 
! ;(define_function_unit "athlon_agu" 3 0
! ;  (and (eq_attr "cpu" "athlon")
! ;       (and (eq_attr "memory" "!none")
! ;            (eq_attr "athlon_fpunits" "none")))
! ;  1 1)
! 
! ;; Model load unit to avoid too long sequences of loads.  We don't need to
! ;; model store queue, since it is hardly going to be bottleneck.
! 
! (define_function_unit "athlon_load" 2 0
!   (and (eq_attr "cpu" "athlon")
!        (eq_attr "memory" "load,both"))
!   1 1)
! 
  
  ;; Compare instructions.
  
--- 322,331 ----
    [(set_attr "length" "128")
     (set_attr "type" "multi")])
  
! (include "pentium.md")
! (include "ppro.md")
! (include "k6.md")
! (include "athlon.md")
  
  ;; Compare instructions.
  
Index: k6.md
===================================================================
RCS file: k6.md
diff -N k6.md
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- k6.md	9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,136 ----
+ ;; AMD K6/K6-2 Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING.  If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ ;;
+ ;; The K6 has a similar architecture to the PPro.  The important difference
+ ;; is that there are only two decoders and they seem to be much slower than
+ ;; the execution units.  So we have to pay much more attention to proper
+ ;; decoding for schedulers.  We share most of scheduler code for PPro in i386.c
+ ;;
+ ;; The fp unit is not pipelined and does one operation per two cycles,
+ ;; including the FXCH.
+ ;;
+ ;; alu	  describes both ALU units (ALU-X and ALU-Y).
+ ;; alux   describes X alu unit
+ ;; fpu    describes FPU unit
+ ;; load   describes load unit.
+ ;; branch describes branch unit.
+ ;; store  describes store unit.  This unit is not modelled completely and only
+ ;;        used to model lea operation.  Otherwise it lies outside of the critical
+ ;;        path.
+ ;;
+ ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
+ 
+ ;; The decoder specification now lives with the PPro description in ppro.md.
+ 
+ ;; Shift instructions and certain arithmetic are issued only to X pipe.
+ (define_function_unit "k6_alux" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "ishift,rotate,alu1,negnot,cld"))
+   1 1)
+ 
+ ;; The QI mode arithmetic is issued to X pipe only.
+ (define_function_unit "k6_alux" 1 0
+   (and (eq_attr "cpu" "k6")
+        (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec")
+ 	    (match_operand:QI 0 "general_operand" "")))
+   1 1)
+ 
+ (define_function_unit "k6_alu" 2 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea"))
+   1 1)
+ 
+ (define_function_unit "k6_alu" 2 0
+   (and (eq_attr "cpu" "k6")
+        (and (eq_attr "type" "imov")
+        	    (eq_attr "memory" "none")))
+   1 1)
+ 
+ (define_function_unit "k6_branch" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "call,callv,ibr"))
+   1 1)
+ 
+ ;; Load unit have two cycle latency, but we take care for it in adjust_cost
+ (define_function_unit "k6_load" 1 0
+   (and (eq_attr "cpu" "k6")
+        (ior (eq_attr "type" "pop")
+ 	    (eq_attr "memory" "load,both")))
+   1 1)
+ 
+ (define_function_unit "k6_load" 1 0
+   (and (eq_attr "cpu" "k6")
+        (and (eq_attr "type" "str")
+ 	    (eq_attr "memory" "load,both")))
+   10 10)
+ 
+ ;; Lea have two instructions, so latency is probably 2
+ (define_function_unit "k6_store" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "lea"))
+   2 1)
+ 
+ (define_function_unit "k6_store" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "str"))
+   10 10)
+ 
+ (define_function_unit "k6_store" 1 0
+   (and (eq_attr "cpu" "k6")
+        (ior (eq_attr "type" "push")
+ 	    (eq_attr "memory" "store,both")))
+   1 1)
+ 
+ (define_function_unit "k6_fpu" 1 1
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "fop,fmov,fcmp,fistp"))
+   2 2)
+ 
+ (define_function_unit "k6_fpu" 1 1
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "fmul"))
+   2 2)
+ 
+ ;; ??? Guess
+ (define_function_unit "k6_fpu" 1 1
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "fdiv,fpspc"))
+   56 56)
+ 
+ (define_function_unit "k6_alu" 2 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "imul"))
+   2 2)
+ 
+ (define_function_unit "k6_alux" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "imul"))
+   2 2)
+ 
+ ;; ??? Guess
+ (define_function_unit "k6_alu" 2 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "idiv"))
+   17 17)
+ 
+ (define_function_unit "k6_alux" 1 0
+   (and (eq_attr "cpu" "k6")
+        (eq_attr "type" "idiv"))
+   17 17)
Index: pentium.md
===================================================================
RCS file: pentium.md
diff -N pentium.md
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- pentium.md	9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,306 ----
+ ;; Pentium Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING.  If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ ;;
+ ;; The Pentium is an in-order core with two integer pipelines.
+ 
+ ;; True for insns that behave like prefixed insns on the Pentium.
+ (define_attr "pent_prefix" "false,true"
+   (if_then_else (ior (eq_attr "prefix_0f" "1")
+   		     (ior (eq_attr "prefix_data16" "1")
+ 			  (eq_attr "prefix_rep" "1")))
+     (const_string "true")
+     (const_string "false")))
+ 
+ ;; Categorize how an instruction slots.
+ 
+ ;; The non-MMX Pentium slots an instruction with prefixes on U pipe only,
+ ;; while MMX Pentium can slot it on either U or V.  Model non-MMX Pentium
+ ;; rules, because it results in noticeably better code on non-MMX Pentium
+ ;; and doesn't hurt much on MMX.  (Prefixed instructions are not very
+ ;; common, so the scheduler usually has a non-prefixed insn to pair).
+ 
+ (define_attr "pent_pair" "uv,pu,pv,np"
+   (cond [(eq_attr "imm_disp" "true")
+ 	   (const_string "np")
+ 	 (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec")
+ 	      (and (eq_attr "type" "pop,push")
+ 		   (eq_attr "memory" "!both")))
+ 	   (if_then_else (eq_attr "pent_prefix" "true")
+ 	     (const_string "pu")
+ 	     (const_string "uv"))
+ 	 (eq_attr "type" "ibr")
+ 	   (const_string "pv")
+ 	 (and (eq_attr "type" "ishift")
+ 	      (match_operand 2 "const_int_operand" ""))
+ 	   (const_string "pu")
+ 	 (and (eq_attr "type" "rotate")
+ 	      (match_operand 2 "const_int_1_operand" ""))
+ 	   (const_string "pu")
+ 	 (and (eq_attr "type" "call")
+ 	      (match_operand 0 "constant_call_address_operand" ""))
+ 	   (const_string "pv")
+ 	 (and (eq_attr "type" "callv")
+ 	      (match_operand 1 "constant_call_address_operand" ""))
+ 	   (const_string "pv")
+ 	]
+ 	(const_string "np")))
+ 
+ (define_automaton "pentium,pentium_fpu")
+ 
+ ;; The Pentium has U and V pipes.  Instructions to both pipes
+ ;; are always issued together, much like on VLIW.
+ ;;
+ ;;                    predecode
+ ;;                   /         \
+ ;;               decodeu     decodev
+ ;;             /    |           |
+ ;;           fpu executeu    executev
+ ;;            |     |           |
+ ;;           fpu  retire     retire
+ ;;            |
+ ;;           fpu
+ ;; We add dummy "port" pipes allocated only first cycle of
+ ;; instruction to specify this behaviour.
+ 
+ (define_cpu_unit "pentium-portu,pentium-portv" "pentium")
+ (define_cpu_unit "pentium-u,pentium-v" "pentium")
+ (absence_set "pentium-portu" "pentium-u,pentium-v")
+ (presence_set "pentium-portv" "pentium-portu")
+ 
+ ;; Floating point instructions can overlap with new issue of integer
+ ;; instructions.  We model only first cycle of FP pipeline, as it is
+ ;; fully pipelined.
+ (define_cpu_unit "pentium-fp" "pentium_fpu")
+ 
+ ;; There is non-pipelined multiplier unit used for complex operations.
+ (define_cpu_unit "pentium-fmul" "pentium_fpu")
+ 
+ ;; Pentium preserves memory ordering, so when load-execute-store
+ ;; instruction is executed together with other instruction loading
+ ;; data, the execution of the other instruction is delayed to very
+ ;; last cycle of first instruction, when data are bypassed.
+ ;; We model this by allocating "memory" unit when store is pending
+ ;; and using conflicting load units together.
+ 
+ (define_cpu_unit "pentium-memory" "pentium")
+ (define_cpu_unit "pentium-load0" "pentium")
+ (define_cpu_unit "pentium-load1" "pentium")
+ (absence_set "pentium-load0,pentium-load1" "pentium-memory")
+ 
+ (define_reservation "pentium-load" "(pentium-load0 | pentium-load1)")
+ (define_reservation "pentium-np" "(pentium-u + pentium-v)")
+ (define_reservation "pentium-uv" "(pentium-u | pentium-v)")
+ (define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)")
+ (define_reservation "pentium-firstu" "(pentium-u + pentium-portu)")
+ (define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)")
+ (define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)")
+ (define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)")
+ (define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)")
+ (define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv)
+ 					   | (pentium-firstv,pentium-v,
+ 					      (pentium-load+pentium-firstv))")
+ (define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu
+ 					   + pentium-memory)")
+ (define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstv
+ 					   + pentium-memory)")
+ (define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv
+ 					    + pentium-memory)
+ 					   | (pentium-firstv,pentium-v,
+ 					      (pentium-load+pentium-firstv))")
+ 
+ ;; Few common long latency instructions
+ (define_insn_reservation "pent_mul" 11
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "imul"))
+   "pentium-np*11")
+ 
+ (define_insn_reservation "pent_str" 12
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "str"))
+   "pentium-np*12")
+ 
+ ;; Integer division and some other long latency instruction block all
+ ;; units, including the FP pipe.  There is no value in modeling the
+ ;; latency of these instructions and not modeling the latency
+ ;; decreases the size of the DFA.
+ (define_insn_reservation "pent_block" 1
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "idiv"))
+   "pentium-np+pentium-fp")
+ 
+ (define_insn_reservation "pent_cld" 2
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "cld"))
+   "pentium-np*2")
+ 
+ ;;  Moves usually have one cycle penalty, but there are exceptions.
+ (define_insn_reservation "pent_fmov" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "type" "fmov")
+ 	    (eq_attr "memory" "none,load")))
+   "(pentium-fp+pentium-np)")
+ 
+ (define_insn_reservation "pent_fpmovxf" 3
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "type" "fmov")
+ 	    (and (eq_attr "memory" "load,store")
+ 		 (eq_attr "mode" "XF"))))
+   "(pentium-fp+pentium-np)*3")
+ 
+ (define_insn_reservation "pent_fpstore" 2
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "type" "fmov")
+ 	    (ior (match_operand 1 "immediate_operand" "")
+ 		 (eq_attr "memory" "store"))))
+   "(pentium-fp+pentium-np)*2")
+ 
+ (define_insn_reservation "pent_imov" 1
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "imov"))
+   "pentium-firstuv")
+ 
+ ;; Push and pop instructions have 1 cycle latency and special
+ ;; hardware bypass allows them to be paired with other push,pop
+ ;; and call instructions.
+ (define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call")
+ (define_insn_reservation "pent_push" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "type" "push")
+ 	    (eq_attr "memory" "store")))
+   "pentium-firstuv")
+ 
+ (define_insn_reservation "pent_pop" 1
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "pop"))
+   "pentium-firstuv")
+ 
+ ;; Call and branch instruction can execute in either pipe, but
+ ;; they are only pairable when in the v pipe.
+ (define_insn_reservation "pent_call" 10
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "call,callv"))
+   "pentium-firstv,pentium-v*9")
+ 
+ (define_insn_reservation "pent_branch" 1
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "ibr"))
+   "pentium-firstv")
+ 
+ ;; Floating point instructions dispatch in the U pipe, but continue
+ ;; in the FP pipeline, allowing other instructions to be executed.
+ (define_insn_reservation "pent_fp" 3
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "fop,fistp"))
+   "(pentium-firstu+pentium-fp),nothing,nothing")
+ 
+ ;; First two cycles of fmul are not pipelined.
+ (define_insn_reservation "pent_fmul" 3
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "fmul"))
+   "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing")
+ 
+ ;; Long latency FP instructions overlap with integer instructions,
+ ;; but only last 2 cycles with FP ones.
+ (define_insn_reservation "pent_fdiv" 39
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "fdiv"))
+   "(pentium-np+pentium-fp+pentium-fmul),
+    (pentium-fp+pentium-fmul)*36,pentium-fmul*2")
+ 
+ (define_insn_reservation "pent_fpspc" 70
+   (and (eq_attr "cpu" "pentium")
+        (eq_attr "type" "fpspc"))
+   "(pentium-np+pentium-fp+pentium-fmul),
+    (pentium-fp+pentium-fmul)*67,pentium-fmul*2")
+ 
+ ;; Integer instructions.  Load/execute/store takes 3 cycles,
+ ;; load/execute 2 cycles and execute only one cycle.
+ (define_insn_reservation "pent_uv_both" 3
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "uv")
+ 	    (eq_attr "memory" "both")))
+   "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv")
+ 
+ (define_insn_reservation "pent_u_both" 3
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pu")
+ 	    (eq_attr "memory" "both")))
+   "pentium-firstuboth,pentium-u+pentium-memory,pentium-u")
+ 
+ (define_insn_reservation "pent_v_both" 3
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pv")
+ 	    (eq_attr "memory" "both")))
+   "pentium-firstvboth,pentium-v+pentium-memory,pentium-v")
+ 
+ (define_insn_reservation "pent_np_both" 3
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "np")
+ 	    (eq_attr "memory" "both")))
+   "pentium-np,pentium-np,pentium-np")
+ 
+ (define_insn_reservation "pent_uv_load" 2
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "uv")
+ 	    (eq_attr "memory" "load")))
+   "pentium-firstuvload,pentium-uv")
+ 
+ (define_insn_reservation "pent_u_load" 2
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pu")
+ 	    (eq_attr "memory" "load")))
+   "pentium-firstuload,pentium-u")
+ 
+ (define_insn_reservation "pent_v_load" 2
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pv")
+ 	    (eq_attr "memory" "load")))
+   "pentium-firstvload,pentium-v")
+ 
+ (define_insn_reservation "pent_np_load" 2
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "np")
+ 	    (eq_attr "memory" "load")))
+   "pentium-np,pentium-np")
+ 
+ (define_insn_reservation "pent_uv" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "uv")
+ 	    (eq_attr "memory" "none")))
+   "pentium-firstuv")
+ 
+ (define_insn_reservation "pent_u" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pu")
+ 	    (eq_attr "memory" "none")))
+   "pentium-firstu")
+ 
+ (define_insn_reservation "pent_v" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "pv")
+ 	    (eq_attr "memory" "none")))
+   "pentium-firstv")
+ 
+ (define_insn_reservation "pent_np" 1
+   (and (eq_attr "cpu" "pentium")
+        (and (eq_attr "pent_pair" "np")
+ 	    (eq_attr "memory" "none")))
+   "pentium-np")
+ 
Index: ppro.md
===================================================================
RCS file: ppro.md
diff -N ppro.md
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- ppro.md	9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,150 ----
+ ;; Pentium Pro/PII Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING.  If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ 
+ ;; Categorize how many uops an ia32 instruction evaluates to:
+ ;;   one --  an instruction with 1 uop can be decoded by any of the
+ ;;           three decoders.
+ ;;   few --  an instruction with 1 to 4 uops can be decoded only by 
+ ;;	     decoder 0.
+ ;;   many -- a complex instruction may take an unspecified number of
+ ;;	     cycles to decode in decoder 0.
+ 
+ (define_attr "ppro_uops" "one,few,many"
+   (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
+ 	   (const_string "many")
+ 	 (eq_attr "type" "icmov,fcmov,cld")
+ 	   (const_string "few")
+ 	 (eq_attr "type" "imov")
+ 	   (if_then_else (eq_attr "memory" "store,both")
+ 	     (const_string "few")
+ 	     (const_string "one"))
+ 	 (eq_attr "memory" "!none")
+ 	   (const_string "few")
+ 	]
+ 	(const_string "one")))
+ 
+ ;;
+ ;; The PPro has an out-of-order core, but the instruction decoders are
+ ;; naturally in-order and asymmetric.  We get best performance by scheduling
+ ;; for the decoders, for in doing so we give the oo execution unit the 
+ ;; most choices.
+ ;;
+ ;; Rough readiness numbers.  Fine tuning happens in i386.c.
+ ;;
+ ;; p0	describes port 0.
+ ;; p01	describes ports 0 and 1 as a pair; alu insns can issue to either.
+ ;; p2	describes port 2 for loads.
+ ;; p34	describes ports 3 and 4 for stores.
+ ;; fpu	describes the fpu accessed via port 0. 
+ ;;	??? It is less than clear if there are separate fadd and fmul units
+ ;;	that could operate in parallel.
+ ;;
+ ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "ishift,rotate,lea,ibr,cld"))
+   1 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "imul"))
+   4 1)
+ 
+ ;; ??? Does the divider lock out the pipe while it works,
+ ;; or is there a disconnected unit?
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "idiv"))
+   17 17)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fop,fsgn,fistp"))
+   3 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fcmov"))
+   2 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fcmp"))
+   1 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fmov"))
+   1 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fmul"))
+   5 1)
+ 
+ (define_function_unit "ppro_p0" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fdiv,fpspc"))
+   56 1)
+ 
+ (define_function_unit "ppro_p01" 2 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "!imov,fmov"))
+   1 1)
+ 
+ (define_function_unit "ppro_p01" 2 0
+   (and (and (eq_attr "cpu" "pentiumpro")
+             (eq_attr "type" "imov,fmov"))
+        (eq_attr "memory" "none"))
+   1 1)
+ 
+ (define_function_unit "ppro_p2" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (ior (eq_attr "type" "pop")
+ 	    (eq_attr "memory" "load,both")))
+   3 1)
+ 
+ (define_function_unit "ppro_p34" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (ior (eq_attr "type" "push")
+ 	    (eq_attr "memory" "store,both")))
+   1 1)
+ 
+ (define_function_unit "fpu" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
+   1 1)
+ 
+ (define_function_unit "fpu" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fmul"))
+   5 2)
+ 
+ (define_function_unit "fpu" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "fdiv,fpspc"))
+   56 56)
+ 
+ ;; imul uses the fpu.  ??? does it have the same throughput as fmul?
+ (define_function_unit "fpu" 1 0
+   (and (eq_attr "cpu" "pentiumpro")
+        (eq_attr "type" "imul"))
+   4 1)




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]