This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
ia32 scheduling
- From: law at redhat dot com
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 09 May 2002 17:44:38 -0600
- Subject: ia32 scheduling
- Reply-to: law at redhat dot com
Per David M's recommendation, this patch moves scheduling information for
the various processors into their own md files.
* athlon.md, k6.md, pentium.md, ppro.md: New files.
* i386.md: Move scheduling information into new files.
Index: athlon.md
===================================================================
RCS file: athlon.md
diff -N athlon.md
*** /dev/null 1 Jan 1970 00:00:00 -0000
--- athlon.md 9 May 2002 23:15:01 -0000
***************
*** 0 ****
--- 1,206 ----
+ ;; AMD Athlon Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING. If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ (define_attr "athlon_decode" "direct,vector"
+ (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld")
+ (const_string "vector")
+ (and (eq_attr "type" "push")
+ (match_operand 1 "memory_operand" ""))
+ (const_string "vector")
+ (and (eq_attr "type" "fmov")
+ (and (eq_attr "memory" "load,store")
+ (eq_attr "mode" "XF")))
+ (const_string "vector")]
+ (const_string "direct")))
+
+ ;; The Athlon does contain three pipelined FP units, three integer units and
+ ;; three address generation units.
+ ;;
+ ;; The predecode logic is determining boundaries of instructions in the 64
+ ;; byte cache line. So the cache line straddling problem of K6 might be issue
+ ;; here as well, but it is not noted in the documentation.
+ ;;
+ ;; Three DirectPath instructions decoders and only one VectorPath decoder
+ ;; is available. They can decode three DirectPath instructions or one VectorPath
+ ;; instruction per cycle.
+ ;; Decoded macro instructions are then passed to 72 entry instruction control
+ ;; unit, that passes
+ ;; it to the specialized integer (18 entry) and fp (36 entry) schedulers.
+ ;;
+ ;; The load/store queue unit is not attached to the schedulers but
+ ;; communicates with all the execution units separately instead.
+
+ (define_function_unit "athlon_vectordec" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_decode" "vector"))
+ 1 1)
+
+ (define_function_unit "athlon_directdec" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_decode" "direct"))
+ 1 1)
+
+ (define_function_unit "athlon_vectordec" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_decode" "direct"))
+ 1 1 [(eq_attr "athlon_decode" "vector")])
+
+ (define_function_unit "athlon_ieu" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push"))
+ 1 1)
+
+ (define_function_unit "athlon_ieu" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "str"))
+ 15 15)
+
+ (define_function_unit "athlon_ieu" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "imul"))
+ 5 0)
+
+ (define_function_unit "athlon_ieu" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "idiv"))
+ 42 0)
+
+ (define_function_unit "athlon_muldiv" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "imul"))
+ 5 0)
+
+ (define_function_unit "athlon_muldiv" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "idiv"))
+ 42 42)
+
+ (define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
+ (cond [(eq_attr "type" "fop,fcmp,fistp")
+ (const_string "add")
+ (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
+ (const_string "mul")
+ (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
+ (const_string "store")
+ (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
+ (const_string "any")
+ (and (eq_attr "type" "fmov")
+ (ior (match_operand:SI 1 "register_operand" "")
+ (match_operand 1 "immediate_operand" "")))
+ (const_string "store")
+ (eq_attr "type" "fmov")
+ (const_string "muladd")]
+ (const_string "none")))
+
+ ;; We use latencies 1 for definitions. This is OK to model collisions
+ ;; in execution units. The real latencies are modeled in the "fp" pipeline.
+
+ ;; fsin, fcos: 96-192
+ ;; fsincos: 107-211
+ ;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "fpspc"))
+ 100 1)
+
+ ;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "fdiv"))
+ 24 1)
+
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "fop,fmul,fistp"))
+ 4 1)
+
+ ;; XFmode loads are slow.
+ ;; XFmode store is slow too (8 cycles), but we don't need to model it, because
+ ;; there are no dependent instructions.
+
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (and (eq_attr "type" "fmov")
+ (and (eq_attr "memory" "load")
+ (eq_attr "mode" "XF"))))
+ 10 1)
+
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "fmov,fsgn"))
+ 2 1)
+
+ ;; fcmp and ftst instructions
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (and (eq_attr "type" "fcmp")
+ (eq_attr "athlon_decode" "direct")))
+ 3 1)
+
+ ;; fcmpi instructions.
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (and (eq_attr "type" "fcmp")
+ (eq_attr "athlon_decode" "vector")))
+ 3 1)
+
+ (define_function_unit "athlon_fp" 3 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "type" "fcmov"))
+ 7 1)
+
+ (define_function_unit "athlon_fp_mul" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_fpunits" "mul"))
+ 1 1)
+
+ (define_function_unit "athlon_fp_add" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_fpunits" "add"))
+ 1 1)
+
+ (define_function_unit "athlon_fp_muladd" 2 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_fpunits" "muladd,mul,add"))
+ 1 1)
+
+ (define_function_unit "athlon_fp_store" 1 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "athlon_fpunits" "store"))
+ 1 1)
+
+ ;; We don't need to model the Address Generation Unit, since we don't model
+ ;; the re-order buffer yet and thus we never schedule more than three operations
+ ;; at time. Later we may want to experiment with MD_SCHED macros modeling the
+ ;; decoders independently on the functional units.
+
+ ;(define_function_unit "athlon_agu" 3 0
+ ; (and (eq_attr "cpu" "athlon")
+ ; (and (eq_attr "memory" "!none")
+ ; (eq_attr "athlon_fpunits" "none")))
+ ; 1 1)
+
+ ;; Model load unit to avoid too long sequences of loads. We don't need to
+ ;; model store queue, since it is hardly going to be bottleneck.
+
+ (define_function_unit "athlon_load" 2 0
+ (and (eq_attr "cpu" "athlon")
+ (eq_attr "memory" "load,both"))
+ 1 1)
+
Index: i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.355
diff -c -3 -p -r1.355 i386.md
*** i386.md 9 May 2002 17:58:09 -0000 1.355
--- i386.md 9 May 2002 23:15:26 -0000
***************
*** 322,1056 ****
[(set_attr "length" "128")
(set_attr "type" "multi")])
! ;; Pentium Scheduling
! ;;
! ;; The Pentium is an in-order core with two integer pipelines.
!
! ;; True for insns that behave like prefixed insns on the Pentium.
! (define_attr "pent_prefix" "false,true"
! (if_then_else (ior (eq_attr "prefix_0f" "1")
! (ior (eq_attr "prefix_data16" "1")
! (eq_attr "prefix_rep" "1")))
! (const_string "true")
! (const_string "false")))
!
! ;; Categorize how an instruction slots.
!
! ;; The non-MMX Pentium slots an instruction with prefixes on U pipe only,
! ;; while MMX Pentium can slot it on either U or V. Model non-MMX Pentium
! ;; rules, because it results in noticeably better code on non-MMX Pentium
! ;; and doesn't hurt much on MMX. (Prefixed instructions are not very
! ;; common, so the scheduler usualy has a non-prefixed insn to pair).
!
! (define_attr "pent_pair" "uv,pu,pv,np"
! (cond [(eq_attr "imm_disp" "true")
! (const_string "np")
! (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec")
! (and (eq_attr "type" "pop,push")
! (eq_attr "memory" "!both")))
! (if_then_else (eq_attr "pent_prefix" "true")
! (const_string "pu")
! (const_string "uv"))
! (eq_attr "type" "ibr")
! (const_string "pv")
! (and (eq_attr "type" "ishift")
! (match_operand 2 "const_int_operand" ""))
! (const_string "pu")
! (and (eq_attr "type" "rotate")
! (match_operand 2 "const_int_1_operand" ""))
! (const_string "pu")
! (and (eq_attr "type" "call")
! (match_operand 0 "constant_call_address_operand" ""))
! (const_string "pv")
! (and (eq_attr "type" "callv")
! (match_operand 1 "constant_call_address_operand" ""))
! (const_string "pv")
! ]
! (const_string "np")))
!
! (define_automaton "pentium,pentium_fpu")
!
! ;; Pentium do have U and V pipes. Instruction to both pipes
! ;; are alwyas issued together, much like on VLIW.
! ;;
! ;; predecode
! ;; / \
! ;; decodeu decodev
! ;; / | |
! ;; fpu executeu executev
! ;; | | |
! ;; fpu retire retire
! ;; |
! ;; fpu
! ;; We add dummy "port" pipes allocated only first cycle of
! ;; instruction to specify this behaviour.
!
! (define_cpu_unit "pentium-portu,pentium-portv" "pentium")
! (define_cpu_unit "pentium-u,pentium-v" "pentium")
! (absence_set "pentium-portu" "pentium-u,pentium-v")
! (presence_set "pentium-portv" "pentium-portu")
!
! ;; Floating point instructions can overlap with new issue of integer
! ;; instructions. We model only first cycle of FP pipeline, as it is
! ;; fully pipelined.
! (define_cpu_unit "pentium-fp" "pentium_fpu")
!
! ;; There is non-pipelined multiplier unit used for complex operations.
! (define_cpu_unit "pentium-fmul" "pentium_fpu")
!
! ;; Pentium preserves memory ordering, so when load-execute-store
! ;; instruction is executed together with other instruction loading
! ;; data, the execution of the other instruction is delayed to very
! ;; last cycle of first instruction, when data are bypassed.
! ;; We model this by allocating "memory" unit when store is pending
! ;; and using conflicting load units together.
!
! (define_cpu_unit "pentium-memory" "pentium")
! (define_cpu_unit "pentium-load0" "pentium")
! (define_cpu_unit "pentium-load1" "pentium")
! (absence_set "pentium-load0,pentium-load1" "pentium-memory")
!
! (define_reservation "pentium-load" "(pentium-load0 | pentium-load1)")
! (define_reservation "pentium-np" "(pentium-u + pentium-v)")
! (define_reservation "pentium-uv" "(pentium-u | pentium-v)")
! (define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)")
! (define_reservation "pentium-firstu" "(pentium-u + pentium-portu)")
! (define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)")
! (define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)")
! (define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)")
! (define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)")
! (define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv)
! | (pentium-firstv,pentium-v,
! (pentium-load+pentium-firstv))")
! (define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu
! + pentium-memory)")
! (define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstu
! + pentium-memory)")
! (define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv
! + pentium-memory)
! | (pentium-firstv,pentium-v,
! (pentium-load+pentium-firstv))")
!
! ;; Few common long latency instructions
! (define_insn_reservation "pent_mul" 11
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "imul"))
! "pentium-np*11")
!
! (define_insn_reservation "pent_str" 12
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "str"))
! "pentium-np*12")
!
! ;; Integer division and some other long latency instruction block all
! ;; units, including the FP pipe. There is no value in modeling the
! ;; latency of these instructions and not modeling the latency
! ;; decreases the size of the DFA.
! (define_insn_reservation "pent_block" 1
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "idiv"))
! "pentium-np+pentium-fp")
!
! (define_insn_reservation "pent_cld" 2
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "cld"))
! "pentium-np*2")
!
! ;; Moves usually have one cycle penalty, but there are exceptions.
! (define_insn_reservation "pent_fmov" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "type" "fmov")
! (eq_attr "memory" "none,load")))
! "(pentium-fp+pentium-np)")
!
! (define_insn_reservation "pent_fpmovxf" 3
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "type" "fmov")
! (and (eq_attr "memory" "load,store")
! (eq_attr "mode" "XF"))))
! "(pentium-fp+pentium-np)*3")
!
! (define_insn_reservation "pent_fpstore" 2
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "type" "fmov")
! (ior (match_operand 1 "immediate_operand" "")
! (eq_attr "memory" "store"))))
! "(pentium-fp+pentium-np)*2")
!
! (define_insn_reservation "pent_imov" 1
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "imov"))
! "pentium-firstuv")
!
! ;; Push and pop instructions have 1 cycle latency and special
! ;; hardware bypass allows them to be paired with other push,pop
! ;; and call instructions.
! (define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call")
! (define_insn_reservation "pent_push" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "type" "push")
! (eq_attr "memory" "store")))
! "pentium-firstuv")
!
! (define_insn_reservation "pent_pop" 1
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "pop"))
! "pentium-firstuv")
!
! ;; Call and branch instruction can execute in either pipe, but
! ;; they are only pairable when in the v pipe.
! (define_insn_reservation "pent_call" 10
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "call,callv"))
! "pentium-firstv,pentium-v*9")
!
! (define_insn_reservation "pent_branch" 1
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "ibr"))
! "pentium-firstv")
!
! ;; Floating point instruction dispatch in U pipe, but continue
! ;; in FP pipeline allowing other isntructions to be executed.
! (define_insn_reservation "pent_fp" 3
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "fop,fistp"))
! "(pentium-firstu+pentium-fp),nothing,nothing")
!
! ;; First two cycles of fmul are not pipelined.
! (define_insn_reservation "pent_fmul" 3
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "fmul"))
! "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing")
!
! ;; Long latency FP instructions overlap with integer instructions,
! ;; but only last 2 cycles with FP ones.
! (define_insn_reservation "pent_fdiv" 39
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "fdiv"))
! "(pentium-np+pentium-fp+pentium-fmul),
! (pentium-fp+pentium-fmul)*36,pentium-fmul*2")
!
! (define_insn_reservation "pent_fpspc" 70
! (and (eq_attr "cpu" "pentium")
! (eq_attr "type" "fpspc"))
! "(pentium-np+pentium-fp+pentium-fmul),
! (pentium-fp+pentium-fmul)*67,pentium-fmul*2")
!
! ;; Integer instructions. Load/execute/store takes 3 cycles,
! ;; load/execute 2 cycles and execute only one cycle.
! (define_insn_reservation "pent_uv_both" 3
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "uv")
! (eq_attr "memory" "both")))
! "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv")
!
! (define_insn_reservation "pent_u_both" 3
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pu")
! (eq_attr "memory" "both")))
! "pentium-firstuboth,pentium-u+pentium-memory,pentium-u")
!
! (define_insn_reservation "pent_v_both" 3
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pv")
! (eq_attr "memory" "both")))
! "pentium-firstvboth,pentium-v+pentium-memory,pentium-v")
!
! (define_insn_reservation "pent_np_both" 3
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "np")
! (eq_attr "memory" "both")))
! "pentium-np,pentium-np,pentium-np")
!
! (define_insn_reservation "pent_uv_load" 2
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "uv")
! (eq_attr "memory" "load")))
! "pentium-firstuvload,pentium-uv")
!
! (define_insn_reservation "pent_u_load" 2
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pu")
! (eq_attr "memory" "load")))
! "pentium-firstuload,pentium-u")
!
! (define_insn_reservation "pent_v_load" 2
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pv")
! (eq_attr "memory" "load")))
! "pentium-firstvload,pentium-v")
!
! (define_insn_reservation "pent_np_load" 2
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "np")
! (eq_attr "memory" "load")))
! "pentium-np,pentium-np")
!
! (define_insn_reservation "pent_uv" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "uv")
! (eq_attr "memory" "none")))
! "pentium-firstuv")
!
! (define_insn_reservation "pent_u" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pu")
! (eq_attr "memory" "none")))
! "pentium-firstu")
!
! (define_insn_reservation "pent_v" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "pv")
! (eq_attr "memory" "none")))
! "pentium-firstv")
!
! (define_insn_reservation "pent_np" 1
! (and (eq_attr "cpu" "pentium")
! (and (eq_attr "pent_pair" "np")
! (eq_attr "memory" "none")))
! "pentium-np")
!
!
! ;; Pentium Pro/PII Scheduling
! ;;
! ;; The PPro has an out-of-order core, but the instruction decoders are
! ;; naturally in-order and asymmetric. We get best performance by scheduling
! ;; for the decoders, for in doing so we give the oo execution unit the
! ;; most choices.
!
! ;; Categorize how many uops an ia32 instruction evaluates to:
! ;; one -- an instruction with 1 uop can be decoded by any of the
! ;; three decoders.
! ;; few -- an instruction with 1 to 4 uops can be decoded only by
! ;; decoder 0.
! ;; many -- a complex instruction may take an unspecified number of
! ;; cycles to decode in decoder 0.
!
! (define_attr "ppro_uops" "one,few,many"
! (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
! (const_string "many")
! (eq_attr "type" "icmov,fcmov,str,cld")
! (const_string "few")
! (eq_attr "type" "imov")
! (if_then_else (eq_attr "memory" "store,both")
! (const_string "few")
! (const_string "one"))
! (eq_attr "memory" "!none")
! (const_string "few")
! ]
! (const_string "one")))
!
! ;; Rough readiness numbers. Fine tuning happens in i386.c.
! ;;
! ;; p0 describes port 0.
! ;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either.
! ;; p2 describes port 2 for loads.
! ;; p34 describes ports 3 and 4 for stores.
! ;; fpu describes the fpu accessed via port 0.
! ;; ??? It is less than clear if there are separate fadd and fmul units
! ;; that could operate in parallel.
! ;;
! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "ishift,rotate,lea,ibr,cld"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imul"))
! 4 1)
!
! ;; ??? Does the divider lock out the pipe while it works,
! ;; or is there a disconnected unit?
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "idiv"))
! 17 17)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fop,fsgn,fistp"))
! 3 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fcmov"))
! 2 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fcmp"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmov"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmul"))
! 5 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fdiv,fpspc"))
! 56 1)
!
! (define_function_unit "ppro_p01" 2 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "!imov,fmov"))
! 1 1)
!
! (define_function_unit "ppro_p01" 2 0
! (and (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imov,fmov"))
! (eq_attr "memory" "none"))
! 1 1)
!
! (define_function_unit "ppro_p2" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (ior (eq_attr "type" "pop")
! (eq_attr "memory" "load,both")))
! 3 1)
!
! (define_function_unit "ppro_p34" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (ior (eq_attr "type" "push")
! (eq_attr "memory" "store,both")))
! 1 1)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
! 1 1)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmul"))
! 5 2)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fdiv,fpspc"))
! 56 56)
!
! ;; imul uses the fpu. ??? does it have the same throughput as fmul?
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imul"))
! 4 1)
!
! ;; AMD K6/K6-2 Scheduling
! ;;
! ;; The K6 has similar architecture to PPro. Important difference is, that
! ;; there are only two decoders and they seems to be much slower than execution
! ;; units. So we have to pay much more attention to proper decoding for
! ;; schedulers. We share most of scheduler code for PPro in i386.c
! ;;
! ;; The fp unit is not pipelined and do one operation per two cycles including
! ;; the FXCH.
! ;;
! ;; alu describes both ALU units (ALU-X and ALU-Y).
! ;; alux describes X alu unit
! ;; fpu describes FPU unit
! ;; load describes load unit.
! ;; branch describes branch unit.
! ;; store decsribes store unit. This unit is not modelled completely and only
! ;; used to model lea operation. Otherwise it lie outside of the critical
! ;; path.
! ;;
! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
!
! ;; The decoder specification is in the PPro section above!
!
! ;; Shift instructions and certain arithmetic are issued only to X pipe.
! (define_function_unit "k6_alux" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "ishift,rotate,alu1,negnot,cld"))
! 1 1)
!
! ;; The QI mode arithmetic is issued to X pipe only.
! (define_function_unit "k6_alux" 1 0
! (and (eq_attr "cpu" "k6")
! (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec")
! (match_operand:QI 0 "general_operand" "")))
! 1 1)
!
! (define_function_unit "k6_alu" 2 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea"))
! 1 1)
!
! (define_function_unit "k6_alu" 2 0
! (and (eq_attr "cpu" "k6")
! (and (eq_attr "type" "imov")
! (eq_attr "memory" "none")))
! 1 1)
!
! (define_function_unit "k6_branch" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "call,callv,ibr"))
! 1 1)
!
! ;; Load unit have two cycle latency, but we take care for it in adjust_cost
! (define_function_unit "k6_load" 1 0
! (and (eq_attr "cpu" "k6")
! (ior (eq_attr "type" "pop")
! (eq_attr "memory" "load,both")))
! 1 1)
!
! (define_function_unit "k6_load" 1 0
! (and (eq_attr "cpu" "k6")
! (and (eq_attr "type" "str")
! (eq_attr "memory" "load,both")))
! 10 10)
!
! ;; Lea have two instructions, so latency is probably 2
! (define_function_unit "k6_store" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "lea"))
! 2 1)
!
! (define_function_unit "k6_store" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "str"))
! 10 10)
!
! (define_function_unit "k6_store" 1 0
! (and (eq_attr "cpu" "k6")
! (ior (eq_attr "type" "push")
! (eq_attr "memory" "store,both")))
! 1 1)
!
! (define_function_unit "k6_fpu" 1 1
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "fop,fmov,fcmp,fistp"))
! 2 2)
!
! (define_function_unit "k6_fpu" 1 1
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "fmul"))
! 2 2)
!
! ;; ??? Guess
! (define_function_unit "k6_fpu" 1 1
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "fdiv,fpspc"))
! 56 56)
!
! (define_function_unit "k6_alu" 2 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "imul"))
! 2 2)
!
! (define_function_unit "k6_alux" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "imul"))
! 2 2)
!
! ;; ??? Guess
! (define_function_unit "k6_alu" 2 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "idiv"))
! 17 17)
!
! (define_function_unit "k6_alux" 1 0
! (and (eq_attr "cpu" "k6")
! (eq_attr "type" "idiv"))
! 17 17)
!
! ;; AMD Athlon Scheduling
! ;;
! ;; The Athlon does contain three pipelined FP units, three integer units and
! ;; three address generation units.
! ;;
! ;; The predecode logic is determining boundaries of instructions in the 64
! ;; byte cache line. So the cache line straddling problem of K6 might be issue
! ;; here as well, but it is not noted in the documentation.
! ;;
! ;; Three DirectPath instructions decoders and only one VectorPath decoder
! ;; is available. They can decode three DirectPath instructions or one VectorPath
! ;; instruction per cycle.
! ;; Decoded macro instructions are then passed to 72 entry instruction control
! ;; unit, that passes
! ;; it to the specialized integer (18 entry) and fp (36 entry) schedulers.
! ;;
! ;; The load/store queue unit is not attached to the schedulers but
! ;; communicates with all the execution units separately instead.
!
! (define_attr "athlon_decode" "direct,vector"
! (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
! (const_string "vector")
! (and (eq_attr "type" "push")
! (match_operand 1 "memory_operand" ""))
! (const_string "vector")
! (and (eq_attr "type" "fmov")
! (and (eq_attr "memory" "load,store")
! (eq_attr "mode" "XF")))
! (const_string "vector")]
! (const_string "direct")))
!
! (define_function_unit "athlon_vectordec" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_decode" "vector"))
! 1 1)
!
! (define_function_unit "athlon_directdec" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_decode" "direct"))
! 1 1)
!
! (define_function_unit "athlon_vectordec" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_decode" "direct"))
! 1 1 [(eq_attr "athlon_decode" "vector")])
!
! (define_function_unit "athlon_ieu" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
! 1 1)
!
! (define_function_unit "athlon_ieu" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "str"))
! 15 15)
!
! (define_function_unit "athlon_ieu" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "imul"))
! 5 0)
!
! (define_function_unit "athlon_ieu" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "idiv"))
! 42 0)
!
! (define_function_unit "athlon_muldiv" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "imul"))
! 5 0)
!
! (define_function_unit "athlon_muldiv" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "idiv"))
! 42 42)
!
! (define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
! (cond [(eq_attr "type" "fop,fcmp,fistp")
! (const_string "add")
! (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
! (const_string "mul")
! (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
! (const_string "store")
! (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
! (const_string "any")
! (and (eq_attr "type" "fmov")
! (ior (match_operand:SI 1 "register_operand" "")
! (match_operand 1 "immediate_operand" "")))
! (const_string "store")
! (eq_attr "type" "fmov")
! (const_string "muladd")]
! (const_string "none")))
!
! ;; We use latencies 1 for definitions. This is OK to model colisions
! ;; in execution units. The real latencies are modeled in the "fp" pipeline.
!
! ;; fsin, fcos: 96-192
! ;; fsincos: 107-211
! ;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "fpspc"))
! 100 1)
!
! ;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "fdiv"))
! 24 1)
!
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "fop,fmul,fistp"))
! 4 1)
!
! ;; XFmode loads are slow.
! ;; XFmode store is slow too (8 cycles), but we don't need to model it, because
! ;; there are no dependent instructions.
!
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (and (eq_attr "type" "fmov")
! (and (eq_attr "memory" "load")
! (eq_attr "mode" "XF"))))
! 10 1)
!
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "fmov,fsgn"))
! 2 1)
!
! ;; fcmp and ftst instructions
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (and (eq_attr "type" "fcmp")
! (eq_attr "athlon_decode" "direct")))
! 3 1)
!
! ;; fcmpi instructions.
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (and (eq_attr "type" "fcmp")
! (eq_attr "athlon_decode" "vector")))
! 3 1)
!
! (define_function_unit "athlon_fp" 3 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "type" "fcmov"))
! 7 1)
!
! (define_function_unit "athlon_fp_mul" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_fpunits" "mul"))
! 1 1)
!
! (define_function_unit "athlon_fp_add" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_fpunits" "add"))
! 1 1)
!
! (define_function_unit "athlon_fp_muladd" 2 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_fpunits" "muladd,mul,add"))
! 1 1)
!
! (define_function_unit "athlon_fp_store" 1 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "athlon_fpunits" "store"))
! 1 1)
!
! ;; We don't need to model the Address Generation Unit, since we don't model
! ;; the re-order buffer yet and thus we never schedule more than three operations
! ;; at time. Later we may want to experiment with MD_SCHED macros modeling the
! ;; decoders independently on the functional units.
!
! ;(define_function_unit "athlon_agu" 3 0
! ; (and (eq_attr "cpu" "athlon")
! ; (and (eq_attr "memory" "!none")
! ; (eq_attr "athlon_fpunits" "none")))
! ; 1 1)
!
! ;; Model load unit to avoid too long sequences of loads. We don't need to
! ;; model store queue, since it is hardly going to be bottleneck.
!
! (define_function_unit "athlon_load" 2 0
! (and (eq_attr "cpu" "athlon")
! (eq_attr "memory" "load,both"))
! 1 1)
!
;; Compare instructions.
--- 322,331 ----
[(set_attr "length" "128")
(set_attr "type" "multi")])
! (include "pentium.md")
! (include "ppro.md")
! (include "k6.md")
! (include "athlon.md")
;; Compare instructions.
Index: k6.md
===================================================================
RCS file: k6.md
diff -N k6.md
*** /dev/null 1 Jan 1970 00:00:00 -0000
--- k6.md 9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,136 ----
+ ;; AMD K6/K6-2 Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING. If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ ;;
+ ;; The K6 has a similar architecture to the PPro. The important difference is
+ ;; that there are only two decoders and they seem to be much slower than the
+ ;; execution units, so we have to pay much more attention to proper decoding
+ ;; in the scheduler. We share most of the scheduler code for PPro in i386.c.
+ ;;
+ ;; The fp unit is not pipelined and does one operation per two cycles,
+ ;; including the FXCH.
+ ;;
+ ;; alu describes both ALU units (ALU-X and ALU-Y).
+ ;; alux describes X alu unit
+ ;; fpu describes FPU unit
+ ;; load describes load unit.
+ ;; branch describes branch unit.
+ ;; store describes store unit. This unit is not modelled completely and is
+ ;; only used to model the lea operation. Otherwise it lies outside of the
+ ;; critical path.
+ ;;
+ ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
+
+ ;; The decoder specification is shared with the PPro; see ppro_uops in ppro.md.
+
+ ;; Shift instructions and certain arithmetic are issued only to X pipe.
+ (define_function_unit "k6_alux" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "ishift,rotate,alu1,negnot,cld"))
+ 1 1)
+
+ ;; The QI mode arithmetic is issued to X pipe only.
+ (define_function_unit "k6_alux" 1 0
+ (and (eq_attr "cpu" "k6")
+ (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec")
+ (match_operand:QI 0 "general_operand" "")))
+ 1 1)
+
+ (define_function_unit "k6_alu" 2 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea"))
+ 1 1)
+
+ (define_function_unit "k6_alu" 2 0
+ (and (eq_attr "cpu" "k6")
+ (and (eq_attr "type" "imov")
+ (eq_attr "memory" "none")))
+ 1 1)
+
+ (define_function_unit "k6_branch" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "call,callv,ibr"))
+ 1 1)
+
+ ;; The load unit has a two-cycle latency, but we take care of it in adjust_cost.
+ (define_function_unit "k6_load" 1 0
+ (and (eq_attr "cpu" "k6")
+ (ior (eq_attr "type" "pop")
+ (eq_attr "memory" "load,both")))
+ 1 1)
+
+ (define_function_unit "k6_load" 1 0
+ (and (eq_attr "cpu" "k6")
+ (and (eq_attr "type" "str")
+ (eq_attr "memory" "load,both")))
+ 10 10)
+
+ ;; Lea has two instructions, so the latency is probably 2.
+ (define_function_unit "k6_store" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "lea"))
+ 2 1)
+
+ (define_function_unit "k6_store" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "str"))
+ 10 10)
+
+ (define_function_unit "k6_store" 1 0
+ (and (eq_attr "cpu" "k6")
+ (ior (eq_attr "type" "push")
+ (eq_attr "memory" "store,both")))
+ 1 1)
+
+ (define_function_unit "k6_fpu" 1 1
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "fop,fmov,fcmp,fistp"))
+ 2 2)
+
+ (define_function_unit "k6_fpu" 1 1
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "fmul"))
+ 2 2)
+
+ ;; ??? Guess
+ (define_function_unit "k6_fpu" 1 1
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "fdiv,fpspc"))
+ 56 56)
+
+ (define_function_unit "k6_alu" 2 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "imul"))
+ 2 2)
+
+ (define_function_unit "k6_alux" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "imul"))
+ 2 2)
+
+ ;; ??? Guess
+ (define_function_unit "k6_alu" 2 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "idiv"))
+ 17 17)
+
+ (define_function_unit "k6_alux" 1 0
+ (and (eq_attr "cpu" "k6")
+ (eq_attr "type" "idiv"))
+ 17 17)
Index: pentium.md
===================================================================
RCS file: pentium.md
diff -N pentium.md
*** /dev/null 1 Jan 1970 00:00:00 -0000
--- pentium.md 9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,306 ----
+ ;; Pentium Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING. If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA. */
+ ;;
+ ;; The Pentium is an in-order core with two integer pipelines.
+
+ ;; True for insns that behave like prefixed insns on the Pentium.
+ (define_attr "pent_prefix" "false,true"
+ (if_then_else (ior (eq_attr "prefix_0f" "1")
+ (ior (eq_attr "prefix_data16" "1")
+ (eq_attr "prefix_rep" "1")))
+ (const_string "true")
+ (const_string "false")))
+
+ ;; Categorize how an instruction slots.
+
+ ;; The non-MMX Pentium slots an instruction with prefixes on U pipe only,
+ ;; while MMX Pentium can slot it on either U or V. Model non-MMX Pentium
+ ;; rules, because it results in noticeably better code on non-MMX Pentium
+ ;; and doesn't hurt much on MMX. (Prefixed instructions are not very
+ ;; common, so the scheduler usually has a non-prefixed insn to pair).
+
+ (define_attr "pent_pair" "uv,pu,pv,np"
+ (cond [(eq_attr "imm_disp" "true")
+ (const_string "np")
+ (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec")
+ (and (eq_attr "type" "pop,push")
+ (eq_attr "memory" "!both")))
+ (if_then_else (eq_attr "pent_prefix" "true")
+ (const_string "pu")
+ (const_string "uv"))
+ (eq_attr "type" "ibr")
+ (const_string "pv")
+ (and (eq_attr "type" "ishift")
+ (match_operand 2 "const_int_operand" ""))
+ (const_string "pu")
+ (and (eq_attr "type" "rotate")
+ (match_operand 2 "const_int_1_operand" ""))
+ (const_string "pu")
+ (and (eq_attr "type" "call")
+ (match_operand 0 "constant_call_address_operand" ""))
+ (const_string "pv")
+ (and (eq_attr "type" "callv")
+ (match_operand 1 "constant_call_address_operand" ""))
+ (const_string "pv")
+ ]
+ (const_string "np")))
+
+ (define_automaton "pentium,pentium_fpu")
+
+ ;; The Pentium has U and V pipes. Instructions for both pipes
+ ;; are always issued together, much like on a VLIW.
+ ;;
+ ;; predecode
+ ;; / \
+ ;; decodeu decodev
+ ;; / | |
+ ;; fpu executeu executev
+ ;; | | |
+ ;; fpu retire retire
+ ;; |
+ ;; fpu
+ ;; We add dummy "port" pipes, allocated only in the first cycle of an
+ ;; instruction, to specify this behaviour.
+
+ (define_cpu_unit "pentium-portu,pentium-portv" "pentium")
+ (define_cpu_unit "pentium-u,pentium-v" "pentium")
+ (absence_set "pentium-portu" "pentium-u,pentium-v")
+ (presence_set "pentium-portv" "pentium-portu")
+
+ ;; Floating point instructions can overlap with new issue of integer
+ ;; instructions. We model only first cycle of FP pipeline, as it is
+ ;; fully pipelined.
+ (define_cpu_unit "pentium-fp" "pentium_fpu")
+
+ ;; There is non-pipelined multiplier unit used for complex operations.
+ (define_cpu_unit "pentium-fmul" "pentium_fpu")
+
+ ;; Pentium preserves memory ordering, so when load-execute-store
+ ;; instruction is executed together with other instruction loading
+ ;; data, the execution of the other instruction is delayed to very
+ ;; last cycle of first instruction, when data are bypassed.
+ ;; We model this by allocating "memory" unit when store is pending
+ ;; and using conflicting load units together.
+
+ (define_cpu_unit "pentium-memory" "pentium")
+ (define_cpu_unit "pentium-load0" "pentium")
+ (define_cpu_unit "pentium-load1" "pentium")
+ (absence_set "pentium-load0,pentium-load1" "pentium-memory")
+
+ (define_reservation "pentium-load" "(pentium-load0 | pentium-load1)")
+ (define_reservation "pentium-np" "(pentium-u + pentium-v)")
+ (define_reservation "pentium-uv" "(pentium-u | pentium-v)")
+ (define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)")
+ (define_reservation "pentium-firstu" "(pentium-u + pentium-portu)")
+ (define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)")
+ (define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)")
+ (define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)")
+ (define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)")
+ (define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv)
+ | (pentium-firstv,pentium-v,
+ (pentium-load+pentium-firstv))")
+ (define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu
+ + pentium-memory)")
+ (define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstv
+ + pentium-memory)")
+ (define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv
+ + pentium-memory)
+ | (pentium-firstv,pentium-v,
+ (pentium-load+pentium-firstv))")
+
+ ;; Few common long latency instructions
+ (define_insn_reservation "pent_mul" 11
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "imul"))
+ "pentium-np*11")
+
+ (define_insn_reservation "pent_str" 12
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "str"))
+ "pentium-np*12")
+
+ ;; Integer division and some other long latency instruction block all
+ ;; units, including the FP pipe. There is no value in modeling the
+ ;; latency of these instructions and not modeling the latency
+ ;; decreases the size of the DFA.
+ (define_insn_reservation "pent_block" 1
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "idiv"))
+ "pentium-np+pentium-fp")
+
+ (define_insn_reservation "pent_cld" 2
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "cld"))
+ "pentium-np*2")
+
+ ;; Moves usually have one cycle penalty, but there are exceptions.
+ (define_insn_reservation "pent_fmov" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "type" "fmov")
+ (eq_attr "memory" "none,load")))
+ "(pentium-fp+pentium-np)")
+
+ (define_insn_reservation "pent_fpmovxf" 3
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "type" "fmov")
+ (and (eq_attr "memory" "load,store")
+ (eq_attr "mode" "XF"))))
+ "(pentium-fp+pentium-np)*3")
+
+ (define_insn_reservation "pent_fpstore" 2
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "type" "fmov")
+ (ior (match_operand 1 "immediate_operand" "")
+ (eq_attr "memory" "store"))))
+ "(pentium-fp+pentium-np)*2")
+
+ (define_insn_reservation "pent_imov" 1
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "imov"))
+ "pentium-firstuv")
+
+ ;; Push and pop instructions have 1 cycle latency and special
+ ;; hardware bypass allows them to be paired with other push,pop
+ ;; and call instructions.
+ (define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call")
+ (define_insn_reservation "pent_push" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "type" "push")
+ (eq_attr "memory" "store")))
+ "pentium-firstuv")
+
+ (define_insn_reservation "pent_pop" 1
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "pop"))
+ "pentium-firstuv")
+
+ ;; Call and branch instruction can execute in either pipe, but
+ ;; they are only pairable when in the v pipe.
+ (define_insn_reservation "pent_call" 10
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "call,callv"))
+ "pentium-firstv,pentium-v*9")
+
+ (define_insn_reservation "pent_branch" 1
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "ibr"))
+ "pentium-firstv")
+
+ ;; Floating point instructions dispatch in the U pipe, but continue
+ ;; in the FP pipeline, allowing other instructions to be executed.
+ (define_insn_reservation "pent_fp" 3
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "fop,fistp"))
+ "(pentium-firstu+pentium-fp),nothing,nothing")
+
+ ;; First two cycles of fmul are not pipelined.
+ (define_insn_reservation "pent_fmul" 3
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "fmul"))
+ "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing")
+
+ ;; Long latency FP instructions overlap with integer instructions,
+ ;; but only last 2 cycles with FP ones.
+ (define_insn_reservation "pent_fdiv" 39
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "fdiv"))
+ "(pentium-np+pentium-fp+pentium-fmul),
+ (pentium-fp+pentium-fmul)*36,pentium-fmul*2")
+
+ (define_insn_reservation "pent_fpspc" 70
+ (and (eq_attr "cpu" "pentium")
+ (eq_attr "type" "fpspc"))
+ "(pentium-np+pentium-fp+pentium-fmul),
+ (pentium-fp+pentium-fmul)*67,pentium-fmul*2")
+
+ ;; Integer instructions. Load/execute/store takes 3 cycles,
+ ;; load/execute 2 cycles and execute only one cycle.
+ (define_insn_reservation "pent_uv_both" 3
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "uv")
+ (eq_attr "memory" "both")))
+ "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv")
+
+ (define_insn_reservation "pent_u_both" 3
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pu")
+ (eq_attr "memory" "both")))
+ "pentium-firstuboth,pentium-u+pentium-memory,pentium-u")
+
+ (define_insn_reservation "pent_v_both" 3
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pv")
+ (eq_attr "memory" "both")))
+ "pentium-firstvboth,pentium-v+pentium-memory,pentium-v")
+
+ (define_insn_reservation "pent_np_both" 3
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "np")
+ (eq_attr "memory" "both")))
+ "pentium-np,pentium-np,pentium-np")
+
+ (define_insn_reservation "pent_uv_load" 2
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "uv")
+ (eq_attr "memory" "load")))
+ "pentium-firstuvload,pentium-uv")
+
+ (define_insn_reservation "pent_u_load" 2
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pu")
+ (eq_attr "memory" "load")))
+ "pentium-firstuload,pentium-u")
+
+ (define_insn_reservation "pent_v_load" 2
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pv")
+ (eq_attr "memory" "load")))
+ "pentium-firstvload,pentium-v")
+
+ (define_insn_reservation "pent_np_load" 2
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "np")
+ (eq_attr "memory" "load")))
+ "pentium-np,pentium-np")
+
+ (define_insn_reservation "pent_uv" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "uv")
+ (eq_attr "memory" "none")))
+ "pentium-firstuv")
+
+ (define_insn_reservation "pent_u" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pu")
+ (eq_attr "memory" "none")))
+ "pentium-firstu")
+
+ (define_insn_reservation "pent_v" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "pv")
+ (eq_attr "memory" "none")))
+ "pentium-firstv")
+
+ (define_insn_reservation "pent_np" 1
+ (and (eq_attr "cpu" "pentium")
+ (and (eq_attr "pent_pair" "np")
+ (eq_attr "memory" "none")))
+ "pentium-np")
+
Index: ppro.md
===================================================================
RCS file: ppro.md
diff -N ppro.md
*** /dev/null 1 Jan 1970 00:00:00 -0000
--- ppro.md 9 May 2002 23:15:26 -0000
***************
*** 0 ****
--- 1,150 ----
+ ;; Pentium Pro/PII Scheduling
+ ;; Copyright (C) 2002 Free Software Foundation, Inc.
+ ;;
+ ;; This file is part of GNU CC.
+ ;;
+ ;; GNU CC is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ ;;
+ ;; GNU CC is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ;; GNU General Public License for more details.
+ ;;
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU CC; see the file COPYING. If not, write to
+ ;; the Free Software Foundation, 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA. */
+
+ ;; Categorize how many uops an ia32 instruction evaluates to:
+ ;; one -- an instruction with 1 uop can be decoded by any of the
+ ;; three decoders.
+ ;; few -- an instruction with 1 to 4 uops can be decoded only by
+ ;; decoder 0.
+ ;; many -- a complex instruction may take an unspecified number of
+ ;; cycles to decode in decoder 0.
+
+ (define_attr "ppro_uops" "one,few,many"
+ (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
+ (const_string "many")
+ (eq_attr "type" "icmov,fcmov,cld")
+ (const_string "few")
+ (eq_attr "type" "imov")
+ (if_then_else (eq_attr "memory" "store,both")
+ (const_string "few")
+ (const_string "one"))
+ (eq_attr "memory" "!none")
+ (const_string "few")
+ ]
+ (const_string "one")))
+
+ ;;
+ ;; The PPro has an out-of-order core, but the instruction decoders are
+ ;; naturally in-order and asymmetric. We get best performance by scheduling
+ ;; for the decoders, for in doing so we give the oo execution unit the
+ ;; most choices.
+ ;;
+ ;; Rough readiness numbers. Fine tuning happens in i386.c.
+ ;;
+ ;; p0 describes port 0.
+ ;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either.
+ ;; p2 describes port 2 for loads.
+ ;; p34 describes ports 3 and 4 for stores.
+ ;; fpu describes the fpu accessed via port 0.
+ ;; ??? It is less than clear if there are separate fadd and fmul units
+ ;; that could operate in parallel.
+ ;;
+ ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "ishift,rotate,lea,ibr,cld"))
+ 1 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "imul"))
+ 4 1)
+
+ ;; ??? Does the divider lock out the pipe while it works,
+ ;; or is there a disconnected unit?
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "idiv"))
+ 17 17)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fop,fsgn,fistp"))
+ 3 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fcmov"))
+ 2 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fcmp"))
+ 1 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fmov"))
+ 1 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fmul"))
+ 5 1)
+
+ (define_function_unit "ppro_p0" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fdiv,fpspc"))
+ 56 1)
+
+ (define_function_unit "ppro_p01" 2 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "!imov,fmov"))
+ 1 1)
+
+ (define_function_unit "ppro_p01" 2 0
+ (and (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "imov,fmov"))
+ (eq_attr "memory" "none"))
+ 1 1)
+
+ (define_function_unit "ppro_p2" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (ior (eq_attr "type" "pop")
+ (eq_attr "memory" "load,both")))
+ 3 1)
+
+ (define_function_unit "ppro_p34" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (ior (eq_attr "type" "push")
+ (eq_attr "memory" "store,both")))
+ 1 1)
+
+ (define_function_unit "fpu" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
+ 1 1)
+
+ (define_function_unit "fpu" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fmul"))
+ 5 2)
+
+ (define_function_unit "fpu" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "fdiv,fpspc"))
+ 56 56)
+
+ ;; imul uses the fpu. ??? does it have the same throughput as fmul?
+ (define_function_unit "fpu" 1 0
+ (and (eq_attr "cpu" "pentiumpro")
+ (eq_attr "type" "imul"))
+ 4 1)