This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [RFC] P6 microarch (Pentium 2/3) DFA scheduler description
- From: James Morrison <ja2morri at csclub dot uwaterloo dot ca>
- To: Steven Bosscher <stevenb at suse dot de>
- Cc: gcc-patches at gcc dot gnu dot org, jh at suse dot cz, law at redhat dot com, vmakarov at redhat dot com
- Date: 25 Feb 2004 21:51:35 -0500
- Subject: Re: [RFC] P6 microarch (Pentium 2/3) DFA scheduler description
- References: <200402260115.17876.stevenb@suse.de>
Steven Bosscher <stevenb@suse.de> writes:
>
> Index: gcc/config/i386/athlon.md
> ===================================================================
> RCS file: /cvs/gcc/gcc/gcc/config/i386/athlon.md,v
> retrieving revision 1.8
> diff -c -3 -p -r1.8 athlon.md
> *** gcc/config/i386/athlon.md 13 Dec 2003 04:44:05 -0000 1.8
> --- gcc/config/i386/athlon.md 25 Feb 2004 01:00:51 -0000
> ***************
> *** 581,587 ****
> (and (eq_attr "cpu" "k8")
> (eq_attr "type" "sselog"))
> "athlon-double,athlon-fpsched,athlon-fmul")
> ! ;; ??? pcmp executes in addmul, probably not wortwhile to brother about that.
> (define_insn_reservation "athlon_ssecmp_load" 2
> (and (eq_attr "cpu" "athlon")
> (and (eq_attr "type" "ssecmp")
> --- 581,587 ----
> (and (eq_attr "cpu" "k8")
> (eq_attr "type" "sselog"))
> "athlon-double,athlon-fpsched,athlon-fmul")
> ! ;; ??? pcmp executes in addmul, probably not wortwhile to bother about that.
^^^^^^^^^
Should this be "worthwhile"?
> Index: gcc/config/i386/ppro.md
> ===================================================================
> RCS file: /cvs/gcc/gcc/gcc/config/i386/ppro.md,v
> retrieving revision 1.4
> diff -c -3 -p -r1.4 ppro.md
> *** gcc/config/i386/ppro.md 26 Sep 2003 04:07:46 -0000 1.4
> --- gcc/config/i386/ppro.md 25 Feb 2004 01:00:55 -0000
> ***************
> *** 1,5 ****
> ! ;; Pentium Pro/PII Scheduling
> ! ;; Copyright (C) 2002 Free Software Foundation, Inc.
> ;;
> ;; This file is part of GCC.
> ;;
> --- 1,5 ----
> ! ;; Scheduling for the Intel P6 family of processors
> ! ;; Copyright (C) 2004 Free Software Foundation, Inc.
Nothing is left in this file from 2002?
> ;;
> ;; This file is part of GCC.
> ;;
> ***************
> *** 18,150 ****
> ;; the Free Software Foundation, 59 Temple Place - Suite 330,
> ;; Boston, MA 02111-1307, USA. */
>
> ! ;; Categorize how many uops an ia32 instruction evaluates to:
> ! ;; one -- an instruction with 1 uop can be decoded by any of the
> ! ;; three decoders.
> ! ;; few -- an instruction with 1 to 4 uops can be decoded only by
> ! ;; decoder 0.
> ! ;; many -- a complex instruction may take an unspecified number of
> ! ;; cycles to decode in decoder 0.
> !
> ! (define_attr "ppro_uops" "one,few,many"
> ! (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
> ! (const_string "many")
> ! (eq_attr "type" "icmov,fcmov,str,cld,leave")
> ! (const_string "few")
> ! (eq_attr "type" "imov")
> ! (if_then_else (eq_attr "memory" "store,both")
> ! (const_string "few")
> ! (const_string "one"))
> ! (eq_attr "memory" "!none")
> ! (const_string "few")
> ! ]
> ! (const_string "one")))
> !
> ! ;;
> ! ;; The PPro has an out-of-order core, but the instruction decoders are
> ! ;; naturally in-order and asymmetric. We get best performance by scheduling
> ! ;; for the decoders, for in doing so we give the oo execution unit the
> ! ;; most choices.
> ! ;;
> ! ;; Rough readiness numbers. Fine tuning happens in i386.c.
> ! ;;
> ! ;; p0 describes port 0.
> ! ;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either.
> ! ;; p2 describes port 2 for loads.
> ! ;; p34 describes ports 3 and 4 for stores.
> ! ;; fpu describes the fpu accessed via port 0.
> ! ;; ??? It is less than clear if there are separate fadd and fmul units
> ! ;; that could operate in parallel.
> ! ;;
> ! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "ishift,rotate,ishift1,rotate1,lea,ibr,cld"))
> ! 1 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "imul"))
> ! 4 1)
> !
> ! ;; ??? Does the divider lock out the pipe while it works,
> ! ;; or is there a disconnected unit?
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "idiv"))
> ! 17 17)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fop,fsgn,fistp"))
> ! 3 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fcmov"))
> ! 2 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fcmp"))
> ! 1 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fmov"))
> ! 1 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fmul"))
> ! 5 1)
> !
> ! (define_function_unit "ppro_p0" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fdiv,fpspc"))
> ! 56 1)
> !
> ! (define_function_unit "ppro_p01" 2 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "!imov,fmov"))
> ! 1 1)
> !
> ! (define_function_unit "ppro_p01" 2 0
> ! (and (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "imov,fmov"))
> ! (eq_attr "memory" "none"))
> ! 1 1)
> !
> ! (define_function_unit "ppro_p2" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (ior (eq_attr "type" "pop,leave")
> ! (eq_attr "memory" "load,both")))
> ! 3 1)
> !
> ! (define_function_unit "ppro_p34" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (ior (eq_attr "type" "push")
> ! (eq_attr "memory" "store,both")))
> ! 1 1)
> !
> ! (define_function_unit "fpu" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
> ! 1 1)
> !
> ! (define_function_unit "fpu" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fmul"))
> ! 5 2)
> !
> ! (define_function_unit "fpu" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "fdiv,fpspc"))
> ! 56 56)
> !
> ! ;; imul uses the fpu. ??? does it have the same throughput as fmul?
> ! (define_function_unit "fpu" 1 0
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "imul"))
> ! 4 1)
> --- 18,717 ----
> ;; the Free Software Foundation, 59 Temple Place - Suite 330,
> ;; Boston, MA 02111-1307, USA. */
>
> ! ;; The P6 familiy includes the Pentium Pro, Pentium II, Pentium III, Celeron
> ! ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
> ! ;; based on information that can be found in the following two documents:
> ! ;;
> ! ;; "P6 Family of Processors Hardware Developer's Manual",
> ! ;; Intel, September 1999.
> ! ;;
> ! ;; "Intel Architecture Optimization Manual",
> ! ;; Intel, 1999 (Order Number: 245127-001).
> ! ;;
> ! ;; The P6 pipeline has three major components:
> ! ;; 1) the FETCH/DECODE unit, an in-order issue front-end
> ! ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
> ! ;; 3) the RETIRE unit, an in-order retirement unit
> ! ;;
> ! ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
> ! ;; retirement unit are naturally in-order.
> ! ;;
> ! ;; BUS INTERFACE UNIT
> ! ;; / \
> ! ;; L1 ICACHE L1 DCACHE
> ! ;; / | \ | \
> ! ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
> ! ;; \ | / | |
> ! ;; INSTRUCTION POOL __________|_______/
> ! ;; (inc. reorder buffer)
> ! ;;
> ! ;; Since the P6 CPUs execute instructions out-of-order, the most important
> ! ;; consideration in performance tuning is making sure enough micro-ops are
> ! ;; ready for execution in the out-of-order core, while not stalling the
> ! ;; decoder.
> ! ;;
> ! ;; TODO:
> ! ;; - Find a less crude way to model complex instructions.
> ! ;; - Include decoder latencies in the total reservation latencies.
> ! ;; This isn't necessary now because we assume for every instruction
> ! ;; that it never blocks a decoder.
> ! ;; - Figure out where the p0 and p1 reservations come from. These
> ! ;; appear not to be in the manual (e.g. why is cld "(p0+p1)*2" better
> ! ;; than "(p0|p1)*4" ???)
> ! ;; - Lots more because I'm sure this is still far from optimal :-)
> !
> ! ;; The ppro_div automaton is used to model issue latencies of idiv insns.
> ! (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_load,ppro_store")
> !
> ! ;; Simple instructions of the register-register form have only one uop.
> ! ;; Load instructions are also only one uop. Store instructions decode to
> ! ;; two uops, and simple read-modify instructions also take two µops.
^^^^
uops?
> ! ;; Simple instructions of the register-memory form have two to three uops.
> ! ;; Simple read-modify-write instructions have four uops. The rules for
> ! ;; the decoder are simple:
> ! ;; - an instruction with 1 uop can be decoded by any of the three
> ! ;; decoders in one cycle.
> ! ;; - an instruction with 1 to 4 uops can be decoded only by decoder 0
> ! ;; but still in only one cycle.
> ! ;; - a complex instruction can also only be decoded by decoder 0, and
> ! ;; this takes an unspecified number of cycles.
> ! ;;
> ! ;; The goal is to schedule suck that we have few-one-one uops sequences
"such that we have a few-one-one"?
> ! ;; in each cycle to decode as many instructions per cycle as possible.
> ! (define_cpu_unit "decoder0" "ppro_decoder")
> ! (define_cpu_unit "decoder1" "ppro_decoder")
> ! (define_cpu_unit "decoder2" "ppro_decoder")
> !
> ! ;; All simple instructions can be decoded on each of the three decoders.
> ! (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
> !
> ! ;; The out-of-order core has five pipelines. During each cycle, the core
> ! ;; may dispatch zero or one uop on the port of any of the five pipelines
> ! ;; so the maximum number of dispatched uops per cycle is 5. In practicer,
^ "practice"?
> ! ;; 3 uops per cycle is more realistic.
> ! ;;
> ! ;; Two of the five pipelines contain several execution units:
> ! ;;
> ! ;; Port 0 Port 1 Port 2 Port 3 Port 4
> ! ;; ALU ALU LOAD SAC SDA
> ! ;; FPU JUE
> ! ;; AGU MMX
> ! ;; MMX P3FPU
> ! ;; P3FPU
> ! ;;
> ! ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
> ! ;; JUE = Jump Execution Unit, AGU = Addres Generation Unit)
> ! ;;
> ! (define_cpu_unit "p0,p1" "ppro_core")
> ! (define_cpu_unit "p2" "ppro_load")
> ! (define_cpu_unit "p3,p4" "ppro_store")
> ! (define_cpu_unit "idiv" "ppro_idiv")
> !
> ! ;; We assume throughout that when a load and a store are paired, they
> ! ;; produce zero latency. Only the irregular instructions are modeled,
> ! ;; the simple ones follow a very regular pattern of 1 uop per reg-reg
> ! ;; operation, 1 uop per load on port 2, and 2 uops per store on port 4
> ! ;; and port 3. These instructions are modelled at the bottom of this
> ! ;; file.
> !
> ! ;; For some instructions we don't know how many uops are produced.
> ! ;; These instructions are the "complex" ones in the Intel manuals.
> ! ;; What we _do_ know is that they typically produce four or more
> ! ;; uops, so they can only be decoded on decoder0. Modelling their
> ! ;; latency doesn't make sense because we don't know how these
> ! ;; instructions are executed in the core. So we just model that
> ! ;; they can only be decoded on decoder 0, and say that it takes
> ! ;; a little while before the result is availale.
> ! (define_insn_reservation "ppro_complex_insn" 6
> ! (eq_attr "type" "other,multi,call,callv,str")
> ! "decoder0")
> !
> ! ;; imov with memory operands does not use the integer units.
> ! (define_insn_reservation "ppro_imov" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "imov")))
> ! "decodern,(p0|p1)")
> !
> ! (define_insn_reservation "ppro_imov_load" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "load")
> ! (eq_attr "type" "imov")))
> ! "decodern,p2")
> !
> ! (define_insn_reservation "ppro_imov_store" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "store")
> ! (eq_attr "type" "imov")))
> ! "decoder0,p4+p3")
> !
> ! ;; imovx always decodes to one uop, and also doesn't use the integer
> ! ;; units if it has memory operands.
> ! (define_insn_reservation "ppro_imovx" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "imovx")))
> ! "decodern,(p0|p1)")
> !
> ! (define_insn_reservation "ppro_imovx_load" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "load")
> ! (eq_attr "type" "imovx")))
> ! "decodern,p2")
> !
> ! ;; lea executes on port 0 with latency one and throughput 1.
> ! (define_insn_reservation "ppro_lea" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "lea")))
> ! "decodern,p0")
> !
> ! ;; Shift and rotate execute on port 0 with latency and throughput 1.
> ! ;; The load and store units need to be reserved when memory operands
> ! ;; are involved.
> ! (define_insn_reservation "ppro_shift_rotate" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
> ! "decodern,p0")
> !
> ! (define_insn_reservation "ppro_shift_rotate_mem" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "!none")
> ! (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
> ! "decoder0,p2,p0,p4+p3")
> !
> ! (define_insn_reservation "ppro_cld" 2
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "cld"))
> ! "decoder0,(p0+p1)*2")
> !
> ! ;; The P6 has a sophisticated branch prediction mechanism to miminize
^^^^ "minimize"?
> ! ;; latencies due to branching. In particular, it has a fast way to
> ! ;; execute branches that are taken multiple times (such as in loops).
> ! ;; Branches not taken suffer no penalty, and correctly predicted
> ! ;; branches cost only one fetch cycle. Mispredicted branches are very
> ! ;; costly: typically 15 cycles and possibly as many as 26 cycles.
> ! ;;
> ! ;; Unfortunatetely all this makes it quite difficult to proper model
properly model.
> ! ;; the latencies for the compiler. Here I've made the choice to be
> ! ;; optimistic and assume branches are often predicted correctly, so
> ! ;; they have latency 1, and the decoders are not blocked.
> ! ;;
> ! ;; In addition, the model assumes a branch always decodes to only 1 uop,
> ! ;; which is not exactly true because there are a few instructions that
> ! ;; decode to 2 uops or microcode. But this probably gives the best
What does microcode mean here?
> ! ;; results because we can assume these instructions can decode on all
> ! ;; decoders.
> ! (define_insn_reservation "ppro_branch" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "ibr")))
> ! "decodern,p1")
> !
> ! (define_insn_reservation "ppro_indirect_branch" 3
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "!none")
> ! (eq_attr "type" "ibr")))
> ! "decoder0,p2,p1")
> !
> ! (define_insn_reservation "ppro_leave" 1
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (eq_attr "type" "leave"))
> ! "decoder0,p2+(p0|p1),(p0|p1)")
> !
> ! ;; imul has throughput one, but latency 4, and can only execute on port 0.
> ! (define_insn_reservation "ppro_imul" 4
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "none")
> ! (eq_attr "type" "imul")))
> ! "decodern,p0")
> !
> ! (define_insn_reservation "ppro_imul_mem" 4
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "memory" "!none")
> ! (eq_attr "type" "imul")))
> ! "decoder0,p2+p0")
> !
> ! ;; div and idiv are very similar, so we model them the same.
> ! ;; QI,HI,and SI have issue latency 12, 21, and 37, respectively.
The code has magic numbers 19, 23, 39. Is it obvious where the extra
7, 2, and 2 are coming from?
> ! (define_insn_reservation "ppro_idiv_QI" 19
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "mode" "QI")
> ! (eq_attr "type" "idiv")))
> ! "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*8")
> !
> ! (define_insn_reservation "ppro_idiv_HI" 23
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "mode" "HI")
> ! (eq_attr "type" "idiv")))
> ! "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*17")
> !
> ! (define_insn_reservation "ppro_idiv_SI" 39
> ! (and (eq_attr "cpu" "pentiumpro")
> ! (and (eq_attr "mode" "SI")
> ! (eq_attr "type" "idiv")))
> ! "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*33")
> !
Thanks, this was fun to read.
Jim