This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC] P6 microarch (Pentium 2/3) DFA scheduler description


Steven Bosscher <stevenb@suse.de> writes:

> 
> Index: gcc/config/i386/athlon.md
> ===================================================================
> RCS file: /cvs/gcc/gcc/gcc/config/i386/athlon.md,v
> retrieving revision 1.8
> diff -c -3 -p -r1.8 athlon.md
> *** gcc/config/i386/athlon.md	13 Dec 2003 04:44:05 -0000	1.8
> --- gcc/config/i386/athlon.md	25 Feb 2004 01:00:51 -0000
> ***************
> *** 581,587 ****
>   			 (and (eq_attr "cpu" "k8")
>   			      (eq_attr "type" "sselog"))
>   			 "athlon-double,athlon-fpsched,athlon-fmul")
> ! ;; ??? pcmp executes in addmul, probably not wortwhile to brother about that.
>   (define_insn_reservation "athlon_ssecmp_load" 2
>   			 (and (eq_attr "cpu" "athlon")
>   			      (and (eq_attr "type" "ssecmp")
> --- 581,587 ----
>   			 (and (eq_attr "cpu" "k8")
>   			      (eq_attr "type" "sselog"))
>   			 "athlon-double,athlon-fpsched,athlon-fmul")
> ! ;; ??? pcmp executes in addmul, probably not wortwhile to bother about that.
                                                 ^^^^^^^^^

 Should this be worthwhile?

> Index: gcc/config/i386/ppro.md
> ===================================================================
> RCS file: /cvs/gcc/gcc/gcc/config/i386/ppro.md,v
> retrieving revision 1.4
> diff -c -3 -p -r1.4 ppro.md
> *** gcc/config/i386/ppro.md	26 Sep 2003 04:07:46 -0000	1.4
> --- gcc/config/i386/ppro.md	25 Feb 2004 01:00:55 -0000
> ***************
> *** 1,5 ****
> ! ;; Pentium Pro/PII Scheduling
> ! ;; Copyright (C) 2002 Free Software Foundation, Inc.
>   ;;
>   ;; This file is part of GCC.
>   ;;
> --- 1,5 ----
> ! ;; Scheduling for the Intel P6 family of processors
> ! ;; Copyright (C) 2004 Free Software Foundation, Inc.

 Nothing is left in this file from 2002?

>   ;;
>   ;; This file is part of GCC.
>   ;;
> ***************
> *** 18,150 ****
>   ;; the Free Software Foundation, 59 Temple Place - Suite 330,
>   ;; Boston, MA 02111-1307, USA.  */
>   
> ! ;; Categorize how many uops an ia32 instruction evaluates to:
> ! ;;   one --  an instruction with 1 uop can be decoded by any of the
> ! ;;           three decoders.
> ! ;;   few --  an instruction with 1 to 4 uops can be decoded only by 
> ! ;;	     decoder 0.
> ! ;;   many -- a complex instruction may take an unspecified number of
> ! ;;	     cycles to decode in decoder 0.
> ! 
> ! (define_attr "ppro_uops" "one,few,many"
> !   (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
> ! 	   (const_string "many")
> ! 	 (eq_attr "type" "icmov,fcmov,str,cld,leave")
> ! 	   (const_string "few")
> ! 	 (eq_attr "type" "imov")
> ! 	   (if_then_else (eq_attr "memory" "store,both")
> ! 	     (const_string "few")
> ! 	     (const_string "one"))
> ! 	 (eq_attr "memory" "!none")
> ! 	   (const_string "few")
> ! 	]
> ! 	(const_string "one")))
> ! 
> ! ;;
> ! ;; The PPro has an out-of-order core, but the instruction decoders are
> ! ;; naturally in-order and asymmetric.  We get best performance by scheduling
> ! ;; for the decoders, for in doing so we give the oo execution unit the 
> ! ;; most choices.
> ! ;;
> ! ;; Rough readiness numbers.  Fine tuning happens in i386.c.
> ! ;;
> ! ;; p0	describes port 0.
> ! ;; p01	describes ports 0 and 1 as a pair; alu insns can issue to either.
> ! ;; p2	describes port 2 for loads.
> ! ;; p34	describes ports 3 and 4 for stores.
> ! ;; fpu	describes the fpu accessed via port 0. 
> ! ;;	??? It is less than clear if there are separate fadd and fmul units
> ! ;;	that could operate in parallel.
> ! ;;
> ! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "ishift,rotate,ishift1,rotate1,lea,ibr,cld"))
> !   1 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "imul"))
> !   4 1)
> ! 
> ! ;; ??? Does the divider lock out the pipe while it works,
> ! ;; or is there a disconnected unit?
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "idiv"))
> !   17 17)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fop,fsgn,fistp"))
> !   3 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fcmov"))
> !   2 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fcmp"))
> !   1 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fmov"))
> !   1 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fmul"))
> !   5 1)
> ! 
> ! (define_function_unit "ppro_p0" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fdiv,fpspc"))
> !   56 1)
> ! 
> ! (define_function_unit "ppro_p01" 2 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "!imov,fmov"))
> !   1 1)
> ! 
> ! (define_function_unit "ppro_p01" 2 0
> !   (and (and (eq_attr "cpu" "pentiumpro")
> !             (eq_attr "type" "imov,fmov"))
> !        (eq_attr "memory" "none"))
> !   1 1)
> ! 
> ! (define_function_unit "ppro_p2" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (ior (eq_attr "type" "pop,leave")
> ! 	    (eq_attr "memory" "load,both")))
> !   3 1)
> ! 
> ! (define_function_unit "ppro_p34" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (ior (eq_attr "type" "push")
> ! 	    (eq_attr "memory" "store,both")))
> !   1 1)
> ! 
> ! (define_function_unit "fpu" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
> !   1 1)
> ! 
> ! (define_function_unit "fpu" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fmul"))
> !   5 2)
> ! 
> ! (define_function_unit "fpu" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "fdiv,fpspc"))
> !   56 56)
> ! 
> ! ;; imul uses the fpu.  ??? does it have the same throughput as fmul?
> ! (define_function_unit "fpu" 1 0
> !   (and (eq_attr "cpu" "pentiumpro")
> !        (eq_attr "type" "imul"))
> !   4 1)
> --- 18,717 ----
>   ;; the Free Software Foundation, 59 Temple Place - Suite 330,
>   ;; Boston, MA 02111-1307, USA.  */
>   
> ! ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
> ! ;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
> ! ;; based on information that can be found in the following two documents:
> ! ;;
> ! ;;    "P6 Family of Processors Hardware Developer's Manual",
> ! ;;    Intel, September 1999.
> ! ;;
> ! ;;    "Intel Architecture Optimization Manual",
> ! ;;    Intel, 1999 (Order Number: 245127-001).
> ! ;;
> ! ;; The P6 pipeline has three major components:
> ! ;;   1) the FETCH/DECODE unit, an in-order issue front-end
> ! ;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
> ! ;;   3) the RETIRE unit, an in-order retirement unit
> ! ;;
> ! ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
> ! ;; retirement unit are naturally in-order.
> ! ;;
> ! ;;                       BUS INTERFACE UNIT
> ! ;;                     /                   \
> ! ;;                L1 ICACHE             L1 DCACHE
> ! ;;              /     |     \              |     \
> ! ;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
> ! ;;              \     |     /              |        |
> ! ;;            INSTRUCTION POOL   __________|_______/
> ! ;;          (inc. reorder buffer)
> ! ;;
> ! ;; Since the P6 CPUs execute instructions out-of-order, the most important
> ! ;; consideration in performance tuning is making sure enough micro-ops are
> ! ;; ready for execution in the out-of-order core, while not stalling the
> ! ;; decoder.
> ! ;;
> ! ;; TODO:
> ! ;; - Find a less crude way to model complex instructions.
> ! ;; - Include decoder latencies in the total reservation latencies.
> ! ;;   This isn't necessary now because we assume for every instruction
> ! ;;   that it never blocks a decoder.
> ! ;; - Figure out where the p0 and p1 reservations come from.  These
> ! ;;   appear not to be in the manual (e.g. why is cld "(p0+p1)*2" better
> ! ;;   than "(p0|p1)*4" ???)
> ! ;; - Lots more because I'm sure this is still far from optimal :-)
> ! 
> ! ;; The ppro_idiv automaton is used to model issue latencies of idiv insns.
> ! (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_load,ppro_store")
> ! 
> ! ;; Simple instructions of the register-register form have only one uop.
> ! ;; Load instructions are also only one uop.  Store instructions decode to
> ! ;; two uops, and simple read-modify instructions also take two µops.
                                                                   ^^^^
 uops?

> ! ;; Simple instructions of the register-memory form have two to three uops.
> ! ;; Simple read-modify-write instructions have four uops.  The rules for
> ! ;; the decoder are simple:
> ! ;;  - an instruction with 1 uop can be decoded by any of the three
> ! ;;    decoders in one cycle.
> ! ;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
> ! ;;    but still in only one cycle.
> ! ;;  - a complex instruction can also only be decoded by decoder 0, and
> ! ;;    this takes an unspecified number of cycles.
> ! ;;
> ! ;; The goal is to schedule suck that we have few-one-one uops sequences

 "such that we have a few-one-one"?

> ! ;; in each cycle to decode as many instructions per cycle as possible.
> ! (define_cpu_unit "decoder0" "ppro_decoder")
> ! (define_cpu_unit "decoder1" "ppro_decoder")
> ! (define_cpu_unit "decoder2" "ppro_decoder")
> ! 
> ! ;; All simple instructions can be decoded on each of the three decoders.
> ! (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
> ! 
> ! ;; The out-of-order core has five pipelines.  During each cycle, the core
> ! ;; may dispatch zero or one uop on the port of any of the five pipelines
> ! ;; so the maximum number of dispatched uops per cycle is 5.  In practicer,
                                                                            ^
> ! ;; 3 uops per cycle is more realistic.
> ! ;;
> ! ;; Two of the five pipelines contain several execution units:
> ! ;;
> ! ;; Port 0	Port 1		Port 2		Port 3		Port 4
> ! ;; ALU		ALU		LOAD		SAC		SDA
> ! ;; FPU		JUE
> ! ;; AGU		MMX
> ! ;; MMX		P3FPU
> ! ;; P3FPU
> ! ;;
> ! ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
> ! ;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
> ! ;;
> ! (define_cpu_unit "p0,p1" "ppro_core")
> ! (define_cpu_unit "p2" "ppro_load")
> ! (define_cpu_unit "p3,p4" "ppro_store")
> ! (define_cpu_unit "idiv" "ppro_idiv")
> ! 
> ! ;; We assume throughout that when a load and a store are paired, they
> ! ;; produce zero latency.  Only the irregular instructions are modeled,
> ! ;; the simple ones follow a very regular pattern of 1 uop per reg-reg
> ! ;; operation, 1 uop per load on port 2, and 2 uops per store on port 4
> ! ;; and port 3.  These instructions are modelled at the bottom of this
> ! ;; file.
> ! 
> ! ;; For some instructions we don't know how many uops are produced.
> ! ;; These instructions are the "complex" ones in the Intel manuals.
> ! ;; What we _do_ know is that they typically produce four or more
> ! ;; uops, so they can only be decoded on decoder0.  Modelling their
> ! ;; latency doesn't make sense because we don't know how these
> ! ;; instructions are executed in the core.  So we just model that
> ! ;; they can only be decoded on decoder 0, and say that it takes
> ! ;; a little while before the result is available.
> ! (define_insn_reservation "ppro_complex_insn" 6
> ! 			 (eq_attr "type" "other,multi,call,callv,str")
> ! 			 "decoder0")
> ! 
> ! ;; imov with memory operands does not use the integer units.
> ! (define_insn_reservation "ppro_imov" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "imov")))
> ! 			 "decodern,(p0|p1)")
> ! 
> ! (define_insn_reservation "ppro_imov_load" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "load")
> ! 				   (eq_attr "type" "imov")))
> ! 			 "decodern,p2")
> ! 
> ! (define_insn_reservation "ppro_imov_store" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "store")
> ! 				   (eq_attr "type" "imov")))
> ! 			 "decoder0,p4+p3")
> ! 
> ! ;; imovx always decodes to one uop, and also doesn't use the integer
> ! ;; units if it has memory operands.
> ! (define_insn_reservation "ppro_imovx" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "imovx")))
> ! 			 "decodern,(p0|p1)")
> ! 
> ! (define_insn_reservation "ppro_imovx_load" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "load")
> ! 				   (eq_attr "type" "imovx")))
> ! 			 "decodern,p2")
> ! 
> ! ;; lea executes on port 0 with latency one and throughput 1.
> ! (define_insn_reservation "ppro_lea" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "lea")))
> ! 			 "decodern,p0")
> ! 
> ! ;; Shift and rotate execute on port 0 with latency and throughput 1.
> ! ;; The load and store units need to be reserved when memory operands
> ! ;; are involved.
> ! (define_insn_reservation "ppro_shift_rotate" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
> ! 			 "decodern,p0")
> ! 
> ! (define_insn_reservation "ppro_shift_rotate_mem" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "!none")
> ! 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
> ! 			 "decoder0,p2,p0,p4+p3")
> ! 
> ! (define_insn_reservation "ppro_cld" 2
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (eq_attr "type" "cld"))
> ! 			 "decoder0,(p0+p1)*2")
> ! 
> ! ;; The P6 has a sophisticated branch prediction mechanism to miminize
                                                                 ^^^^
> ! ;; latencies due to branching.  In particular, it has a fast way to
> ! ;; execute branches that are taken multiple times (such as in loops).
> ! ;; Branches not taken suffer no penalty, and correctly predicted
> ! ;; branches cost only one fetch cycle.  Mispredicted branches are very
> ! ;; costly: typically 15 cycles and possibly as many as 26 cycles.
> ! ;;
> ! ;; Unfortunately all this makes it quite difficult to proper model

 properly model.

> ! ;; the latencies for the compiler.  Here I've made the choice to be
> ! ;; optimistic and assume branches are often predicted correctly, so
> ! ;; they have latency 1, and the decoders are not blocked.
> ! ;;
> ! ;; In addition, the model assumes a branch always decodes to only 1 uop,
> ! ;; which is not exactly true because there are a few instructions that
> ! ;; decode to 2 uops or microcode.  But this probably gives the best

 What does microcode mean here?

> ! ;; results because we can assume these instructions can decode on all
> ! ;; decoders.
> ! (define_insn_reservation "ppro_branch" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "ibr")))
> ! 			 "decodern,p1")
> ! 
> ! (define_insn_reservation "ppro_indirect_branch" 3
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "!none")
> ! 				   (eq_attr "type" "ibr")))
> ! 			 "decoder0,p2,p1")
> ! 
> ! (define_insn_reservation "ppro_leave" 1
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (eq_attr "type" "leave"))
> ! 			 "decoder0,p2+(p0|p1),(p0|p1)")
> ! 
> ! ;; imul has throughput one, but latency 4, and can only execute on port 0.
> ! (define_insn_reservation "ppro_imul" 4
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "none")
> ! 				   (eq_attr "type" "imul")))
> ! 			 "decodern,p0")
> ! 
> ! (define_insn_reservation "ppro_imul_mem" 4
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "memory" "!none")
> ! 				   (eq_attr "type" "imul")))
> ! 			 "decoder0,p2+p0")
> ! 
> ! ;; div and idiv are very similar, so we model them the same.
> ! ;; QI,HI,and SI have issue latency 12, 21, and 37, respectively.

 The code has magic numbers 19, 23, 39.  Is it obvious where the extra
7, 2, and 2 are coming from?

> ! (define_insn_reservation "ppro_idiv_QI" 19
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "mode" "QI")
> ! 			           (eq_attr "type" "idiv")))
> ! 			 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*8")
> ! 
> ! (define_insn_reservation "ppro_idiv_HI" 23
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "mode" "HI")
> ! 			           (eq_attr "type" "idiv")))
> ! 			 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*17")
> ! 
> ! (define_insn_reservation "ppro_idiv_SI" 39
> ! 			 (and (eq_attr "cpu" "pentiumpro")
> ! 			      (and (eq_attr "mode" "SI")
> ! 			           (eq_attr "type" "idiv")))
> ! 			 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*33")
> ! 

Thanks, this was fun to read.

Jim


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]