This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
P6 microarch (Pentium 2/3) DFA scheduler description
- From: Steven Bosscher <stevenb at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sun, 29 Feb 2004 19:14:07 +0100
- Subject: P6 microarch (Pentium 2/3) DFA scheduler description
- Organization: SUSE Labs
- References: <200402260115.17876.stevenb@suse.de>
Hello,
Here's a new version of my PPro DFA scheduler description patch.
After making a few changes following the feedback on my posting
of an earlier version of this patch, and after some patient testing
to get reportable SPEC numbers, I would like to have this reviewed
for inclusion. The numbers and the patch are attached.
Bootstrapped and tested on i686-pc-linux-gnu. OK?
Gr.
Steven
Base Compiler: mainline tree as of 24022004
Peak Compiler: same, but using PPro DFA scheduler
cflags base: -O2
cflags peak: -O2
Iterations: 3
Hardware: i686/Pentium III (Coppermine) at 551.296 MHz
SPECint
Size of binaries:
Total: Base: 3142751 bytes
Total: Peak: 3143507 bytes
Compile times for benchmarks:
Total time for base compilation: 994 s
Total time for peak compilation: 995 s
Estimated Estimated
Base Base Base Peak Peak Peak
Benchmarks Ref Time Run Time Ratio Ref Time Run Time Ratio
------------ -------- -------- -------- -------- -------- --------
164.gzip 1400 574 244 * 1400 574 244 *
175.vpr 1400 659 212 * 1400 657 213 *
176.gcc X X
181.mcf 1800 828 217 * 1800 826 218 *
186.crafty 1000 424 236 * 1000 425 235 *
197.parser 1800 1071 168 * 1800 1072 168 *
252.eon 1300 644 202 * 1300 646 201 *
253.perlbmk 1800 634 284 * 1800 637 283 *
254.gap 1100 503 219 * 1100 504 218 *
255.vortex 1900 717 265 * 1900 719 264 *
256.bzip2 1500 780 192 * 1500 783 192 *
300.twolf 3000 1307 230 * 3000 1296 231 *
Est. SPECint_base2000 222
Est. SPECint2000 222
SPECfp
Size of binaries:
Total: Base: 2052132 bytes
Total: Peak: 2044716 bytes
Compile times for benchmarks:
Total time for base compilation: 356 s
Total time for peak compilation: 357 s
Estimated Estimated
Base Base Base Peak Peak Peak
Benchmarks Ref Time Run Time Ratio Ref Time Run Time Ratio
168.wupwise 1600 845 189 * 1600 839 191 *
171.swim 3100 1468 211 * 3100 1464 212 *
172.mgrid X X
173.applu 2100 1601 131 * 2100 1605 131 *
177.mesa 1400 779 180 * 1400 777 180 *
178.galgel X X
179.art 2600 2320 112 * 2600 2319 112 *
183.equake 1300 791 164 * 1300 781 166 *
187.facerec X X
188.ammp 2200 1644 134 * 2200 1653 133 *
189.lucas X X
191.fma3d X X
200.sixtrack 1100 1144 96.2* 1100 1132 97.2*
301.apsi 2600 1700 153 * 2600 1723 151 *
Est. SPECfp_base2000 148
Est. SPECfp2000 148
* ppro.md: Rewrite as a DFA pipeline description.
* i386.md: Remove all uses of the ppro_uops attribute.
* i386.c (ix86_safe_ppro_uops, ix86_dump_ppro_packet,
ix86_reorder_insn, ix86_sched_reorder_ppro, ix86_sched_init,
ix86_sched_reorder, ix86_variable_issue,
struct ix86_sched_data, TARGET_SCHED_VARIABLE_ISSUE,
TARGET_SCHED_INIT, TARGET_SCHED_REORDER): Remove.
(ia32_use_dfa_pipeline_interface): Add TARGET_PENTIUMPRO.
(ia32_multipass_dfa_lookahead): Add TARGET_PENTIUMPRO.
* athlon.md (athlon_ssecmp_load): Fix comment.
Index: ./gcc/config/i386/athlon.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/athlon.md,v
retrieving revision 1.8
diff -c -3 -p -r1.8 athlon.md
*** ./gcc/config/i386/athlon.md 13 Dec 2003 04:44:05 -0000 1.8
--- ./gcc/config/i386/athlon.md 29 Feb 2004 18:06:27 -0000
***************
*** 581,587 ****
(and (eq_attr "cpu" "k8")
(eq_attr "type" "sselog"))
"athlon-double,athlon-fpsched,athlon-fmul")
! ;; ??? pcmp executes in addmul, probably not wortwhile to brother about that.
(define_insn_reservation "athlon_ssecmp_load" 2
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "ssecmp")
--- 581,587 ----
(and (eq_attr "cpu" "k8")
(eq_attr "type" "sselog"))
"athlon-double,athlon-fpsched,athlon-fmul")
! ;; ??? pcmp executes in addmul, probably not worthwhile to bother about that.
(define_insn_reservation "athlon_ssecmp_load" 2
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "ssecmp")
Index: ./gcc/config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.654
diff -c -3 -p -r1.654 i386.c
*** ./gcc/config/i386/i386.c 26 Feb 2004 21:43:34 -0000 1.654
--- ./gcc/config/i386/i386.c 29 Feb 2004 18:06:56 -0000
*************** static rtx gen_push (rtx);
*** 806,814 ****
static int memory_address_length (rtx addr);
static int ix86_flags_dependant (rtx, rtx, enum attr_type);
static int ix86_agi_dependant (rtx, rtx, enum attr_type);
- static enum attr_ppro_uops ix86_safe_ppro_uops (rtx);
- static void ix86_dump_ppro_packet (FILE *);
- static void ix86_reorder_insn (rtx *, rtx *);
static struct machine_function * ix86_init_machine_status (void);
static int ix86_split_to_parts (rtx, rtx *, enum machine_mode);
static int ix86_nsaved_regs (void);
--- 806,811 ----
*************** static void ix86_emit_save_regs (void);
*** 816,831 ****
static void ix86_emit_save_regs_using_mov (rtx, HOST_WIDE_INT);
static void ix86_emit_restore_regs_using_mov (rtx, HOST_WIDE_INT, int);
static void ix86_output_function_epilogue (FILE *, HOST_WIDE_INT);
- static void ix86_sched_reorder_ppro (rtx *, rtx *);
static HOST_WIDE_INT ix86_GOT_alias_set (void);
static void ix86_adjust_counter (rtx, HOST_WIDE_INT);
static rtx ix86_expand_aligntest (rtx, int);
static void ix86_expand_strlensi_unroll_1 (rtx, rtx, rtx);
static int ix86_issue_rate (void);
static int ix86_adjust_cost (rtx, rtx, rtx, int);
- static void ix86_sched_init (FILE *, int, int);
- static int ix86_sched_reorder (FILE *, int, rtx *, int *, int);
- static int ix86_variable_issue (FILE *, int, rtx, int);
static int ia32_use_dfa_pipeline_interface (void);
static int ia32_multipass_dfa_lookahead (void);
static void ix86_init_mmx_sse_builtins (void);
--- 813,824 ----
*************** static void init_ext_80387_constants (vo
*** 974,985 ****
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
- #undef TARGET_SCHED_VARIABLE_ISSUE
- #define TARGET_SCHED_VARIABLE_ISSUE ix86_variable_issue
- #undef TARGET_SCHED_INIT
- #define TARGET_SCHED_INIT ix86_sched_init
- #undef TARGET_SCHED_REORDER
- #define TARGET_SCHED_REORDER ix86_sched_reorder
#undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE
#define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE \
ia32_use_dfa_pipeline_interface
--- 967,972 ----
*************** ix86_adjust_cost (rtx insn, rtx link, rt
*** 12318,12561 ****
return cost;
}
- static union
- {
- struct ppro_sched_data
- {
- rtx decode[3];
- int issued_this_cycle;
- } ppro;
- } ix86_sched_data;
-
- static enum attr_ppro_uops
- ix86_safe_ppro_uops (rtx insn)
- {
- if (recog_memoized (insn) >= 0)
- return get_attr_ppro_uops (insn);
- else
- return PPRO_UOPS_MANY;
- }
-
- static void
- ix86_dump_ppro_packet (FILE *dump)
- {
- if (ix86_sched_data.ppro.decode[0])
- {
- fprintf (dump, "PPRO packet: %d",
- INSN_UID (ix86_sched_data.ppro.decode[0]));
- if (ix86_sched_data.ppro.decode[1])
- fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[1]));
- if (ix86_sched_data.ppro.decode[2])
- fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[2]));
- fputc ('\n', dump);
- }
- }
-
- /* We're beginning a new block. Initialize data structures as necessary. */
-
- static void
- ix86_sched_init (FILE *dump ATTRIBUTE_UNUSED,
- int sched_verbose ATTRIBUTE_UNUSED,
- int veclen ATTRIBUTE_UNUSED)
- {
- memset (&ix86_sched_data, 0, sizeof (ix86_sched_data));
- }
-
- /* Shift INSN to SLOT, and shift everything else down. */
-
- static void
- ix86_reorder_insn (rtx *insnp, rtx *slot)
- {
- if (insnp != slot)
- {
- rtx insn = *insnp;
- do
- insnp[0] = insnp[1];
- while (++insnp != slot);
- *insnp = insn;
- }
- }
-
- static void
- ix86_sched_reorder_ppro (rtx *ready, rtx *e_ready)
- {
- rtx decode[3];
- enum attr_ppro_uops cur_uops;
- int issued_this_cycle;
- rtx *insnp;
- int i;
-
- /* At this point .ppro.decode contains the state of the three
- decoders from last "cycle". That is, those insns that were
- actually independent. But here we're scheduling for the
- decoder, and we may find things that are decodable in the
- same cycle. */
-
- memcpy (decode, ix86_sched_data.ppro.decode, sizeof (decode));
- issued_this_cycle = 0;
-
- insnp = e_ready;
- cur_uops = ix86_safe_ppro_uops (*insnp);
-
- /* If the decoders are empty, and we've a complex insn at the
- head of the priority queue, let it issue without complaint. */
- if (decode[0] == NULL)
- {
- if (cur_uops == PPRO_UOPS_MANY)
- {
- decode[0] = *insnp;
- goto ppro_done;
- }
-
- /* Otherwise, search for a 2-4 uop unsn to issue. */
- while (cur_uops != PPRO_UOPS_FEW)
- {
- if (insnp == ready)
- break;
- cur_uops = ix86_safe_ppro_uops (*--insnp);
- }
-
- /* If so, move it to the head of the line. */
- if (cur_uops == PPRO_UOPS_FEW)
- ix86_reorder_insn (insnp, e_ready);
-
- /* Issue the head of the queue. */
- issued_this_cycle = 1;
- decode[0] = *e_ready--;
- }
-
- /* Look for simple insns to fill in the other two slots. */
- for (i = 1; i < 3; ++i)
- if (decode[i] == NULL)
- {
- if (ready > e_ready)
- goto ppro_done;
-
- insnp = e_ready;
- cur_uops = ix86_safe_ppro_uops (*insnp);
- while (cur_uops != PPRO_UOPS_ONE)
- {
- if (insnp == ready)
- break;
- cur_uops = ix86_safe_ppro_uops (*--insnp);
- }
-
- /* Found one. Move it to the head of the queue and issue it. */
- if (cur_uops == PPRO_UOPS_ONE)
- {
- ix86_reorder_insn (insnp, e_ready);
- decode[i] = *e_ready--;
- issued_this_cycle++;
- continue;
- }
-
- /* ??? Didn't find one. Ideally, here we would do a lazy split
- of 2-uop insns, issue one and queue the other. */
- }
-
- ppro_done:
- if (issued_this_cycle == 0)
- issued_this_cycle = 1;
- ix86_sched_data.ppro.issued_this_cycle = issued_this_cycle;
- }
-
- /* We are about to being issuing insns for this clock cycle.
- Override the default sort algorithm to better slot instructions. */
- static int
- ix86_sched_reorder (FILE *dump ATTRIBUTE_UNUSED,
- int sched_verbose ATTRIBUTE_UNUSED, rtx *ready,
- int *n_readyp, int clock_var ATTRIBUTE_UNUSED)
- {
- int n_ready = *n_readyp;
- rtx *e_ready = ready + n_ready - 1;
-
- /* Make sure to go ahead and initialize key items in
- ix86_sched_data if we are not going to bother trying to
- reorder the ready queue. */
- if (n_ready < 2)
- {
- ix86_sched_data.ppro.issued_this_cycle = 1;
- goto out;
- }
-
- switch (ix86_tune)
- {
- default:
- break;
-
- case PROCESSOR_PENTIUMPRO:
- ix86_sched_reorder_ppro (ready, e_ready);
- break;
- }
-
- out:
- return ix86_issue_rate ();
- }
-
- /* We are about to issue INSN. Return the number of insns left on the
- ready queue that can be issued this cycle. */
-
- static int
- ix86_variable_issue (FILE *dump, int sched_verbose, rtx insn,
- int can_issue_more)
- {
- int i;
- switch (ix86_tune)
- {
- default:
- return can_issue_more - 1;
-
- case PROCESSOR_PENTIUMPRO:
- {
- enum attr_ppro_uops uops = ix86_safe_ppro_uops (insn);
-
- if (uops == PPRO_UOPS_MANY)
- {
- if (sched_verbose)
- ix86_dump_ppro_packet (dump);
- ix86_sched_data.ppro.decode[0] = insn;
- ix86_sched_data.ppro.decode[1] = NULL;
- ix86_sched_data.ppro.decode[2] = NULL;
- if (sched_verbose)
- ix86_dump_ppro_packet (dump);
- ix86_sched_data.ppro.decode[0] = NULL;
- }
- else if (uops == PPRO_UOPS_FEW)
- {
- if (sched_verbose)
- ix86_dump_ppro_packet (dump);
- ix86_sched_data.ppro.decode[0] = insn;
- ix86_sched_data.ppro.decode[1] = NULL;
- ix86_sched_data.ppro.decode[2] = NULL;
- }
- else
- {
- for (i = 0; i < 3; ++i)
- if (ix86_sched_data.ppro.decode[i] == NULL)
- {
- ix86_sched_data.ppro.decode[i] = insn;
- break;
- }
- if (i == 3)
- abort ();
- if (i == 2)
- {
- if (sched_verbose)
- ix86_dump_ppro_packet (dump);
- ix86_sched_data.ppro.decode[0] = NULL;
- ix86_sched_data.ppro.decode[1] = NULL;
- ix86_sched_data.ppro.decode[2] = NULL;
- }
- }
- }
- return --ix86_sched_data.ppro.issued_this_cycle;
- }
- }
-
static int
ia32_use_dfa_pipeline_interface (void)
{
! if (TARGET_PENTIUM || TARGET_ATHLON_K8)
return 1;
return 0;
}
--- 12305,12316 ----
return cost;
}
static int
ia32_use_dfa_pipeline_interface (void)
{
! if (TARGET_PENTIUM
! || TARGET_PENTIUMPRO
! || TARGET_ATHLON_K8)
return 1;
return 0;
}
*************** ia32_multipass_dfa_lookahead (void)
*** 12569,12576 ****
{
if (ix86_tune == PROCESSOR_PENTIUM)
return 2;
else
! return 0;
}
--- 12324,12335 ----
{
if (ix86_tune == PROCESSOR_PENTIUM)
return 2;
+
+ if (ix86_tune == PROCESSOR_PENTIUMPRO)
+ return 1;
+
else
! return 0;
}
Index: ./gcc/config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.514
diff -c -3 -p -r1.514 i386.md
*** ./gcc/config/i386/i386.md 24 Feb 2004 03:27:10 -0000 1.514
--- ./gcc/config/i386/i386.md 29 Feb 2004 18:07:41 -0000
***************
*** 246,252 ****
(const_int 1)
(const_int 0)))
! ;; Set when 0f opcode prefix is used.
(define_attr "prefix_rex" ""
(cond [(and (eq_attr "mode" "DI")
(eq_attr "type" "!push,pop,call,callv,leave,ibr"))
--- 246,252 ----
(const_int 1)
(const_int 0)))
! ;; Set when REX opcode prefix is used.
(define_attr "prefix_rex" ""
(cond [(and (eq_attr "mode" "DI")
(eq_attr "type" "!push,pop,call,callv,leave,ibr"))
***************
*** 939,946 ****
"fnstsw\t%0"
[(set_attr "length" "2")
(set_attr "mode" "SI")
! (set_attr "unit" "i387")
! (set_attr "ppro_uops" "few")])
;; FP compares, step 3
;; Get ax into flags, general case.
--- 939,945 ----
"fnstsw\t%0"
[(set_attr "length" "2")
(set_attr "mode" "SI")
! (set_attr "unit" "i387")])
;; FP compares, step 3
;; Get ax into flags, general case.
***************
*** 952,959 ****
"sahf"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "one")])
;; Pentium Pro can do steps 1 through 3 in one go.
--- 951,957 ----
"sahf"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
! (set_attr "mode" "SI")])
;; Pentium Pro can do steps 1 through 3 in one go.
***************
*** 1264,1271 ****
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "mode" "SI")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "few")])
(define_expand "movhi"
[(set (match_operand:HI 0 "nonimmediate_operand" "")
--- 1262,1268 ----
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "mode" "SI")
! (set_attr "modrm" "0")])
(define_expand "movhi"
[(set (match_operand:HI 0 "nonimmediate_operand" "")
***************
*** 1384,1391 ****
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "HI")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "few")])
(define_insn "*swaphi_2"
[(set (match_operand:HI 0 "register_operand" "+r")
--- 1381,1387 ----
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "HI")
! (set_attr "modrm" "0")])
(define_insn "*swaphi_2"
[(set (match_operand:HI 0 "register_operand" "+r")
***************
*** 1397,1404 ****
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "SI")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "few")])
(define_expand "movstricthi"
[(set (strict_low_part (match_operand:HI 0 "nonimmediate_operand" ""))
--- 1393,1399 ----
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "SI")
! (set_attr "modrm" "0")])
(define_expand "movstricthi"
[(set (strict_low_part (match_operand:HI 0 "nonimmediate_operand" ""))
***************
*** 1557,1564 ****
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "QI")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "few")])
(define_expand "movstrictqi"
[(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" ""))
--- 1552,1558 ----
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "mode" "QI")
! (set_attr "modrm" "0")])
(define_expand "movstrictqi"
[(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" ""))
***************
*** 2115,2122 ****
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "mode" "DI")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "few")])
(define_expand "movsf"
--- 2109,2115 ----
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "mode" "DI")
! (set_attr "modrm" "0")])
(define_expand "movsf"
***************
*** 4450,4457 ****
"fnstcw\t%0"
[(set_attr "length" "2")
(set_attr "mode" "HI")
! (set_attr "unit" "i387")
! (set_attr "ppro_uops" "few")])
(define_insn "x86_fldcw_1"
[(set (reg:HI 18)
--- 4443,4449 ----
"fnstcw\t%0"
[(set_attr "length" "2")
(set_attr "mode" "HI")
! (set_attr "unit" "i387")])
(define_insn "x86_fldcw_1"
[(set (reg:HI 18)
***************
*** 4461,4468 ****
[(set_attr "length" "2")
(set_attr "mode" "HI")
(set_attr "unit" "i387")
! (set_attr "athlon_decode" "vector")
! (set_attr "ppro_uops" "few")])
;; Conversion between fixed point and floating point.
--- 4453,4459 ----
[(set_attr "length" "2")
(set_attr "mode" "HI")
(set_attr "unit" "i387")
! (set_attr "athlon_decode" "vector")])
;; Conversion between fixed point and floating point.
***************
*** 4972,4979 ****
"adc{q}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "DI")
! (set_attr "ppro_uops" "few")])
(define_insn "*adddi3_cc_rex64"
[(set (reg:CC 17)
--- 4963,4969 ----
"adc{q}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "DI")])
(define_insn "*adddi3_cc_rex64"
[(set (reg:CC 17)
***************
*** 4997,5004 ****
"adc{b}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "QI")
! (set_attr "ppro_uops" "few")])
(define_insn "addhi3_carry"
[(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r")
--- 4987,4993 ----
"adc{b}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "QI")])
(define_insn "addhi3_carry"
[(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r")
***************
*** 5010,5017 ****
"adc{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "HI")
! (set_attr "ppro_uops" "few")])
(define_insn "addsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
--- 4999,5005 ----
"adc{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "HI")])
(define_insn "addsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
***************
*** 5023,5030 ****
"adc{l}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "few")])
(define_insn "*addsi3_carry_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
--- 5011,5017 ----
"adc{l}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "SI")])
(define_insn "*addsi3_carry_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
***************
*** 5037,5044 ****
"adc{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "few")])
(define_insn "*addsi3_cc"
[(set (reg:CC 17)
--- 5024,5030 ----
"adc{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
! (set_attr "mode" "SI")])
(define_insn "*addsi3_cc"
[(set (reg:CC 17)
***************
*** 6645,6651 ****
"sbb{q}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "DI")])
(define_insn "*subdi_1_rex64"
--- 6631,6636 ----
***************
*** 6694,6700 ****
"sbb{b}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "QI")])
(define_insn "subhi3_carry"
--- 6679,6684 ----
***************
*** 6707,6713 ****
"sbb{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "HI")])
(define_insn "subsi3_carry"
--- 6691,6696 ----
***************
*** 6720,6726 ****
"sbb{l}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "SI")])
(define_insn "subsi3_carry_zext"
--- 6703,6708 ----
***************
*** 6734,6740 ****
"sbb{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "alu")
(set_attr "pent_pair" "pu")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "SI")])
(define_expand "subsi3"
--- 6716,6721 ----
***************
*** 7155,7161 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"mul{q}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
--- 7136,7141 ----
***************
*** 7183,7189 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"mul{l}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
--- 7163,7168 ----
***************
*** 7272,7278 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"mul{q}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
--- 7251,7256 ----
***************
*** 7308,7314 ****
"GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM"
"mul{l}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
--- 7286,7291 ----
***************
*** 7331,7337 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"mul{l}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
--- 7308,7313 ----
***************
*** 7368,7374 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"imul{q}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
(const_string "vector")
--- 7344,7349 ----
***************
*** 7403,7409 ****
"GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM"
"imul{l}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
(const_string "vector")
--- 7378,7383 ----
***************
*** 7425,7431 ****
&& (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
"imul{l}\t%2"
[(set_attr "type" "imul")
- (set_attr "ppro_uops" "few")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
(const_string "vector")
--- 7399,7404 ----
***************
*** 7465,7472 ****
"TARGET_QIMODE_MATH"
"idiv{b}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "QI")
! (set_attr "ppro_uops" "few")])
(define_insn "udivqi3"
[(set (match_operand:QI 0 "register_operand" "=a")
--- 7438,7444 ----
"TARGET_QIMODE_MATH"
"idiv{b}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "QI")])
(define_insn "udivqi3"
[(set (match_operand:QI 0 "register_operand" "=a")
***************
*** 7476,7483 ****
"TARGET_QIMODE_MATH"
"div{b}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "QI")
! (set_attr "ppro_uops" "few")])
;; The patterns that match these are at the end of this file.
--- 7448,7454 ----
"TARGET_QIMODE_MATH"
"div{b}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "QI")])
;; The patterns that match these are at the end of this file.
***************
*** 7550,7557 ****
"TARGET_64BIT"
"idiv{q}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "DI")
! (set_attr "ppro_uops" "few")])
(define_split
[(set (match_operand:DI 0 "register_operand" "")
--- 7521,7527 ----
"TARGET_64BIT"
"idiv{q}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "DI")])
(define_split
[(set (match_operand:DI 0 "register_operand" "")
***************
*** 7635,7642 ****
""
"idiv{l}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "few")])
(define_split
[(set (match_operand:SI 0 "register_operand" "")
--- 7605,7611 ----
""
"idiv{l}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "SI")])
(define_split
[(set (match_operand:SI 0 "register_operand" "")
***************
*** 7710,7716 ****
"TARGET_64BIT"
"div{q}\t%2"
[(set_attr "type" "idiv")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "DI")])
(define_split
--- 7679,7684 ----
***************
*** 7754,7760 ****
""
"div{l}\t%2"
[(set_attr "type" "idiv")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "SI")])
(define_split
--- 7722,7727 ----
***************
*** 7797,7804 ****
""
"div{w}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "HI")
! (set_attr "ppro_uops" "few")])
;; We can not use div/idiv for double division, because it causes
;; "division by zero" on the overflow and that's not what we expect
--- 7764,7770 ----
""
"div{w}\t%2"
[(set_attr "type" "idiv")
! (set_attr "mode" "HI")])
;; We can not use div/idiv for double division, because it causes
;; "division by zero" on the overflow and that's not what we expect
***************
*** 7817,7824 ****
; (clobber (reg:CC 17))]
; ""
; "div{l}\t{%2, %0|%0, %2}"
! ; [(set_attr "type" "idiv")
! ; (set_attr "ppro_uops" "few")])
;;- Logical AND instructions
--- 7783,7789 ----
; (clobber (reg:CC 17))]
; ""
; "div{l}\t{%2, %0|%0, %2}"
! ; [(set_attr "type" "idiv")])
;;- Logical AND instructions
***************
*** 9932,9939 ****
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "SF")
! (set_attr "ppro_uops" "few")])
(define_insn "*negdf2_1"
[(set (match_operand:DF 0 "register_operand" "=f")
--- 9897,9903 ----
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "SF")])
(define_insn "*negdf2_1"
[(set (match_operand:DF 0 "register_operand" "=f")
***************
*** 9941,9948 ****
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "DF")
! (set_attr "ppro_uops" "few")])
(define_insn "*negextendsfdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
--- 9905,9911 ----
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "DF")])
(define_insn "*negextendsfdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
***************
*** 9951,9958 ****
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "DF")
! (set_attr "ppro_uops" "few")])
(define_insn "*negxf2_1"
[(set (match_operand:XF 0 "register_operand" "=f")
--- 9914,9920 ----
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "DF")])
(define_insn "*negxf2_1"
[(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 9960,9967 ****
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")
! (set_attr "ppro_uops" "few")])
(define_insn "*negextenddfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
--- 9922,9928 ----
"TARGET_80387 && reload_completed"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")])
(define_insn "*negextenddfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 9970,9977 ****
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")
! (set_attr "ppro_uops" "few")])
(define_insn "*negextendsfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
--- 9931,9937 ----
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")])
(define_insn "*negextendsfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
***************
*** 9980,9987 ****
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")
! (set_attr "ppro_uops" "few")])
;; Absolute value instructions
--- 9940,9946 ----
"TARGET_80387"
"fchs"
[(set_attr "type" "fsgn")
! (set_attr "mode" "XF")])
;; Absolute value instructions
***************
*** 10762,10769 ****
(set_attr "prefix_0f" "1")
(set_attr "mode" "SI")
(set_attr "pent_pair" "np")
! (set_attr "athlon_decode" "vector")
! (set_attr "ppro_uops" "few")])
(define_expand "x86_shift_adj_1"
[(set (reg:CCZ 17)
--- 10721,10727 ----
(set_attr "prefix_0f" "1")
(set_attr "mode" "SI")
(set_attr "pent_pair" "np")
! (set_attr "athlon_decode" "vector")])
(define_expand "x86_shift_adj_1"
[(set (reg:CCZ 17)
***************
*** 11466,11472 ****
[(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "pent_pair" "np")
- (set_attr "ppro_uops" "few")
(set_attr "mode" "SI")])
(define_expand "x86_shift_adj_3"
--- 11424,11429 ----
***************
*** 13484,13491 ****
else
return "dec{l}\t%1\;%+jne\t%l0";
}
! [(set_attr "ppro_uops" "many")
! (set (attr "length")
(if_then_else (and (eq_attr "alternative" "0")
(and (ge (minus (match_dup 0) (pc))
(const_int -126))
--- 13441,13447 ----
else
return "dec{l}\t%1\;%+jne\t%l0";
}
! [(set (attr "length")
(if_then_else (and (eq_attr "alternative" "0")
(and (ge (minus (match_dup 0) (pc))
(const_int -126))
***************
*** 13871,13878 ****
"nop"
[(set_attr "length" "1")
(set_attr "length_immediate" "0")
! (set_attr "modrm" "0")
! (set_attr "ppro_uops" "one")])
;; Align to 16-byte boundary, max skip in op0. Used to avoid
;; branch prediction penalty for the third jump in a 16-byte
--- 13827,13833 ----
"nop"
[(set_attr "length" "1")
(set_attr "length_immediate" "0")
! (set_attr "modrm" "0")])
;; Align to 16-byte boundary, max skip in op0. Used to avoid
;; branch prediction penalty for the third jump in a 16-byte
***************
*** 14033,14040 ****
(ctz:SI (match_dup 1)))]
""
"bsf{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")
! (set_attr "ppro_uops" "few")])
(define_insn "ctzsi2"
[(set (match_operand:SI 0 "register_operand" "=r")
--- 13988,13994 ----
(ctz:SI (match_dup 1)))]
""
"bsf{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")])
(define_insn "ctzsi2"
[(set (match_operand:SI 0 "register_operand" "=r")
***************
*** 14042,14049 ****
(clobber (reg:CC 17))]
""
"bsf{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")
! (set_attr "ppro_uops" "few")])
(define_expand "clzsi2"
[(parallel
--- 13996,14002 ----
(clobber (reg:CC 17))]
""
"bsf{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")])
(define_expand "clzsi2"
[(parallel
***************
*** 14064,14071 ****
(clobber (reg:CC 17))]
""
"bsr{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")
! (set_attr "ppro_uops" "few")])
;; Thread-local storage patterns for ELF.
;;
--- 14017,14023 ----
(clobber (reg:CC 17))]
""
"bsr{l}\t{%1, %0|%0, %1}"
! [(set_attr "prefix_0f" "1")])
;; Thread-local storage patterns for ELF.
;;
***************
*** 14482,14488 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
- (set_attr "ppro_uops" "many")
(set_attr "mode" "SI")])
(define_insn "*fop_sf_3"
--- 14434,14439 ----
***************
*** 14500,14506 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
- (set_attr "ppro_uops" "many")
(set_attr "mode" "SI")])
(define_insn "*fop_df_1_nosse"
--- 14451,14456 ----
***************
*** 14581,14587 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
- (set_attr "ppro_uops" "many")
(set_attr "mode" "SI")])
(define_insn "*fop_df_3"
--- 14531,14536 ----
***************
*** 14599,14605 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
- (set_attr "ppro_uops" "many")
(set_attr "mode" "SI")])
(define_insn "*fop_df_4"
--- 14548,14553 ----
***************
*** 14686,14693 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "many")])
(define_insn "*fop_xf_3"
[(set (match_operand:XF 0 "register_operand" "=f,f")
--- 14634,14640 ----
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
! (set_attr "mode" "SI")])
(define_insn "*fop_xf_3"
[(set (match_operand:XF 0 "register_operand" "=f,f")
***************
*** 14704,14711 ****
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
! (set_attr "mode" "SI")
! (set_attr "ppro_uops" "many")])
(define_insn "*fop_xf_4"
[(set (match_operand:XF 0 "register_operand" "=f,f")
--- 14651,14657 ----
]
(const_string "fop")))
(set_attr "fp_int_src" "true")
! (set_attr "mode" "SI")])
(define_insn "*fop_xf_4"
[(set (match_operand:XF 0 "register_operand" "=f,f")
Index: ./gcc/config/i386/ppro.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/ppro.md,v
retrieving revision 1.4
diff -c -3 -p -r1.4 ppro.md
*** ./gcc/config/i386/ppro.md 26 Sep 2003 04:07:46 -0000 1.4
--- ./gcc/config/i386/ppro.md 29 Feb 2004 18:07:43 -0000
***************
*** 1,5 ****
! ;; Pentium Pro/PII Scheduling
! ;; Copyright (C) 2002 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
--- 1,5 ----
! ;; Scheduling for the Intel P6 family of processors
! ;; Copyright (C) 2004 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
***************
*** 18,150 ****
;; the Free Software Foundation, 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA. */
! ;; Categorize how many uops an ia32 instruction evaluates to:
! ;; one -- an instruction with 1 uop can be decoded by any of the
! ;; three decoders.
! ;; few -- an instruction with 1 to 4 uops can be decoded only by
! ;; decoder 0.
! ;; many -- a complex instruction may take an unspecified number of
! ;; cycles to decode in decoder 0.
!
! (define_attr "ppro_uops" "one,few,many"
! (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str")
! (const_string "many")
! (eq_attr "type" "icmov,fcmov,str,cld,leave")
! (const_string "few")
! (eq_attr "type" "imov")
! (if_then_else (eq_attr "memory" "store,both")
! (const_string "few")
! (const_string "one"))
! (eq_attr "memory" "!none")
! (const_string "few")
! ]
! (const_string "one")))
!
! ;;
! ;; The PPro has an out-of-order core, but the instruction decoders are
! ;; naturally in-order and asymmetric. We get best performance by scheduling
! ;; for the decoders, for in doing so we give the oo execution unit the
! ;; most choices.
! ;;
! ;; Rough readiness numbers. Fine tuning happens in i386.c.
! ;;
! ;; p0 describes port 0.
! ;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either.
! ;; p2 describes port 2 for loads.
! ;; p34 describes ports 3 and 4 for stores.
! ;; fpu describes the fpu accessed via port 0.
! ;; ??? It is less than clear if there are separate fadd and fmul units
! ;; that could operate in parallel.
! ;;
! ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "ishift,rotate,ishift1,rotate1,lea,ibr,cld"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imul"))
! 4 1)
!
! ;; ??? Does the divider lock out the pipe while it works,
! ;; or is there a disconnected unit?
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "idiv"))
! 17 17)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fop,fsgn,fistp"))
! 3 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fcmov"))
! 2 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fcmp"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmov"))
! 1 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmul"))
! 5 1)
!
! (define_function_unit "ppro_p0" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fdiv,fpspc"))
! 56 1)
!
! (define_function_unit "ppro_p01" 2 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "!imov,fmov"))
! 1 1)
!
! (define_function_unit "ppro_p01" 2 0
! (and (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imov,fmov"))
! (eq_attr "memory" "none"))
! 1 1)
!
! (define_function_unit "ppro_p2" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (ior (eq_attr "type" "pop,leave")
! (eq_attr "memory" "load,both")))
! 3 1)
!
! (define_function_unit "ppro_p34" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (ior (eq_attr "type" "push")
! (eq_attr "memory" "store,both")))
! 1 1)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp"))
! 1 1)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fmul"))
! 5 2)
!
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fdiv,fpspc"))
! 56 56)
!
! ;; imul uses the fpu. ??? does it have the same throughput as fmul?
! (define_function_unit "fpu" 1 0
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "imul"))
! 4 1)
--- 18,763 ----
;; the Free Software Foundation, 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA. */
! ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
! ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
! ;; based on information that can be found in the following three documents:
! ;;
! ;; "P6 Family of Processors Hardware Developer's Manual",
! ;; Intel, September 1999.
! ;;
! ;; "Intel Architecture Optimization Manual",
! ;; Intel, 1999 (Order Number: 245127-001).
! ;;
! ;; "How to optimize for the Pentium family of microprocessors",
! ;; by Agner Fog, PhD.
! ;;
! ;; The P6 pipeline has three major components:
! ;; 1) the FETCH/DECODE unit, an in-order issue front-end
! ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
! ;; 3) the RETIRE unit, an in-order retirement unit
! ;;
! ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
! ;; retirement unit are naturally in-order.
! ;;
! ;; BUS INTERFACE UNIT
! ;; / \
! ;; L1 ICACHE L1 DCACHE
! ;; / | \ | \
! ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
! ;; \ | / | |
! ;; INSTRUCTION POOL __________|_______/
! ;; (inc. reorder buffer)
! ;;
! ;; Since the P6 CPUs execute instructions out-of-order, the most important
! ;; consideration in performance tuning is making sure enough micro-ops are
! ;; ready for execution in the out-of-order core, while not stalling the
! ;; decoder.
! ;;
! ;; TODO:
! ;; - Find a less crude way to model complex instructions, in
! ;; particular how many cycles they take to be decoded.
! ;; - Include decoder latencies in the total reservation latencies.
! ;; This isn't necessary right now because we assume for every
! ;; instruction that it never blocks a decoder.
! ;; - Figure out where the p0 and p1 reservations come from. These
! ;; appear not to be in the manual (e.g. why is cld "(p0+p1)*2"
! ;; better than "(p0|p1)*4" ???)
! ;; - Lots more because I'm sure this is still far from optimal :-)
!
! ;; The ppro_idiv and ppro_fdiv automata are used to model issue
! ;; latencies of idiv and fdiv type insns.
! (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
!
! ;; Simple instructions of the register-register form have only one uop.
! ;; Load instructions are also only one uop. Store instructions decode to
! ;; two uops, and simple read-modify instructions also take two uops.
! ;; Simple instructions of the register-memory form have two to three uops.
! ;; Simple read-modify-write instructions have four uops. The rules for
! ;; the decoder are simple:
! ;; - an instruction with 1 uop can be decoded by any of the three
! ;; decoders in one cycle.
! ;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
! ;; but still in only one cycle.
! ;; - a complex (microcode) instruction can also only be decoded by
! ;; decoder 0, and this takes an unspecified number of cycles.
! ;;
! ;; The goal is to schedule such that we have a few-one-one uops sequence
! ;; in each cycle, to decode as many instructions per cycle as possible.
! (define_cpu_unit "decoder0" "ppro_decoder")
! (define_cpu_unit "decoder1" "ppro_decoder")
! (define_cpu_unit "decoder2" "ppro_decoder")
!
! ;; We first wish to find an instruction for decoder0, so exclude
! ;; decoder1 and decoder2 from being reserved until decoder 0 is
! ;; reserved.
! (presence_set "decoder1" "decoder0")
! (presence_set "decoder2" "decoder0")
!
! ;; Most instructions can be decoded on any of the three decoders.
! (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
!
! ;; The out-of-order core has five pipelines. During each cycle, the core
! ;; may dispatch zero or one uop on the port of any of the five pipelines
! ;; so the maximum number of dispatched uops per cycle is 5. In practice,
! ;; 3 uops per cycle is more realistic.
! ;;
! ;; Two of the five pipelines contain several execution units:
! ;;
! ;; Port 0 Port 1 Port 2 Port 3 Port 4
! ;; ALU ALU LOAD SAC SDA
! ;; FPU JUE
! ;; AGU MMX
! ;; MMX P3FPU
! ;; P3FPU
! ;;
! ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
! ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
! ;;
! (define_cpu_unit "p0,p1" "ppro_core")
! (define_cpu_unit "p2" "ppro_load")
! (define_cpu_unit "p3,p4" "ppro_store")
! (define_cpu_unit "idiv" "ppro_idiv")
! (define_cpu_unit "fdiv" "ppro_fdiv")
!
! ;; Only the irregular instructions have to be modeled here. A load
! ;; increases the latency by 2 or 3, or by nothing if the manual gives
! ;; a latency already. Store latencies are not accounted for.
! ;;
! ;; The simple instructions follow a very regular pattern of 1 uop per
! ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
! ;; on port 4 and port 3. These instructions are modelled at the bottom
! ;; of this file.
! ;;
! ;; For microcoded instructions we don't know how many uops are produced.
! ;; These instructions are the "complex" ones in the Intel manuals. All
! ;; we _do_ know is that they typically produce four or more uops, so
! ;; they can only be decoded on decoder0. Modelling their latencies
! ;; doesn't make sense because we don't know how these instructions are
! ;; executed in the core. So we just model that they can only be decoded
! ;; on decoder 0, and say that it takes a little while before the result
! ;; is available.
! ;; Restricted to the "pentiumpro" cpu like every other reservation in
! ;; this file, so it cannot match insns when tuning for another CPU.
! (define_insn_reservation "ppro_complex_insn" 6
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "other,multi,call,callv,str"))
! "decoder0")
!
! ;; imov with memory operands does not use the integer units.
! (define_insn_reservation "ppro_imov" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "imov")))
! "decodern,(p0|p1)")
!
! (define_insn_reservation "ppro_imov_load" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "imov")))
! "decodern,p2")
!
! (define_insn_reservation "ppro_imov_store" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (eq_attr "type" "imov")))
! "decoder0,p4+p3")
!
! ;; imovx always decodes to one uop, and also doesn't use the integer
! ;; units if it has memory operands.
! (define_insn_reservation "ppro_imovx" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "imovx")))
! "decodern,(p0|p1)")
!
! (define_insn_reservation "ppro_imovx_load" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "imovx")))
! "decodern,p2")
!
! ;; lea executes on port 0 with latency one and throughput 1.
! (define_insn_reservation "ppro_lea" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "lea")))
! "decodern,p0")
!
! ;; Shift and rotate execute on port 0 with latency and throughput 1.
! ;; The load and store units need to be reserved when memory operands
! ;; are involved.
! (define_insn_reservation "ppro_shift_rotate" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_shift_rotate_mem" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "!none")
! (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
! "decoder0,p2+p0,p4+p3")
!
! (define_insn_reservation "ppro_cld" 2
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "cld"))
! "decoder0,(p0+p1)*2")
!
! ;; The P6 has a sophisticated branch prediction mechanism to minimize
! ;; latencies due to branching. In particular, it has a fast way to
! ;; execute branches that are taken multiple times (such as in loops).
! ;; Branches not taken suffer no penalty, and correctly predicted
! ;; branches cost only one fetch cycle. Mispredicted branches are very
! ;; costly: typically 15 cycles and possibly as many as 26 cycles.
! ;;
! ;; Unfortunately all this makes it quite difficult to properly model
! ;; the latencies for the compiler. Here I've made the choice to be
! ;; optimistic and assume branches are often predicted correctly, so
! ;; they have latency 1, and the decoders are not blocked.
! ;;
! ;; In addition, the model assumes a branch always decodes to only 1 uop,
! ;; which is not exactly true because there are a few instructions that
! ;; decode to 2 uops or microcode. But this probably gives the best
! ;; results because we can assume these instructions can decode on all
! ;; decoders.
! (define_insn_reservation "ppro_branch" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "ibr")))
! "decodern,p1")
!
! ;; ??? Indirect branches probably have worse latency than this.
! (define_insn_reservation "ppro_indirect_branch" 6
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "!none")
! (eq_attr "type" "ibr")))
! "decoder0,p2+p1")
!
! (define_insn_reservation "ppro_leave" 4
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "leave"))
! "decoder0,p2+(p0|p1),(p0|p1)")
!
! ;; imul has throughput one, but latency 4, and can only execute on port 0.
! (define_insn_reservation "ppro_imul" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "imul")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_imul_mem" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "!none")
! (eq_attr "type" "imul")))
! "decoder0,p2+p0")
!
! ;; div and idiv are very similar, so we model them the same.
! ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
! ;; These issue latencies are modelled via the ppro_idiv automaton.
! (define_insn_reservation "ppro_idiv_QI" 19
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "QI")
! (eq_attr "type" "idiv"))))
! "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
!
! (define_insn_reservation "ppro_idiv_QI_load" 19
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "QI")
! (eq_attr "type" "idiv"))))
! "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
!
! (define_insn_reservation "ppro_idiv_HI" 23
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "HI")
! (eq_attr "type" "idiv"))))
! "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
!
! (define_insn_reservation "ppro_idiv_HI_load" 23
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "HI")
! (eq_attr "type" "idiv"))))
! "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
!
! (define_insn_reservation "ppro_idiv_SI" 39
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SI")
! (eq_attr "type" "idiv"))))
! "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
!
! (define_insn_reservation "ppro_idiv_SI_load" 39
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SI")
! (eq_attr "type" "idiv"))))
! "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
!
! ;; Floating point operations always execute on port 0.
! ;; ??? where do these latencies come from? fadd has latency 3 and
! ;; has throughput "1/cycle (align with FADD)". What do they
! ;; mean and how can we model that?
! (define_insn_reservation "ppro_fop" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none,unknown")
! (eq_attr "type" "fop")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_fop_load" 5
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "fop")))
! "decoder0,p2+p0,p0")
!
! (define_insn_reservation "ppro_fop_store" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (eq_attr "type" "fop")))
! "decoder0,p0,p0,p0+p4+p3")
!
! (define_insn_reservation "ppro_fop_both" 5
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "both")
! (eq_attr "type" "fop")))
! "decoder0,p2+p0,p0+p4+p3")
!
! (define_insn_reservation "ppro_fsgn" 1
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fsgn"))
! "decodern,p0")
!
! (define_insn_reservation "ppro_fistp" 5
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fistp"))
! "decoder0,p0*2,p4+p3")
!
! (define_insn_reservation "ppro_fcmov" 2
! (and (eq_attr "cpu" "pentiumpro")
! (eq_attr "type" "fcmov"))
! "decoder0,p0*2")
!
! (define_insn_reservation "ppro_fcmp" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "fcmp")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_fcmp_load" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "fcmp")))
! "decoder0,p2+p0")
!
! (define_insn_reservation "ppro_fmov" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "fmov")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_fmov_load" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "!XF")
! (eq_attr "type" "fmov"))))
! "decodern,p2")
!
! (define_insn_reservation "ppro_fmov_XF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "XF")
! (eq_attr "type" "fmov"))))
! "decoder0,(p2+p0)*2")
!
! (define_insn_reservation "ppro_fmov_store" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (and (eq_attr "mode" "!XF")
! (eq_attr "type" "fmov"))))
! "decodern,p0")
!
! (define_insn_reservation "ppro_fmov_XF_store" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (and (eq_attr "mode" "XF")
! (eq_attr "type" "fmov"))))
! "decoder0,(p0+p4),(p0+p3)")
!
! ;; fmul executes on port 0 with latency 5. It has issue latency 2,
! ;; but we don't model this.
! (define_insn_reservation "ppro_fmul" 5
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "fmul")))
! "decoder0,p0*2")
!
! (define_insn_reservation "ppro_fmul_load" 6
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "fmul")))
! "decoder0,p2+p0,p0")
!
! ;; fdiv latencies depend on the mode of the operands. XFmode gives
! ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
! ;; Division by a power of 2 takes only 9 cycles, but we cannot model
! ;; that. Throughput is equal to latency - 1, which we model using the
! ;; ppro_fdiv automaton.
! (define_insn_reservation "ppro_fdiv_SF" 18
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decodern,p0+fdiv,fdiv*16")
!
! (define_insn_reservation "ppro_fdiv_SF_load" 19
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decoder0,p2+p0+fdiv,fdiv*16")
!
! (define_insn_reservation "ppro_fdiv_DF" 32
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "DF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decodern,p0+fdiv,fdiv*30")
!
! (define_insn_reservation "ppro_fdiv_DF_load" 33
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "DF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decoder0,p2+p0+fdiv,fdiv*30")
!
! (define_insn_reservation "ppro_fdiv_XF" 38
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "XF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decodern,p0+fdiv,fdiv*36")
!
! (define_insn_reservation "ppro_fdiv_XF_load" 39
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "XF")
! (eq_attr "type" "fdiv,fpspc"))))
! "decoder0,p2+p0+fdiv,fdiv*36")
!
! ;; MMX instructions can execute on either port 0 or port 1 with a
! ;; throughput of 1/cycle.
! ;; on port 0: - ALU (latency 1)
! ;; - Multiplier Unit (latency 3)
! ;; on port 1: - ALU (latency 1)
! ;; - Shift Unit (latency 1)
! ;;
! ;; MMX instructions are either of the type reg-reg, or read-modify, and
! ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
! ;; so they behave as "simple" instructions that need no special modelling.
! ;; We only have to model mmxshft and mmxmul.
! (define_insn_reservation "ppro_mmx_shft" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "mmxshft")))
! "decodern,p1")
!
! ;; Load form of mmxshft: tests memory "load" so that it is distinct from
! ;; "ppro_mmx_shft" (memory "none"); reserves the load port p2 with p1.
! (define_insn_reservation "ppro_mmx_shft_load" 2
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "mmxshft")))
! "decoder0,p2+p1")
!
! (define_insn_reservation "ppro_mmx_mul" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (eq_attr "type" "mmxmul")))
! "decodern,p0")
!
! ;; Load form of mmxmul: tests memory "load" so that it is distinct from
! ;; "ppro_mmx_mul" (memory "none"); reserves the load port p2 with p0.
! (define_insn_reservation "ppro_mmx_mul_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "mmxmul")))
! "decoder0,p2+p0")
!
! (define_insn_reservation "ppro_sse_mmxcvt" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "mode" "DI")
! (eq_attr "type" "mmxcvt")))
! "decodern,p1")
!
! ;; FIXME: These are Pentium III only, but we cannot tell here if
! ;; we're generating code for PentiumPro/Pentium II or Pentium III
! ;; (define_insn_reservation "ppro_sse_mmxshft" 2
! ;; (and (eq_attr "cpu" "pentiumpro")
! ;; (and (eq_attr "mode" "DI")
! ;; (eq_attr "type" "mmxshft")))
! ;; "decodern,p0")
!
! ;; SSE is very complicated, and takes a bit more effort.
! ;; ??? I assumed that all SSE instructions decode on decoder0,
! ;; but is this correct?
!
! ;; The sfence instruction.
! (define_insn_reservation "ppro_sse_sfence" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "unknown")
! (eq_attr "type" "sse")))
! "decoder0,p4+p3")
!
! ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
! (define_insn_reservation "ppro_sse_SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "sse")))
! "decodern,p0")
!
! (define_insn_reservation "ppro_sse_add_SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "sseadd"))))
! "decodern,p1")
!
! (define_insn_reservation "ppro_sse_add_SF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "sseadd"))))
! "decoder0,p2+p1")
!
! (define_insn_reservation "ppro_sse_cmp_SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssecmp"))))
! "decoder0,p1")
!
! (define_insn_reservation "ppro_sse_cmp_SF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssecmp"))))
! "decoder0,p2+p1")
!
! (define_insn_reservation "ppro_sse_comi_SF" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssecomi"))))
! "decodern,p0")
!
! (define_insn_reservation "ppro_sse_comi_SF_load" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssecomi"))))
! "decoder0,p2+p0")
!
! (define_insn_reservation "ppro_sse_mul_SF" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssemul"))))
! "decodern,p0")
!
! (define_insn_reservation "ppro_sse_mul_SF_load" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssemul"))))
! "decoder0,p2+p0")
!
! ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
! (define_insn_reservation "ppro_sse_div_SF" 18
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssediv"))))
! "decoder0,p0*17")
!
! ;; Load form of SFmode ssediv: tests memory "load" so that it is distinct
! ;; from "ppro_sse_div_SF" (memory "none"); the first cycle also uses p2.
! (define_insn_reservation "ppro_sse_div_SF_load" 18
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssediv"))))
! "decoder0,(p2+p0),p0*16")
!
! (define_insn_reservation "ppro_sse_icvt_SF" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "sseicvt")))
! "decoder0,(p2+p1)*2")
!
! (define_insn_reservation "ppro_sse_icvt_SI" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "mode" "SI")
! (eq_attr "type" "sseicvt")))
! "decoder0,(p2+p1)")
!
! (define_insn_reservation "ppro_sse_mov_SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,(p0|p1)")
!
! (define_insn_reservation "ppro_sse_mov_SF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,p2+(p0|p1)")
!
! (define_insn_reservation "ppro_sse_mov_SF_store" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (and (eq_attr "mode" "SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,p4+p3")
!
! (define_insn_reservation "ppro_sse_V4SF" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "sse")))
! "decoder0,p1*2")
!
! (define_insn_reservation "ppro_sse_add_V4SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "sseadd"))))
! "decoder0,p1*2")
!
! (define_insn_reservation "ppro_sse_add_V4SF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "sseadd"))))
! "decoder0,(p2+p1)*2")
!
! (define_insn_reservation "ppro_sse_cmp_V4SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssecmp"))))
! "decoder0,p1*2")
!
! (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssecmp"))))
! "decoder0,(p2+p1)*2")
!
! (define_insn_reservation "ppro_sse_cvt_V4SF" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none,unknown")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssecvt"))))
! "decoder0,p1*2")
!
! ;; V4SFmode ssecvt with a memory operand (memory is neither "none" nor
! ;; "unknown"); complements "ppro_sse_cvt_V4SF", so the type is "ssecvt".
! (define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "!none,unknown")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssecvt"))))
! "decoder0,p1,p4+p3")
!
! (define_insn_reservation "ppro_sse_mul_V4SF" 5
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssemul"))))
! "decoder0,p0*2")
!
! (define_insn_reservation "ppro_sse_mul_V4SF_load" 5
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssemul"))))
! "decoder0,(p2+p0)*2")
!
! ;; FIXME: p0 really closed this long???
! (define_insn_reservation "ppro_sse_div_V4SF" 48
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssediv"))))
! "decoder0,p0*34")
!
! ;; Load form of V4SFmode ssediv: tests memory "load" so that it is distinct
! ;; from "ppro_sse_div_V4SF" (memory "none"); first two cycles also use p2.
! (define_insn_reservation "ppro_sse_div_V4SF_load" 48
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssediv"))))
! "decoder0,(p2+p0)*2,p0*32")
!
! (define_insn_reservation "ppro_sse_log_V4SF" 2
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "sselog"))))
! "decodern,p1")
!
! ;; Load form of V4SFmode sselog: tests memory "load" so that it is distinct
! ;; from "ppro_sse_log_V4SF" (memory "none"); reserves p2 together with p1.
! (define_insn_reservation "ppro_sse_log_V4SF_load" 2
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "sselog"))))
! "decoder0,(p2+p1)")
!
! (define_insn_reservation "ppro_sse_mov_V4SF" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,(p0|p1)*2")
!
! (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,p2*2")
!
! (define_insn_reservation "ppro_sse_mov_V4SF_store" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (and (eq_attr "mode" "V4SF")
! (eq_attr "type" "ssemov"))))
! "decoder0,(p4+p3)*2")
!
! ;; All other instructions are modelled as simple instructions.
! ;; We have already modelled all i387 floating point instructions, so all
! ;; other instructions execute on either port 0 or port 1. This includes
! ;; the ALU units, and the MMX units.
! ;;
! ;; reg-reg instructions produce 1 uop so they can be decoded on any of
! ;; the three decoders.
! (define_insn_reservation "ppro_insn" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "none,unknown")
! (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
! "decodern,(p0|p1)")
!
! ;; read-modify and register-memory instructions have 2 or three uops,
! ;; so they have to be decoded on decoder0.
! (define_insn_reservation "ppro_insn_load" 3
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "load")
! (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
! "decoder0,p2+(p0|p1)")
!
! (define_insn_reservation "ppro_insn_store" 1
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "store")
! (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
! "decoder0,(p0|p1),p4+p3")
!
! ;; read-modify-store instructions produce 4 uops so they have to be
! ;; decoded on decoder0 as well.
! (define_insn_reservation "ppro_insn_both" 4
! (and (eq_attr "cpu" "pentiumpro")
! (and (eq_attr "memory" "both")
! (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
! "decoder0,p2+(p0|p1),p4+p3")
!