From: Richard Henderson Date: Sun, 5 May 2002 21:54:39 +0000 (-0700) Subject: alpha.c (alpha_adjust_cost): Remove everything but memory latency adjustments. X-Git-Tag: releases/gcc-3.3.0~5240 X-Git-Url: https://gcc.gnu.org/git/?a=commitdiff_plain;h=98791e3a3f59c66a1699becef28b69888d319136;p=gcc.git alpha.c (alpha_adjust_cost): Remove everything but memory latency adjustments. * config/alpha/alpha.c (alpha_adjust_cost): Remove everything but memory latency adjustments. (alpha_variable_issue): Remove. (alpha_use_dfa_pipeline_interface): New. (alpha_multipass_dfa_lookahead): New. * config/alpha/alpha.md: Remove define_function_unit scheduling; include new dfa scheduling. (attr type): Add none. (blockage): Use it. * config/alpha/ev4.md: New. * config/alpha/ev5.md: New. * config/alpha/ev6.md: New. From-SVN: r53196 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index fd879138c04d..8dfaae729a12 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2002-05-05 Richard Henderson + + * config/alpha/alpha.c (alpha_adjust_cost): Remove everything but + memory latency adjustments. + (alpha_variable_issue): Remove. + (alpha_use_dfa_pipeline_interface): New. + (alpha_multipass_dfa_lookahead): New. + * config/alpha/alpha.md: Remove define_function_unit scheduling; + include new dfa scheduling. + (attr type): Add none. + (blockage): Use it. + * config/alpha/ev4.md: New. + * config/alpha/ev5.md: New. + * config/alpha/ev6.md: New. + 2002-05-05 David S. Miller * recog.c (store_data_bypass_p): Handle CLOBBER inside PARALLEL. diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 915d6e489965..c40636a4bb13 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -157,8 +157,10 @@ static int alpha_adjust_cost PARAMS ((rtx, rtx, rtx, int)); static int alpha_issue_rate PARAMS ((void)); -static int alpha_variable_issue - PARAMS ((FILE *, int, rtx, int)); +static int alpha_use_dfa_pipeline_interface + PARAMS ((void)); +static int alpha_multipass_dfa_lookahead + PARAMS ((void)); #if TARGET_ABI_UNICOSMK static void alpha_init_machine_status @@ -231,8 +233,12 @@ static unsigned int unicosmk_section_type_flags PARAMS ((tree, const char *, #define TARGET_SCHED_ADJUST_COST alpha_adjust_cost #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE alpha_issue_rate -#undef TARGET_SCHED_VARIABLE_ISSUE -#define TARGET_SCHED_VARIABLE_ISSUE alpha_variable_issue +#undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE +#define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE \ + alpha_use_dfa_pipeline_interface +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ + alpha_multipass_dfa_lookahead struct gcc_target targetm = TARGET_INITIALIZER; @@ -4828,9 +4834,8 @@ alpha_adjust_cost (insn, link, dep_insn, cost) /* If the dependence is an anti-dependence, there is no cost. For an output dependence, there is sometimes a cost, but it doesn't seem worth handling those few cases. */ - if (REG_NOTE_KIND (link) != 0) - return 0; + return cost; /* If we can't recognize the insns, we can't really do anything. */ if (recog_memoized (insn) < 0 || recog_memoized (dep_insn) < 0) @@ -4845,122 +4850,13 @@ alpha_adjust_cost (insn, link, dep_insn, cost) || dep_insn_type == TYPE_LDSYM) cost += alpha_memory_latency-1; - switch (alpha_cpu) - { - case PROCESSOR_EV4: - /* On EV4, if INSN is a store insn and DEP_INSN is setting the data - being stored, we can sometimes lower the cost. */ - - if ((insn_type == TYPE_IST || insn_type == TYPE_FST) - && (set = single_set (dep_insn)) != 0 - && GET_CODE (PATTERN (insn)) == SET - && rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn)))) - { - switch (dep_insn_type) - { - case TYPE_ILD: - case TYPE_FLD: - /* No savings here. */ - return cost; - - case TYPE_IMUL: - /* In these cases, we save one cycle. */ - return cost - 1; - - default: - /* In all other cases, we save two cycles. */ - return MAX (0, cost - 2); - } - } + /* Everything else handled in DFA bypasses now. */ - /* Another case that needs adjustment is an arithmetic or logical - operation. It's cost is usually one cycle, but we default it to - two in the MD file. The only case that it is actually two is - for the address in loads, stores, and jumps. */ - - if (dep_insn_type == TYPE_IADD || dep_insn_type == TYPE_ILOG) - { - switch (insn_type) - { - case TYPE_ILD: - case TYPE_IST: - case TYPE_FLD: - case TYPE_FST: - case TYPE_JSR: - return cost; - default: - return 1; - } - } - - /* The final case is when a compare feeds into an integer branch; - the cost is only one cycle in that case. */ - - if (dep_insn_type == TYPE_ICMP && insn_type == TYPE_IBR) - return 1; - break; - - case PROCESSOR_EV5: - /* And the lord DEC saith: "A special bypass provides an effective - latency of 0 cycles for an ICMP or ILOG insn producing the test - operand of an IBR or ICMOV insn." */ - - if ((dep_insn_type == TYPE_ICMP || dep_insn_type == TYPE_ILOG) - && (set = single_set (dep_insn)) != 0) - { - /* A branch only has one input. This must be it. */ - if (insn_type == TYPE_IBR) - return 0; - /* A conditional move has three, make sure it is the test. */ - if (insn_type == TYPE_ICMOV - && GET_CODE (set_src = PATTERN (insn)) == SET - && GET_CODE (set_src = SET_SRC (set_src)) == IF_THEN_ELSE - && rtx_equal_p (SET_DEST (set), XEXP (set_src, 0))) - return 0; - } - - /* "The multiplier is unable to receive data from IEU bypass paths. - The instruction issues at the expected time, but its latency is - increased by the time it takes for the input data to become - available to the multiplier" -- which happens in pipeline stage - six, when results are comitted to the register file. */ - - if (insn_type == TYPE_IMUL) - { - switch (dep_insn_type) - { - /* These insns produce their results in pipeline stage five. */ - case TYPE_ILD: - case TYPE_ICMOV: - case TYPE_IMUL: - case TYPE_MVI: - return cost + 1; - - /* Other integer insns produce results in pipeline stage four. */ - default: - return cost + 2; - } - } - break; - - case PROCESSOR_EV6: - /* There is additional latency to move the result of (most) FP - operations anywhere but the FP register file. */ - - if ((insn_type == TYPE_FST || insn_type == TYPE_FTOI) - && (dep_insn_type == TYPE_FADD || - dep_insn_type == TYPE_FMUL || - dep_insn_type == TYPE_FCMOV)) - return cost + 2; - - break; - } - - /* Otherwise, return the default cost. */ return cost; } -/* Function to initialize the issue rate used by the scheduler. */ +/* The number of instructions that can be issued per cycle. */ + static int alpha_issue_rate () { @@ -4968,18 +4864,24 @@ alpha_issue_rate () } static int -alpha_variable_issue (dump, verbose, insn, cim) - FILE *dump ATTRIBUTE_UNUSED; - int verbose ATTRIBUTE_UNUSED; - rtx insn; - int cim; +alpha_use_dfa_pipeline_interface () { - if (recog_memoized (insn) < 0 || get_attr_type (insn) == TYPE_MULTI) - return 0; - - return cim - 1; + return true; } +/* How many alternative schedules to try. This should be as wide as the + scheduling freedom in the DFA, but no wider. Making this value too + large results extra work for the scheduler. + + For EV4, loads can be issued to either IB0 or IB1, thus we have 2 + alternative schedules. For EV5, we can choose between E0/E1 and + FA/FM. For EV6, an arithmatic insn can be issued to U0/U1/L0/L1. */ + +static int +alpha_multipass_dfa_lookahead () +{ + return (alpha_cpu == PROCESSOR_EV6 ? 4 : 2); +} /* Register global variables and machine-specific functions with the garbage collector. */ diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md index 785a61df01ef..1df715ad0c27 100644 --- a/gcc/config/alpha/alpha.md +++ b/gcc/config/alpha/alpha.md @@ -78,7 +78,7 @@ (define_attr "type" "ild,fld,ldsym,ist,fst,ibr,fbr,jsr,iadd,ilog,shift,icmov,fcmov,icmp,imul,\ -fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi,itof,multi" +fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi,itof,multi,none" (const_string "iadd")) ;; Describe a user's asm statement. @@ -123,313 +123,11 @@ fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi,itof,multi" (define_attr "length" "" (const_int 4)) -;; On EV4 there are two classes of resources to consider: resources needed -;; to issue, and resources needed to execute. IBUS[01] are in the first -;; category. ABOX, BBOX, EBOX, FBOX, IMUL & FDIV make up the second. -;; (There are a few other register-like resources, but ...) - -; First, describe all of the issue constraints with single cycle delays. -; All insns need a bus, but all except loads require one or the other. -(define_function_unit "ev4_ibus0" 1 0 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "fst,fbr,iadd,imul,ilog,shift,icmov,icmp")) - 1 1) - -(define_function_unit "ev4_ibus1" 1 0 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "ist,ibr,jsr,fadd,fcmov,fcpys,fmul,fdiv,misc")) - 1 1) - -; Memory delivers its result in three cycles. Actually return one and -; take care of this in adjust_cost, since we want to handle user-defined -; memory latencies. -(define_function_unit "ev4_abox" 1 0 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "ild,fld,ldsym,ist,fst")) - 1 1) - -; Branches have no delay cost, but do tie up the unit for two cycles. -(define_function_unit "ev4_bbox" 1 1 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "ibr,fbr,jsr")) - 2 2) - -; Arithmetic insns are normally have their results available after -; two cycles. There are a number of exceptions. They are encoded in -; ADJUST_COST. Some of the other insns have similar exceptions. -(define_function_unit "ev4_ebox" 1 0 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "iadd,ilog,shift,icmov,icmp,misc")) - 2 1) - -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev4") - (and (eq_attr "type" "imul") - (eq_attr "opsize" "si"))) - 21 19) - -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev4") - (and (eq_attr "type" "imul") - (eq_attr "opsize" "!si"))) - 23 21) - -(define_function_unit "ev4_fbox" 1 0 - (and (eq_attr "cpu" "ev4") - (eq_attr "type" "fadd,fmul,fcpys,fcmov")) - 6 1) - -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev4") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "si"))) - 34 30) - -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev4") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "di"))) - 63 59) - -;; EV5 scheduling. EV5 can issue 4 insns per clock. -;; -;; EV5 has two asymetric integer units. Model this with E0 & E1 along -;; with the combined resource EBOX. - -(define_function_unit "ev5_ebox" 2 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "!fbr,fcmov,fadd,fmul,fcpys,fdiv")) - 1 1) - -; Memory takes at least 2 clocks. Return one from here and fix up with -; user-defined latencies in adjust_cost. -(define_function_unit "ev5_ebox" 2 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ild,fld,ldsym")) - 1 1) - -; Loads can dual issue with one another, but loads and stores do not mix. -(define_function_unit "ev5_e0" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ild,fld,ldsym")) - 1 1 - [(eq_attr "type" "ist,fst")]) - -; Stores, shifts, multiplies can only issue to E0 -(define_function_unit "ev5_e0" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ist,fst,shift,imul")) - 1 1) - -; Motion video insns also issue only to E0, and take two ticks. -(define_function_unit "ev5_e0" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "mvi")) - 2 1) - -; Conditional moves always take 2 ticks. -(define_function_unit "ev5_ebox" 2 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "icmov")) - 2 1) - -; Branches can only issue to E1 -(define_function_unit "ev5_e1" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ibr,jsr")) - 1 1) - -; Multiplies also use the integer multiplier. -; ??? How to: "No instruction can be issued to pipe E0 exactly two -; cycles before an integer multiplication completes." -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev5") - (and (eq_attr "type" "imul") - (eq_attr "opsize" "si"))) - 8 4) - -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev5") - (and (eq_attr "type" "imul") - (eq_attr "opsize" "di"))) - 12 8) - -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev5") - (and (eq_attr "type" "imul") - (eq_attr "opsize" "udi"))) - 14 8) - -;; Similarly for the FPU we have two asymetric units. But fcpys can issue -;; on either so we have to play the game again. - -(define_function_unit "ev5_fbox" 2 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "fadd,fcmov,fmul,fcpys,fbr,fdiv")) - 4 1) - -(define_function_unit "ev5_fm" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "fmul")) - 4 1) - -; Add and cmov as you would expect; fbr never produces a result; -; fdiv issues through fa to the divider, -(define_function_unit "ev5_fa" 1 0 - (and (eq_attr "cpu" "ev5") - (eq_attr "type" "fadd,fcmov,fbr,fdiv")) - 4 1) - -; ??? How to: "No instruction can be issued to pipe FA exactly five -; cycles before a floating point divide completes." -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev5") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "si"))) - 15 15) ; 15 to 31 data dependent - -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev5") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "di"))) - 22 22) ; 22 to 60 data dependent - -;; EV6 scheduling. EV6 can issue 4 insns per clock. -;; -;; EV6 has two symmetric pairs ("clusters") of two asymetric integer units -;; ("upper" and "lower"), yielding pipe names U0, U1, L0, L1. - -;; Conditional moves decompose into two independent primitives, each -;; taking one cycle. Since ev6 is out-of-order, we can't see anything -;; but two cycles. -(define_function_unit "ev6_ebox" 4 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "icmov")) - 2 1) - -(define_function_unit "ev6_ebox" 4 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "!fbr,fcmov,fadd,fmul,fcpys,fdiv,fsqrt")) - 1 1) - -;; Integer loads take at least 3 clocks, and only issue to lower units. -;; Return one from here and fix up with user-defined latencies in adjust_cost. -(define_function_unit "ev6_l" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "ild,ldsym,ist,fst")) - 1 1) - -;; FP loads take at least 4 clocks. Return two from here... -(define_function_unit "ev6_l" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "fld")) - 2 1) - -;; Motion video insns also issue only to U0, and take three ticks. -(define_function_unit "ev6_u0" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "mvi")) - 3 1) - -(define_function_unit "ev6_u" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "mvi")) - 3 1) - -;; Shifts issue to either upper pipe. -(define_function_unit "ev6_u" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "shift")) - 1 1) - -;; Multiplies issue only to U1, and all take 7 ticks. -;; Rather than create a new function unit just for U1, reuse IMUL -(define_function_unit "imul" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "imul")) - 7 1) - -(define_function_unit "ev6_u" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "imul")) - 7 1) - -;; Branches issue to either upper pipe -(define_function_unit "ev6_u" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "ibr")) - 3 1) - -;; Calls only issue to L0. -(define_function_unit "ev6_l0" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "jsr")) - 1 1) - -(define_function_unit "ev6_l" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "jsr")) - 1 1) - -;; Ftoi/itof only issue to lower pipes -(define_function_unit "ev6_l" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "ftoi")) - 3 1) - -(define_function_unit "ev6_l" 2 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "itof")) - 4 1) - -;; For the FPU we are very similar to EV5, except there's no insn that -;; can issue to fm & fa, so we get to leave that out. - -(define_function_unit "ev6_fm" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "fmul")) - 4 1) - -(define_function_unit "ev6_fa" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "fadd,fcpys,fbr,fdiv,fsqrt")) - 4 1) - -(define_function_unit "ev6_fa" 1 0 - (and (eq_attr "cpu" "ev6") - (eq_attr "type" "fcmov")) - 8 1) - -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev6") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "si"))) - 12 10) - -(define_function_unit "fdiv" 1 0 - (and (eq_attr "cpu" "ev6") - (and (eq_attr "type" "fdiv") - (eq_attr "opsize" "di"))) - 15 13) - -(define_function_unit "fsqrt" 1 0 - (and (eq_attr "cpu" "ev6") - (and (eq_attr "type" "fsqrt") - (eq_attr "opsize" "si"))) - 16 14) - -(define_function_unit "fsqrt" 1 0 - (and (eq_attr "cpu" "ev6") - (and (eq_attr "type" "fsqrt") - (eq_attr "opsize" "di"))) - 32 30) - -; ??? The FPU communicates with memory and the integer register file -; via two fp store units. We need a slot in the fst immediately, and -; a slot in LOW after the operand data is ready. At which point the -; data may be moved either to the store queue or the integer register -; file and the insn retired. - +;; Include scheduling descriptions. + +(include "ev4.md") +(include "ev5.md") +(include "ev6.md") ;; First define the arithmetic insns. Note that the 32-bit forms also ;; sign-extend. @@ -5018,7 +4716,8 @@ fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi,itof,multi" [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)] "" "" - [(set_attr "length" "0")]) + [(set_attr "length" "0") + (set_attr "type" "none")]) (define_insn "jump" [(set (pc) diff --git a/gcc/config/alpha/ev4.md b/gcc/config/alpha/ev4.md new file mode 100644 index 000000000000..6816e44640ba --- /dev/null +++ b/gcc/config/alpha/ev4.md @@ -0,0 +1,142 @@ +;; Scheduling description for Alpha EV4. +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +; On EV4 there are two classes of resources to consider: resources needed +; to issue, and resources needed to execute. IBUS[01] are in the first +; category. ABOX, BBOX, EBOX, FBOX, IMUL & FDIV make up the second. +; (There are a few other register-like resources, but ...) + +(define_automaton "ev4_0,ev4_1,ev4_2") +(define_cpu_unit "ev4_ib0,ev4_ib1,ev4_abox,ev4_bbox" "ev4_0") +(define_cpu_unit "ev4_ebox,ev4_imul" "ev4_1") +(define_cpu_unit "ev4_fbox,ev4_fdiv" "ev4_2") +(define_reservation "ev4_ib01" "ev4_ib0|ev4_ib1") + +; Assume type "multi" single issues. +(define_insn_reservation "ev4_multi" 1 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "multi")) + "ev4_ib0+ev4_ib1") + +; Loads from L0 completes in three cycles. adjust_cost still factors +; in user-specified memory latency, so return 1 here. +(define_insn_reservation "ev4_ld" 1 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "ild,fld,ldsym")) + "ev4_ib01+ev4_abox") + +; Stores can issue before the data (but not address) is ready. +(define_insn_reservation "ev4_ist" 1 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "ist")) + "ev4_ib1+ev4_abox") + +(define_insn_reservation "ev4_fst" 1 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "fst")) + "ev4_ib0+ev4_abox") + +; Branches have no delay cost, but do tie up the unit for two cycles. +(define_insn_reservation "ev4_ibr" 2 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "ibr,jsr")) + "ev4_ib1+ev4_bbox,ev4_bbox") + +(define_insn_reservation "ev4_fbr" 2 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "fbr")) + "ev4_ib0+ev4_bbox,ev4_bbox") + +; Arithmetic insns are normally have their results available after +; two cycles. There are a number of exceptions. + +(define_insn_reservation "ev4_iaddlog" 2 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "iadd,ilog")) + "ev4_ib0+ev4_ebox") + +(define_bypass 1 + "ev4_iaddlog" + "ev4_ibr,ev4_iaddlog,ev4_shiftcm,ev4_icmp,ev4_imulsi,ev4_imuldi") + +(define_insn_reservation "ev4_shiftcm" 2 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "shift,icmov")) + "ev4_ib0+ev4_ebox") + +(define_insn_reservation "ev4_icmp" 2 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "icmp")) + "ev4_ib0+ev4_ebox") + +(define_bypass 1 "ev4_icmp" "ev4_ibr") + +(define_bypass 0 + "ev4_iaddlog,ev4_shiftcm,ev4_icmp" + "ev4_ist" + "store_data_bypass_p") + +; Multiplies use a non-piplined imul unit. Also, "no [ebox] insn can +; be issued exactly three cycles before an integer multiply completes". + +(define_insn_reservation "ev4_imulsi" 21 + (and (eq_attr "cpu" "ev4") + (and (eq_attr "type" "imul") + (eq_attr "opsize" "si"))) + "ev4_ib0+ev4_imul,ev4_imul*18,ev4_ebox") + +(define_bypass 20 "ev4_imulsi" "ev4_ist" "store_data_bypass_p") + +(define_insn_reservation "ev4_imuldi" 23 + (and (eq_attr "cpu" "ev4") + (and (eq_attr "type" "imul") + (eq_attr "opsize" "!si"))) + "ev4_ib0+ev4_imul,ev4_imul*20,ev4_ebox") + +(define_bypass 22 "ev4_imuldi" "ev4_ist" "store_data_bypass_p") + +; Most FP insns have a 6 cycle latency, but with a 4 cycle bypass back in. +(define_insn_reservation "ev4_fpop" 6 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "fadd,fmul,fcpys,fcmov")) + "ev4_ib1+ev4_fbox") + +(define_bypass 4 "ev4_fpop" "ev4_fpop") + +; The floating point divider is not pipelined. Also, "no FPOP insn can be +; issued exactly five or exactly six cycles before an fdiv insn completes". + +(define_insn_reservation "ev4_fdivsf" 34 + (and (eq_attr "cpu" "ev4") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "si"))) + "ev4_ib1+ev4_fdiv,ev4_fdiv*28,ev4_fdiv+ev4_fbox,ev4_fbox") + +(define_insn_reservation "ev4_fdivdf" 63 + (and (eq_attr "cpu" "ev4") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "di"))) + "ev4_ib1+ev4_fdiv,ev4_fdiv*57,ev4_fdiv+ev4_fbox,ev4_fbox") + +; Traps don't consume or produce data. +(define_insn_reservation "ev4_misc" 1 + (and (eq_attr "cpu" "ev4") + (eq_attr "type" "misc")) + "ev4_ib1") diff --git a/gcc/config/alpha/ev5.md b/gcc/config/alpha/ev5.md new file mode 100644 index 000000000000..f0dfbdf22e6f --- /dev/null +++ b/gcc/config/alpha/ev5.md @@ -0,0 +1,180 @@ +;; Scheduling description for Alpha EV5. +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +;; EV5 has two asymetric integer units, E0 and E1, plus separate +;; FP add and multiply units. + +(define_automaton "ev5_0,ev5_1") +(define_cpu_unit "ev5_e0,ev5_e1,ev5_fa,ev5_fm" "ev5_0") +(define_reservation "ev5_e01" "ev5_e0|ev5_e1") +(define_reservation "ev5_fam" "ev5_fa|ev5_fm") +(define_cpu_unit "ev5_imul" "ev5_0") +(define_cpu_unit "ev5_fdiv" "ev5_1") + +; Assume type "multi" single issues. +(define_insn_reservation "ev5_multi" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "multi")) + "ev5_e0+ev5_e1+ev5_fa+ev5_fm") + +; Stores can only issue to E0, and may not issue with loads. +; Model this with some fake units. + +(define_cpu_unit "ev5_l0,ev5_l1,ev5_st" "ev5_0") +(define_reservation "ev5_ld" "ev5_l0|ev5_l1") +(exclusion_set "ev5_l0,ev5_l1" "ev5_st") + +(define_insn_reservation "ev5_st" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "ist,fst")) + "ev5_e0+ev5_st") + +; Loads from L0 complete in two cycles. adjust_cost still factors +; in user-specified memory latency, so return 1 here. +(define_insn_reservation "ev5_ld" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "ild,fld,ldsym")) + "ev5_e01+ev5_ld") + +; Integer branches slot only to E1. +(define_insn_reservation "ev5_ibr" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "ibr")) + "ev5_e1") + +(define_insn_reservation "ev5_jsr" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "jsr")) + "ev5_e1") + +(define_insn_reservation "ev5_shiftmvi" 2 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "shift,mvi")) + "ev5_e0") + +(define_insn_reservation "ev5_cmov" 2 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "icmov")) + "ev5_e01") + +(define_insn_reservation "ev5_iadd" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "iadd")) + "ev5_e01") + +(define_insn_reservation "ev5_ilogcmp" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "ilog,icmp")) + "ev5_e01") + +; Conditional move and branch can issue the same cycle as the test. +(define_bypass 0 "ev5_ilogcmp" "ev5_ibr,ev5_cmov" "if_test_bypass_p") + +; Multiplies use a non-piplined imul unit. Also, "no insn can be issued +; to E0 exactly two cycles before an integer multiply completes". + +(define_insn_reservation "ev5_imull" 8 + (and (eq_attr "cpu" "ev5") + (and (eq_attr "type" "imul") + (eq_attr "opsize" "si"))) + "ev5_e0+ev5_imul,ev5_imul*3,nothing,ev5_e0") + +(define_insn_reservation "ev5_imulq" 12 + (and (eq_attr "cpu" "ev5") + (and (eq_attr "type" "imul") + (eq_attr "opsize" "di"))) + "ev5_e0+ev5_imul,ev5_imul*7,nothing,ev5_e0") + +(define_insn_reservation "ev5_imulh" 14 + (and (eq_attr "cpu" "ev5") + (and (eq_attr "type" "imul") + (eq_attr "opsize" "udi"))) + "ev5_e0+ev5_imul,ev5_imul*7,nothing*3,ev5_e0") + +; The multiplier is unable to receive data from Ebox bypass paths. The +; instruction issues at the expected time, but its latency is increased +; by the time it takes for the input data to become available to the +; multiplier. For example, an IMULL instruction issued one cycle later +; than an ADDL instruction, which produced one of its operands, has a +; latency of 10 (8 + 2). If the IMULL instruction is issued two cycles +; later than the ADDL instruction, the latency is 9 (8 + 1). +; +; Model this instead with increased latency on the input instruction. + +(define_bypass 3 + "ev5_ld,ev5_shiftmvi,ev5_cmov,ev5_iadd,ev5_ilogcmp" + "ev5_imull,ev5_imulq,ev5_imulh") + +(define_bypass 9 "ev5_imull" "ev5_imull,ev5_imulq,ev5_imulh") +(define_bypass 13 "ev5_imulq" "ev5_imull,ev5_imulq,ev5_imulh") +(define_bypass 15 "ev5_imulh" "ev5_imull,ev5_imulq,ev5_imulh") + +; Similarly for the FPU we have two asymetric units. + +(define_insn_reservation "ev5_fadd" 4 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "fadd,fcmov")) + "ev5_fa") + +(define_insn_reservation "ev5_fbr" 1 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "fbr")) + "ev5_fa") + +(define_insn_reservation "ev5_fcpys" 4 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "fcpys")) + "ev5_fam") + +(define_insn_reservation "ev5_fmul" 4 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "fmul")) + "ev5_fm") + +; The floating point divider is not pipelined. Also, "no insn can be issued +; to FA exactly five before an fdiv insn completes". +; +; ??? Do not model this late reservation due to the enormously increased +; size of the resulting DFA. +; +; ??? Putting ev5_fa and ev5_fdiv alone into the same automata produces +; a DFA of acceptable size, but putting ev5_fm and ev5_fa into separate +; automata produces incorrect results for insns that can choose one or +; the other, i.e. ev5_fcpys. + +(define_insn_reservation "ev5_fdivsf" 15 + (and (eq_attr "cpu" "ev5") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "si"))) + ; "ev5_fa+ev5_fdiv,ev5_fdiv*9,ev5_fa+ev5_fdiv,ev5_fdiv*4" + "ev5_fa+ev5_fdiv,ev5_fdiv*14") + +(define_insn_reservation "ev5_fdivdf" 22 + (and (eq_attr "cpu" "ev5") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "di"))) + ; "ev5_fa+ev5_fdiv,ev5_fdiv*17,ev5_fa+ev5_fdiv,ev5_fdiv*4" + "ev5_fa+ev5_fdiv,ev5_fdiv*21") + +; Traps don't consume or produce data; rpcc is latency 2 if we ever add it. +(define_insn_reservation "ev5_misc" 2 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "misc")) + "ev5_e0") diff --git a/gcc/config/alpha/ev6.md b/gcc/config/alpha/ev6.md new file mode 100644 index 000000000000..78bb51f23f2d --- /dev/null +++ b/gcc/config/alpha/ev6.md @@ -0,0 +1,173 @@ +;; Scheduling description for Alpha EV6. +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +; EV6 can issue 4 insns per clock. It's out-of-order, so this isn't +; expected to help over-much, but a precise description can be important +; for software pipelining. +; +; EV6 has two symmetric pairs ("clusters") of two asymetric integer +; units ("upper" and "lower"), yielding pipe names U0, U1, L0, L1. +; +; ??? The clusters have independent register files that are re-synced +; every cycle. Thus there is one additional cycle of latency between +; insns issued on different clusters. Possibly model that by duplicating +; all EBOX insn_reservations that can issue to either cluster, increasing +; all latencies by one, and adding bypasses within the cluster. +; +; ??? In addition, instruction order affects cluster issue. + +(define_automaton "ev6_0,ev6_1") +(define_cpu_unit "ev6_u0,ev6_u1,ev6_l0,ev6_l1" "ev6_0") +(define_reservation "ev6_u" "ev6_u0|ev6_u1") +(define_reservation "ev6_l" "ev6_l0|ev6_l1") +(define_reservation "ev6_ebox" "ev6_u|ev6_l") + +(define_cpu_unit "ev6_fa" "ev6_1") +(define_cpu_unit "ev6_fm,ev6_fst0,ev6_fst1" "ev6_0") +(define_reservation "ev6_fst" "ev6_fst0|ev6_fst1") + +; Assume type "multi" single issues. +(define_insn_reservation "ev6_multi" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "multi")) + "ev6_u0+ev6_u1+ev6_l0+ev6_l1+ev6_fa+ev6_fm+ev6_fst0+ev6_fst1") + +; Integer loads take at least 3 clocks, and only issue to lower units. +; adjust_cost still factors in user-specified memory latency, so return 1 here. +(define_insn_reservation "ev6_ild" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "ild,ldsym")) + "ev6_l") + +(define_insn_reservation "ev6_ist" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "ist")) + "ev6_l") + +; FP loads take at least 4 clocks. adjust_cost still factors +; in user-specified memory latency, so return 2 here. +(define_insn_reservation "ev6_fld" 2 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "fld")) + "ev6_l") + +; The FPU communicates with memory and the integer register file +; via two fp store units. We need a slot in the fst immediately, and +; a slot in LOW after the operand data is ready. At which point the +; data may be moved either to the store queue or the integer register +; file and the insn retired. + +(define_insn_reservation "ev6_fst" 3 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "fst")) + "ev6_fst,nothing,ev6_l") + +; Arithmetic goes anywhere. +(define_insn_reservation "ev6_arith" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "iadd,ilog,icmp")) + "ev6_ebox") + +; Motion video insns also issue only to U0, and take three ticks. +(define_insn_reservation "ev6_mvi" 3 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "mvi")) + "ev6_u0") + +; Shifts issue to upper units. +(define_insn_reservation "ev6_shift" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "shift")) + "ev6_u") + +; Multiplies issue only to U1, and all take 7 ticks. +(define_insn_reservation "ev6_imul" 7 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "imul")) + "ev6_u1") + +; Conditional moves decompose into two independent primitives, each taking +; one cycle. Since ev6 is out-of-order, we can't see anything but two cycles. +(define_insn_reservation "ev6_icmov" 2 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "icmov")) + "ev6_ebox,ev6_ebox") + +; Integer branches issue to upper units +(define_insn_reservation "ev6_ibr" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "ibr")) + "ev6_u") + +; Calls only issue to L0. +(define_insn_reservation "ev6_jsr" 1 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "jsr")) + "ev6_l0") + +; Ftoi/itof only issue to lower pipes. +(define_insn_reservation "ev6_itof" 3 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "itof")) + "ev6_l") + +(define_insn_reservation "ev6_ftoi" 3 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "ftoi")) + "ev6_fst,nothing,ev6_l") + +(define_insn_reservation "ev6_fmul" 4 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "fmul")) + "ev6_fm") + +(define_insn_reservation "ev6_fadd" 4 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "fadd,fcpys,fbr")) + "ev6_fa") + +(define_insn_reservation "ev6_fcmov" 8 + (and (eq_attr "cpu" "ev6") + (eq_attr "type" "fcmov")) + "ev6_fa,nothing*3,ev6_fa") + +(define_insn_reservation "ev6_fdivsf" 12 + (and (eq_attr "cpu" "ev6") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "si"))) + "ev6_fa*9") + +(define_insn_reservation "ev6_fdivdf" 15 + (and (eq_attr "cpu" "ev6") + (and (eq_attr "type" "fdiv") + (eq_attr "opsize" "di"))) + "ev6_fa*12") + +(define_insn_reservation "ev6_sqrtsf" 18 + (and (eq_attr "cpu" "ev6") + (and (eq_attr "type" "fsqrt") + (eq_attr "opsize" "si"))) + "ev6_fa*15") + +(define_insn_reservation "ev6_sqrtdf" 33 + (and (eq_attr "cpu" "ev6") + (and (eq_attr "type" "fsqrt") + (eq_attr "opsize" "di"))) + "ev6_fa*30")