diff -uNr tmp/gcc-4.6-svn-20101116/gcc/ChangeLog gcc-4.6-svn-20101116/gcc/ChangeLog --- tmp/gcc-4.6-svn-20101116/gcc/ChangeLog 2010-11-23 13:43:41.725061000 +0800 +++ gcc-4.6-svn-20101116/gcc/ChangeLog 2010-11-23 14:40:20.796047000 +0800 @@ -1,3 +1,24 @@ +2010-11-22 Toolchain + + Add Faraday CPU support - FA526/FA626/FA606TE/FA626TE/FMP626/FA726TE. + Modify files: + * config/arm/arm-cores.def + * config/arm/arm-tune.md + * config/arm/arm.c + * config/arm/arm.md + * config/arm/bpabi.h + * config/arm/t-arm + * config/arm/t-arm-elf + * config/arm/t-linux-eabi + * doc/invoke.texi + New added files: + * config/arm/fa526.md + * config/arm/fa626.md + * config/arm/fa606te.md + * config/arm/fa626te.md + * config/arm/fmp626.md + * config/arm/fa726te.md + 2010-11-13 Paolo Bonzini PR c/46462 diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def 2010-11-23 13:43:46.812579000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/arm-cores.def 2010-11-23 13:57:43.556323000 +0800 @@ -74,6 +74,8 @@ ARM_CORE("strongarm110", strongarm110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) ARM_CORE("strongarm1100", strongarm1100, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) ARM_CORE("strongarm1110", strongarm1110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) +ARM_CORE("fa526", fa526, 4, FL_LDSCHED, fastmul) +ARM_CORE("fa626", fa626, 4, FL_LDSCHED, fastmul) /* V4T Architecture Processors */ ARM_CORE("arm7tdmi", arm7tdmi, 4T, FL_CO_PROC , fastmul) @@ -104,6 +106,10 @@ ARM_CORE("xscale", xscale, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale) ARM_CORE("iwmmxt", iwmmxt, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale) ARM_CORE("iwmmxt2", iwmmxt2, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale) +ARM_CORE("fa606te", fa606te, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fa626te", fa626te, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fmp626", fmp626, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fa726te", fa726te, 5TE, FL_LDSCHED, fa726te) /* V5TEJ Architecture Processors */ ARM_CORE("arm926ej-s", arm926ejs, 5TEJ, FL_LDSCHED, 9e) diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md 2010-11-23 13:43:46.829584000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/arm-tune.md 2010-11-23 13:57:43.559341000 +0800 @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from arm-cores.def (define_attr "tune" - "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0" + "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c gcc-4.6-svn-20101116/gcc/config/arm/arm.c --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.c 2010-11-23 13:43:46.895582000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/arm.c 2010-11-23 13:57:43.649325000 +0800 @@ -128,6 +128,7 @@ static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT); static int arm_comp_type_attributes (const_tree, const_tree); static void arm_set_default_type_attributes (tree); +static int arm_sched_variable_issue (FILE *, int, rtx, int); static int arm_adjust_cost (rtx, rtx, rtx, int); static int count_insns_for_constant (HOST_WIDE_INT, int); static int arm_get_strip_length (int); @@ -239,6 +240,7 @@ static rtx arm_pic_static_addr (rtx orig, rtx reg); static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *); static enum machine_mode arm_preferred_simd_mode (enum machine_mode); static bool arm_class_likely_spilled_p (reg_class_t); static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); @@ -350,6 +352,9 @@ #undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes +#undef TARGET_SCHED_VARIABLE_ISSUE +#define TARGET_SCHED_VARIABLE_ISSUE arm_sched_variable_issue + #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST arm_adjust_cost @@ -862,6 +867,13 @@ 1 }; +const struct tune_params arm_fa726te_tune = +{ + arm_9e_rtx_costs, + fa726te_sched_adjust_cost, + 1 +}; + /* Not all of these give usefully different compilation alternatives, but there is no simple way of generalizing them. */ @@ -7913,6 +7925,56 @@ return true; } +/* Adjust cost hook for FA726TE. */ +static bool +fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + /* For FA726TE, true dependency on CPSR (i.e. set cond followed by predicated) + have penalty of 3 */ + if (REG_NOTE_KIND (link) == REG_DEP_TRUE + && recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0 + && get_attr_conds (dep) == CONDS_SET) + { + /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency */ + if (get_attr_conds(insn) == CONDS_USE && + get_attr_type(insn) != TYPE_BRANCH) + { + *cost = 3; + return false; + } + + if (GET_CODE (PATTERN (insn)) == COND_EXEC + || get_attr_conds(insn) == CONDS_USE) + { + *cost = 0; + return false; + } + } + + return true; +} + +/* Determine how many instructions can we issue. Fixup the issue that some + UNSPECs get scheduled. */ +static int +arm_sched_variable_issue (FILE *f ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, rtx insn, int more) +{ + if (arm_tune == fa726te + && recog_memoized (insn) >= 0 /* insn recognizable? */ + && (get_attr_type (insn) == TYPE_ALU + || get_attr_type (insn) == TYPE_ALU_SHIFT + || get_attr_type (insn) == TYPE_ALU_SHIFT_REG)) + { + return more; + } + else + { + return more-1; + } +} + /* This function implements the target macro TARGET_SCHED_ADJUST_COST. It corrects the value of COST based on the relationship between INSN and DEP through the dependence LINK. It returns the new @@ -22722,6 +22784,7 @@ case cortexa5: case cortexa8: case cortexa9: + case fa726te: return 2; default: diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md gcc-4.6-svn-20101116/gcc/config/arm/arm.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/arm.md 2010-11-23 13:43:46.939579000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/arm.md 2010-11-23 13:57:43.697327000 +0800 @@ -498,7 +498,7 @@ (define_attr "generic_sched" "yes,no" (const (if_then_else - (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4") + (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4") (eq_attr "tune_cortexr4" "yes")) (const_string "no") (const_string "yes")))) @@ -516,6 +516,11 @@ (include "arm1020e.md") (include "arm1026ejs.md") (include "arm1136jfs.md") +(include "fa526.md") +(include "fa606te.md") +(include "fa626te.md") +(include "fmp626.md") +(include "fa726te.md") (include "cortex-a5.md") (include "cortex-a8.md") (include "cortex-a9.md") diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h 2010-11-23 13:43:46.998580000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/bpabi.h 2010-11-23 13:57:43.702332000 +0800 @@ -52,7 +52,7 @@ /* The BPABI integer comparison routines return { -1, 0, 1 }. */ #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI -#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}" +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4|mcpu=fa5*|mcpu=fa626:--fix-v4bx}" #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}" diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md gcc-4.6-svn-20101116/gcc/config/arm/fa526.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa526.md 1970-01-01 08:00:00.000000000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/fa526.md 2010-11-23 14:36:17.916371000 +0800 @@ -0,0 +1,161 @@ +;; Faraday FA526 Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; +;; Modeled pipeline characteristics: +;; LD -> any use: latency = 3 (2 cycle penalty) +;; ALU -> any use: latency = 2 (1 cycle penalty) + +;; This automaton provides a pipeline description for the Faraday +;; FA526 core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa526") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; S E M W + +(define_cpu_unit "fa526_core" "fa526") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "526_alu_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "alu")) + "fa526_core") + +(define_insn_reservation "526_alu_shift_op" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fa526_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "526_mult1" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "insn" "smlalxy,smulxy,smlaxy,smlalxy")) + "fa526_core") + +(define_insn_reservation "526_mult2" 5 + (and (eq_attr "tune" "fa526") + (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\ + umlals,smulls,smlals,smlawx")) + "fa526_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "526_load1_op" 3 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load1,load_byte")) + "fa526_core") + +(define_insn_reservation "526_load2_op" 4 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load2")) + "fa526_core*2") + +(define_insn_reservation "526_load3_op" 5 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load3")) + "fa526_core*3") + +(define_insn_reservation "526_load4_op" 6 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load4")) + "fa526_core*4") + +(define_insn_reservation "526_store1_op" 0 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store1")) + "fa526_core") + +(define_insn_reservation "526_store2_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store2")) + "fa526_core*2") + +(define_insn_reservation "526_store3_op" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store3")) + "fa526_core*3") + +(define_insn_reservation "526_store4_op" 3 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store4")) + "fa526_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "526_branch_op" 0 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "branch")) + "fa526_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. For most cases, the return value is set +;; by a mov instruction, which has 1 cycle latency. +(define_insn_reservation "526_call_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "call")) + "fa526_core") + diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md 1970-01-01 08:00:00.000000000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/fa606te.md 2010-11-23 14:36:35.789647000 +0800 @@ -0,0 +1,172 @@ +;; Faraday FA606TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; + +;; Modeled pipeline characteristics: +;; LD -> any use: latency = 2 (1 cycle penalty) +;; ALU -> any use: latency = 1 (0 cycle penalty) + +;; This automaton provides a pipeline description for the Faraday +;; FA606TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa606te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; E M W + +(define_cpu_unit "fa606te_core" "fa606te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "606te_alu_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "alu,alu_shift,alu_shift_reg")) + "fa606te_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "606te_mult1" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "smlalxy")) + "fa606te_core") + +(define_insn_reservation "606te_mult2" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy")) + "fa606te_core*2") + +(define_insn_reservation "606te_mult3" 4 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "mul,mla,muls,mlas")) + "fa606te_core*3") + +(define_insn_reservation "606te_mult4" 5 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals")) + "fa606te_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "606te_load1_op" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load1,load_byte")) + "fa606te_core") + +(define_insn_reservation "606te_load2_op" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load2")) + "fa606te_core*2") + +(define_insn_reservation "606te_load3_op" 4 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load3")) + "fa606te_core*3") + +(define_insn_reservation "606te_load4_op" 5 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load4")) + "fa606te_core*4") + +(define_insn_reservation "606te_store1_op" 0 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store1")) + "fa606te_core") + +(define_insn_reservation "606te_store2_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store2")) + "fa606te_core*2") + +(define_insn_reservation "606te_store3_op" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store3")) + "fa606te_core*3") + +(define_insn_reservation "606te_store4_op" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store4")) + "fa606te_core*4") + + +;;(define_insn_reservation "606te_ldm_op" 9 +;; (and (eq_attr "tune" "fa606te") +;; (eq_attr "type" "load2,load3,load4,store2,store3,store4")) +;; "fa606te_core*7") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycles to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "606te_branch_op" 0 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "branch")) + "fa606te_core") + +;; The latency for a call is actually the latency when the result being available. +;; i.e. R0 ready for int return value. For most cases, the return value is set by a +;; mov instruction, which has 1 cycle latency +(define_insn_reservation "606te_call_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "call")) + "fa606te_core") + diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md 1970-01-01 08:00:00.000000000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/fa626te.md 2010-11-23 14:35:59.928102000 +0800 @@ -0,0 +1,166 @@ +;; Faraday FA626TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; + +;; Modeled pipeline characteristics: +;; ALU -> simple address LDR/STR: latency=2 (available after 2 cycles) +;; ALU -> shifted address LDR/STR: latency=3 +;; ( extra 1 cycle unavoidable stall) +;; ALU -> other use: latency=2 (available after 2 cycles) +;; LD -> simple address LDR/STR: latency=3 (available after 3 cycles) +;; LD -> shifted address LDR/STR: latency=4 +;; ( extra 1 cycle unavoidable stall) +;; LD -> any other use: latency = 3 (available after 3 cycles) + +;; This automaton provides a pipeline description for the Faraday +;; FA626TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa626te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; S E M W + +(define_cpu_unit "fa626te_core" "fa626te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "626te_alu_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "alu")) + "fa626te_core") + +(define_insn_reservation "626te_alu_shift_op" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fa626te_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "626te_mult1" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy")) + "fa626te_core") + +(define_insn_reservation "626te_mult2" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "mul,mla")) + "fa626te_core") + +(define_insn_reservation "626te_mult3" 3 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx")) + "fa626te_core*2") + +(define_insn_reservation "626te_mult4" 4 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "smulls,smlals,umulls,umlals")) + "fa626te_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "626te_load1_op" 3 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load1,load_byte")) + "fa626te_core") + +(define_insn_reservation "626te_load2_op" 4 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load2,load3")) + "fa626te_core*2") + +(define_insn_reservation "626te_load3_op" 5 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load4")) + "fa626te_core*3") + +(define_insn_reservation "626te_store1_op" 0 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store1")) + "fa626te_core") + +(define_insn_reservation "626te_store2_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store2,store3")) + "fa626te_core*2") + +(define_insn_reservation "626te_store3_op" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store4")) + "fa626te_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "626te_branch_op" 0 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "branch")) + "fa626te_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. +(define_insn_reservation "626te_call_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "call")) + "fa626te_core") + diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md 1970-01-01 08:00:00.000000000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/fa726te.md 2010-11-23 14:37:04.268859000 +0800 @@ -0,0 +1,221 @@ +;; Faraday FA726TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; + +;; This automaton provides a pipeline description for the Faraday +;; FA726TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa726te") +(automata_option "ndfa") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; E1 E2 E3 E4 E5 WB +;;______________________________________________________ +;; +;; <-------------- LD/ST -----------> +;; shifter + LU <-- AU --> +;; <-- AU --> shifter + LU CPSR (Pipe 0) +;;______________________________________________________ +;; +;; <---------- MUL ---------> +;; shifter + LU <-- AU --> +;; <-- AU --> shifter + LU CPSR (Pipe 1) + + +(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te") +(define_cpu_unit "fa726te_mac_pipe" "fa726te") +(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te") +;; pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly +;; improve code quality +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te") +(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te") + +(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)") +;; reservation which blocks IS +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require three cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; Move instructions. +(define_insn_reservation "726te_shift_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "insn" "mov,mvn")) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +;; ALU operations with no shifted operand will finished in 1 cycle +;; Other ALU instructions 2 cycles +(define_insn_reservation "726te_alu_op" 1 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +;; ALU operations with a shift-by-register operand +;; These really stall in the decoder, in order to read +;; the shift value in a second cycle. Pretend we take two cycles in +;; the execute stage. +;; If shift+LU, it takes 2 cycles. If shift+AU, it takes 3 cycles. +(define_insn_reservation "726te_alu_shift_op" 3 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu_shift") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +(define_insn_reservation "726te_alu_shift_reg_op" 3 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu_shift_reg") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the execute stage until the +;; instruction has been passed through the multiplier array enough +;; times. Multiply operations occur in both the execute and memory +;; stages of the pipeline + +(define_insn_reservation "726te_mult_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\ + umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy")) + "fa726te_issue+fa726te_mac_pipe") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +;; Loads with a shifted offset take 3 cycles, and are (a) probably the +;; most common and (b) the pessimistic assumption will lead to fewer stalls. + +;; Scalar loads are pipelined in FA726TE LSU pipe. +;; Here we model the resource conflict between Load@E3-stage & Store@W-stage +;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the +;; same "bundle", the 2nd load will introudce another ISSUE stall but is still +;; ok to execute (and may be benefical sometimes) + +(define_insn_reservation "726te_load1_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load1,load_byte")) + "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\ + | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)") + +(define_insn_reservation "726te_store1_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store1")) + "fa726te_blockage*2") + +;; Load/Store Multiple blocks all pipelines in EX stages until WB +;; No other instructions can be issued together. +;; Since they essentially prevent all scheduling opportunities, we model them +;; together here. + +;; If LDM is breaking into multiple load instructions, later instruction in +;; pipe 1 is stalled +(define_insn_reservation "726te_ldm2_op" 4 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load2,load3")) + "fa726te_blockage*4") + +(define_insn_reservation "726te_ldm3_op" 5 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load4")) + "fa726te_blockage*5") + +(define_insn_reservation "726te_stm2_op" 2 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store2,store3")) + "fa726te_blockage*3") + +(define_insn_reservation "726te_stm3_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store4")) + "fa726te_blockage*4") + +(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\ + 726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep") +(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\ + 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op" + "arm_no_early_store_addr_dep") +(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op" + "726te_shift_op,726te_alu_op") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" + "726te_alu_shift_op" "arm_no_early_alu_shift_dep") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" + "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep") +(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op") + +(define_bypass 4 "726te_load1_op" "726te_mult_op") +(define_bypass 5 "726te_ldm2_op" "726te_mult_op") +(define_bypass 6 "726te_ldm3_op" "726te_mult_op") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "726te_branch_op" 0 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "branch")) + "fa726te_blockage") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 is ready for int return value. +(define_insn_reservation "726te_call_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "call")) + "fa726te_blockage") + diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md 1970-01-01 08:00:00.000000000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/fmp626.md 2010-11-23 14:36:50.883213000 +0800 @@ -0,0 +1,183 @@ +;; Faraday FA626TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; + +;; Pipeline architecture +;; S E M W(Q1) Q2 +;; ___________________________________________ +;; shifter alu +;; mul1 mul2 mul3 +;; ld/st1 ld/st2 ld/st3 ld/st4 ld/st5 + +;; This automaton provides a pipeline description for the Faraday +;; FMP626 core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fmp626") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +(define_cpu_unit "fmp626_core" "fmp626") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "mp626_alu_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "alu")) + "fmp626_core") + +(define_insn_reservation "mp626_alu_shift_op" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fmp626_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "mp626_mult1" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy")) + "fmp626_core") + +(define_insn_reservation "mp626_mult2" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "mul,mla")) + "fmp626_core") + +(define_insn_reservation "mp626_mult3" 3 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx")) + "fmp626_core*2") + +(define_insn_reservation "mp626_mult4" 4 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "smulls,smlals,umulls,umlals")) + "fmp626_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "mp626_load1_op" 5 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load1,load_byte")) + "fmp626_core") + +(define_insn_reservation "mp626_load2_op" 6 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load2,load3")) + "fmp626_core*2") + +(define_insn_reservation "mp626_load3_op" 7 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load4")) + "fmp626_core*3") + +(define_insn_reservation "mp626_store1_op" 0 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store1")) + "fmp626_core") + +(define_insn_reservation "mp626_store2_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store2,store3")) + "fmp626_core*2") + +(define_insn_reservation "mp626_store3_op" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store4")) + "fmp626_core*3") + +(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op" + "mp626_store1_op,mp626_store2_op,mp626_store3_op" + "arm_no_early_store_addr_dep") +(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\ + mp626_mult3,mp626_mult4" "mp626_store1_op" + "arm_no_early_store_addr_dep") +(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op") +(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op") +(define_bypass 2 "mp626_mult3" "mp626_alu_op") +(define_bypass 3 "mp626_mult4" "mp626_alu_op") +(define_bypass 4 "mp626_load1_op" "mp626_alu_op") +(define_bypass 5 "mp626_load2_op" "mp626_alu_op") +(define_bypass 6 "mp626_load3_op" "mp626_alu_op") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "mp626_branch_op" 0 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "branch")) + "fmp626_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. +(define_insn_reservation "mp626_call_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "call")) + "fmp626_core") + diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm gcc-4.6-svn-20101116/gcc/config/arm/t-arm --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm 2010-11-23 13:43:47.213582000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm 2010-11-23 13:57:43.738329000 +0800 @@ -24,6 +24,11 @@ $(srcdir)/config/arm/arm1020e.md \ $(srcdir)/config/arm/arm1026ejs.md \ $(srcdir)/config/arm/arm1136jfs.md \ + $(srcdir)/config/arm/fa526.md \ + $(srcdir)/config/arm/fa606te.md \ + $(srcdir)/config/arm/fa626te.md \ + $(srcdir)/config/arm/fmp626.md \ + $(srcdir)/config/arm/fa726te.md \ $(srcdir)/config/arm/arm926ejs.md \ $(srcdir)/config/arm/cirrus.md \ $(srcdir)/config/arm/fpa.md \ diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf 2010-11-23 13:43:47.221580000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/t-arm-elf 2010-11-23 13:57:43.744348000 +0800 @@ -36,6 +36,10 @@ MULTILIB_EXCEPTIONS = MULTILIB_MATCHES = +#MULTILIB_OPTIONS += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te/mcpu=arm926ej-s +#MULTILIB_DIRNAMES += fa526 fa626 fa606te fa626te fmp626 fa726te arm926ej-s +#MULTILIB_EXCEPTIONS += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626 + #MULTILIB_OPTIONS += march=armv7 #MULTILIB_DIRNAMES += thumb2 #MULTILIB_EXCEPTIONS += march=armv7* marm/*march=armv7* @@ -52,6 +56,8 @@ MULTILIB_OPTIONS += mfloat-abi=hard MULTILIB_DIRNAMES += fpu MULTILIB_EXCEPTIONS += *mthumb/*mfloat-abi=hard* +MULTILIB_EXCEPTIONS += *mcpu=fa526/*mfloat-abi=hard* +MULTILIB_EXCEPTIONS += *mcpu=fa626/*mfloat-abi=hard* # MULTILIB_OPTIONS += mcpu=ep9312 # MULTILIB_DIRNAMES += ep9312 diff -uNr tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi --- tmp/gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi 2010-11-23 13:43:47.231588000 +0800 +++ gcc-4.6-svn-20101116/gcc/config/arm/t-linux-eabi 2010-11-23 13:57:43.750343000 +0800 @@ -24,6 +24,10 @@ MULTILIB_OPTIONS = MULTILIB_DIRNAMES = +#MULTILIB_OPTIONS += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te +#MULTILIB_DIRNAMES += fa606te fa626te fmp626 fa726te arm926ej-s +#MULTILIB_EXCEPTIONS += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te* + # Use a version of div0 which raises SIGFPE, and a special __clear_cache. LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache diff -uNr tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi gcc-4.6-svn-20101116/gcc/doc/invoke.texi --- tmp/gcc-4.6-svn-20101116/gcc/doc/invoke.texi 2010-11-23 13:44:15.846799000 +0800 +++ gcc-4.6-svn-20101116/gcc/doc/invoke.texi 2010-11-23 14:33:07.490613000 +0800 @@ -10115,7 +10115,9 @@ @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3}, @samp{cortex-m1}, @samp{cortex-m0}, -@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}. +@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}, +@samp{fa526}, @samp{fa626}, +@samp{fa606te}, @samp{fa626te}, @samp{fmp626}, @samp{fa726te}. @item -mtune=@var{name} @opindex mtune