]>
Commit | Line | Data |
---|---|---|
d8099dd8 | 1 | ;; ARM Cortex-A5 pipeline description |
8d9254fc | 2 | ;; Copyright (C) 2010-2020 Free Software Foundation, Inc. |
d8099dd8 JB |
3 | ;; Contributed by CodeSourcery. |
4 | ;; | |
5 | ;; This file is part of GCC. | |
6 | ;; | |
7 | ;; GCC is free software; you can redistribute it and/or modify it | |
8 | ;; under the terms of the GNU General Public License as published by | |
9 | ;; the Free Software Foundation; either version 3, or (at your option) | |
10 | ;; any later version. | |
11 | ;; | |
12 | ;; GCC is distributed in the hope that it will be useful, but | |
13 | ;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;; General Public License for more details. | |
16 | ;; | |
17 | ;; You should have received a copy of the GNU General Public License | |
18 | ;; along with GCC; see the file COPYING3. If not see | |
19 | ;; <http://www.gnu.org/licenses/>. | |
20 | ||
;; Automaton to which every Cortex-A5 unit declared in this file belongs.
(define_automaton "cortex_a5")
22 | ||
23 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
24 | ;; Functional units. | |
25 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
26 | ||
27 | ;; The integer (ALU) pipeline. There are five DPU pipeline | |
28 | ;; stages. However the decode/issue stages operate the same for all | |
29 | ;; instructions, so do not model them. We only need to model the | |
30 | ;; first execute stage because instructions always advance one stage | |
31 | ;; per cycle in order. Only branch instructions may dual-issue, so a | |
32 | ;; single unit covers all of the LS, ALU, MAC and FPU pipelines. | |
33 | ||
;; Unit standing for the first execute stage; part of automaton cortex_a5.
(define_cpu_unit "cortex_a5_ex1" "cortex_a5")
35 | ||
36 | ;; The branch pipeline. Branches can dual-issue with other instructions | |
37 | ;; (except when those instructions take multiple cycles to issue). | |
38 | ||
;; Unit reserved by branches; multi-cycle-issue instructions also hold it
;; during their leading cycles.
(define_cpu_unit "cortex_a5_branch" "cortex_a5")
40 | ||
41 | ;; Pseudo-unit for blocking the multiply pipeline when a double-precision | |
42 | ;; multiply is in progress. | |
43 | ||
;; Unit reserved by the FP multiply and multiply-accumulate reservations.
(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")
45 | ||
46 | ;; The floating-point add pipeline (ex1/f1 stage), used to model the usage | |
47 | ;; of the add pipeline by fmac instructions, etc. | |
48 | ||
;; Unit reserved by FP ALU operations and by the accumulate step of the
;; multiply-accumulate reservations.
(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")
50 | ||
51 | ;; Floating-point div/sqrt (long latency, out-of-order completion). | |
52 | ||
;; Unit held for the duration of an FP divide or square root.
(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")
54 | ||
55 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | ;; ALU instructions. | |
57 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
58 | ||
;; Unshifted ALU-class instructions (arithmetic, logic, moves, bit
;; operations, mrs and "multiple") when tuning for cortexa5.
;; Latency 2; holds cortex_a5_ex1 for one cycle.
(define_insn_reservation "cortex_a5_alu" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                        alu_sreg,alus_sreg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        adr,bfm,clz,rbit,rev,alu_dsp_reg,\
                        shift_imm,shift_reg,\
                        mov_imm,mov_reg,mvn_imm,mvn_reg,\
                        mrs,multiple"))
  "cortex_a5_ex1")
69 | ||
;; ALU-class instructions with a shifted operand, plus extends, when tuning
;; for cortexa5.  Latency 2; holds cortex_a5_ex1 for one cycle.
(define_insn_reservation "cortex_a5_alu_shift" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "extend,\
                        alu_shift_imm_lsl_1to4,alu_shift_imm_other,alus_shift_imm,\
                        logic_shift_imm,logics_shift_imm,\
                        alu_shift_reg,alus_shift_reg,\
                        logic_shift_reg,logics_shift_reg,\
                        mov_shift,mov_shift_reg,\
                        mvn_shift,mvn_shift_reg"))
  "cortex_a5_ex1")
80 | ||
81 | ;; Forwarding path for unshifted operands. | |
82 | ||
;; Results of cortex_a5_alu / cortex_a5_alu_shift reach a following
;; cortex_a5_alu instruction with latency 1 instead of the default 2.
(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
                 "cortex_a5_alu")
85 | ||
;; Same latency-1 forwarding into cortex_a5_alu_shift consumers, but only
;; when the guard arm_no_early_alu_shift_dep (defined outside this file)
;; accepts the producer/consumer pair.
(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
                 "cortex_a5_alu_shift"
                 "arm_no_early_alu_shift_dep")
89 | ||
;; The multiplier pipeline can forward results from the wr stage only, so
;; there's no need to specify bypasses.
92 | ||
;; Integer multiplies: any insn whose "mul32" or "widen_mul64" attribute is
;; "yes" when tuning for cortexa5.  Latency 2; holds cortex_a5_ex1 for one
;; cycle.
(define_insn_reservation "cortex_a5_mul" 2
  (and (eq_attr "tune" "cortexa5")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "widen_mul64" "yes")))
  "cortex_a5_ex1")
98 | ||
99 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
100 | ;; Load/store instructions. | |
101 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
102 | ||
103 | ;; Address-generation happens in the issue stage, which is one stage behind | |
104 | ;; the ex1 stage (the first stage we care about for scheduling purposes). The | |
105 | ;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr. | |
106 | ||
;; load_byte and load_4: latency 2, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_load1" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_byte,load_4"))
  "cortex_a5_ex1")
111 | ||
;; store_4: latency 0, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_store1" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store_4"))
  "cortex_a5_ex1")
116 | ||
;; load_8: latency 3.  Two issue cycles on cortex_a5_ex1; the first also
;; holds cortex_a5_branch.
(define_insn_reservation "cortex_a5_load2" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_8"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
121 | ||
;; store_8: latency 0.  Two issue cycles on cortex_a5_ex1; the first also
;; holds cortex_a5_branch.
(define_insn_reservation "cortex_a5_store2" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store_8"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
126 | ||
;; load_12: latency 4.  Three issue cycles on cortex_a5_ex1; the first two
;; also hold cortex_a5_branch.
(define_insn_reservation "cortex_a5_load3" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_12"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
132 | ||
;; store_12: latency 0.  Three issue cycles on cortex_a5_ex1; the first two
;; also hold cortex_a5_branch.
(define_insn_reservation "cortex_a5_store3" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store_12"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
138 | ||
;; load_16: latency 5.  Four issue cycles on cortex_a5_ex1; the first three
;; also hold cortex_a5_branch.
;; The condition previously tested "load_12", which duplicated
;; cortex_a5_load3 and left load_16 without a reservation.
(define_insn_reservation "cortex_a5_load4" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_16"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
144 | ||
;; store_16: latency 0.  Four issue cycles on cortex_a5_ex1; the first three
;; also hold cortex_a5_branch.
;; The condition previously tested "store_12", which duplicated
;; cortex_a5_store3 and left store_16 without a reservation.
(define_insn_reservation "cortex_a5_store4" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store_16"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
150 | ||
151 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
152 | ;; Branches. | |
153 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
154 | ||
155 | ;; Direct branches are the only instructions we can dual-issue (also IT and | |
156 | ;; nop, but those aren't very interesting for scheduling). (The latency here | |
157 | ;; is meant to represent when the branch actually takes place, but may not be | |
158 | ;; entirely correct.) | |
159 | ||
;; Branches and calls: latency 3.  Only cortex_a5_branch is reserved, so
;; cortex_a5_ex1 stays free for another instruction in the same cycle.
(define_insn_reservation "cortex_a5_branch" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "branch,call"))
  "cortex_a5_branch")
164 | ||
165 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
166 | ;; Floating-point arithmetic. | |
167 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
168 | ||
;; FP add/arith/move/convert/compare: latency 4, one cycle holding both
;; cortex_a5_ex1 and cortex_a5_fpadd_pipe.
;; NOTE(review): "fmuls" is listed here and is also the sole type matched by
;; cortex_a5_fpmuls; confirm which reservation is meant to apply to it.
(define_insn_reservation "cortex_a5_fpalu" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov, fmuls,\
                        f_cvt,f_cvtf2i,f_cvti2f,\
                        fcmps, fcmpd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
175 | ||
176 | ;; For fconsts and fconstd, 8-bit immediate data is passed directly from | |
177 | ;; f1 to f3 (which I think reduces the latency by one cycle). | |
178 | ||
;; fconsts/fconstd: latency 3, one cycle holding both cortex_a5_ex1 and
;; cortex_a5_fpadd_pipe.
(define_insn_reservation "cortex_a5_fconst" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
183 | ||
184 | ;; We should try not to attempt to issue a single-precision multiplication in | |
185 | ;; the middle of a double-precision multiplication operation (the usage of | |
186 | ;; cortex_a5_fpmul_pipe). | |
187 | ||
;; Single-precision multiply: latency 4, one cycle holding both
;; cortex_a5_ex1 and cortex_a5_fpmul_pipe.
(define_insn_reservation "cortex_a5_fpmuls" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuls"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe")
192 | ||
193 | ;; For single-precision multiply-accumulate, the add (accumulate) is issued | |
194 | ;; whilst the multiply is in F4. The multiply result can then be forwarded | |
195 | ;; from F5 to F1. The issue unit is only used once (when we first start | |
196 | ;; processing the instruction), but the usage of the FP add pipeline could | |
197 | ;; block other instructions attempting to use it simultaneously. We try to | |
198 | ;; avoid that using cortex_a5_fpadd_pipe. | |
199 | ||
;; Single-precision multiply-accumulate (fmacs, ffmas): latency 8.
;; Cycle 1 holds cortex_a5_ex1 and cortex_a5_fpmul_pipe, three cycles reserve
;; nothing, then cortex_a5_fpadd_pipe is held for the accumulate.
(define_insn_reservation "cortex_a5_fpmacs" 8
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacs,ffmas"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
204 | ||
;; Non-multiply instructions can issue in the middle two cycles of a
;; double-precision multiply.  Note that it isn't entirely clear when a branch
;; can dual-issue when a multi-cycle multiplication is in progress; we ignore
;; that for now though.
209 | ||
;; Double-precision multiply: latency 7.  cortex_a5_fpmul_pipe is held for
;; four cycles; cortex_a5_ex1 only in the first and the last of them.
(define_insn_reservation "cortex_a5_fpmuld" 7
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuld"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe")
215 | ||
;; Double-precision multiply-accumulate (fmacd, ffmad): latency 11.
;; Same four-cycle multiply occupancy as cortex_a5_fpmuld, then three cycles
;; reserving nothing, then cortex_a5_fpadd_pipe for the accumulate.
(define_insn_reservation "cortex_a5_fpmacd" 11
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacd,ffmad"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
221 | ||
222 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
223 | ;; Floating-point divide/square root instructions. | |
224 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
225 | ||
226 | ;; ??? Not sure if the 14 cycles taken for single-precision divide to complete | |
227 | ;; includes the time taken for the special instruction used to collect the | |
228 | ;; result to travel down the multiply pipeline, or not. Assuming so. (If | |
229 | ;; that's wrong, the latency should be increased by a few cycles.) | |
230 | ||
231 | ;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the | |
232 | ;; multiply pipeline to collect the divide/square-root result. | |
233 | ||
;; Single-precision divide / square root: latency 14.  One cycle on
;; cortex_a5_ex1, then cortex_a5_fp_div_sqrt is held for 13 cycles.
(define_insn_reservation "cortex_a5_fdivs" 14
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivs, fsqrts"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")
238 | ||
239 | ;; ??? Similarly for fdivd. | |
240 | ||
;; Double-precision divide / square root: latency 29.  One cycle on
;; cortex_a5_ex1, then cortex_a5_fp_div_sqrt is held for 28 cycles.
(define_insn_reservation "cortex_a5_fdivd" 29
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivd, fsqrtd"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")
245 | ||
246 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
247 | ;; VFP to/from core transfers. | |
248 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
249 | ||
250 | ;; FP loads take data from wr/rot/f3. | |
251 | ||
252 | ;; Core-to-VFP transfers use the multiply pipeline. | |
253 | ||
;; f_mcr / f_mcrr transfers: latency 4, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_r2f" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_mcr,f_mcrr"))
  "cortex_a5_ex1")
258 | ||
;; f_mrc / f_mrrc transfers: latency 2, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f2r" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_mrc,f_mrrc"))
  "cortex_a5_ex1")
263 | ||
264 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
265 | ;; VFP flag transfer. | |
266 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
267 | ||
268 | ;; ??? The flag forwarding from fmstat to the ex2 stage of the second | |
269 | ;; instruction is not modeled at present. | |
270 | ||
;; f_flag (VFP flag transfer): latency 4, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_flags" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_flag"))
  "cortex_a5_ex1")
275 | ||
276 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
277 | ;; VFP load/store. | |
278 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
279 | ||
;; f_loads: default latency 4, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_loads" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loads"))
  "cortex_a5_ex1")
284 | ||
;; f_loadd: default latency 5.  Two issue cycles on cortex_a5_ex1; the first
;; also holds cortex_a5_branch.
(define_insn_reservation "cortex_a5_f_loadd" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loadd"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
289 | ||
;; f_stores: latency 0, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_stores" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stores"))
  "cortex_a5_ex1")
294 | ||
;; f_stored: latency 0.  Two issue cycles on cortex_a5_ex1; the first also
;; holds cortex_a5_branch.
(define_insn_reservation "cortex_a5_f_stored" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stored"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
299 | ||
300 | ;; Load-to-use for floating-point values has a penalty of one cycle, | |
301 | ;; i.e. a latency of two. | |
302 | ||
;; A cortex_a5_f_loads result reaches the listed FP consumers with latency 2
;; rather than the reservation's default of 4.
(define_bypass 2 "cortex_a5_f_loads"
                 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                  cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                  cortex_a5_f2r")
307 | ||
;; A cortex_a5_f_loadd result reaches the listed FP consumers with latency 3
;; rather than the reservation's default of 5.
(define_bypass 3 "cortex_a5_f_loadd"
                 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                  cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                  cortex_a5_f2r")