1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86
87 /* This file should be included last. */
88 #include "target-def.h"
89
90 static rtx legitimize_dllimport_symbol (rtx, bool);
91 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
92 static rtx legitimize_pe_coff_symbol (rtx, bool);
93 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
94
95 #ifndef CHECK_STACK_LIMIT
96 #define CHECK_STACK_LIMIT (-1)
97 #endif
98
99 /* Return index of given mode in mult and division cost tables. */
100 #define MODE_INDEX(mode) \
101 ((mode) == QImode ? 0 \
102 : (mode) == HImode ? 1 \
103 : (mode) == SImode ? 2 \
104 : (mode) == DImode ? 3 \
105 : 4)
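/* Illustrative sketch (not part of the original file): MODE_INDEX selects the
   per-mode slot in the multiply/divide cost arrays of struct processor_costs
   defined below; QImode..DImode map to 0..3 and everything else to 4, the
   "other" slot.  Assuming the usual field names, a lookup reads roughly like

     unsigned mul_cost = ix86_cost->mult_init[MODE_INDEX (SImode)];
     unsigned div_cost = ix86_cost->divide[MODE_INDEX (DImode)];

   where mult_init and divide are assumptions used only for illustration.  */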
106
107 /* Processor costs (relative to an add) */
108 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
109 #define COSTS_N_BYTES(N) ((N) * 2)
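/* Worked example, using the assumption stated above that COSTS_N_INSNS (N)
   is (N) * 4 and that an addition is 2 bytes long:

     COSTS_N_INSNS (1) == 1 * 4 == 4     an add on the speed scale
     COSTS_N_BYTES (2) == 2 * 2 == 4     a 2-byte add on the size scale

   so the two scales agree on the cost of an add, and the COSTS_N_BYTES
   entries in ix86_size_cost below can be read directly as instruction
   lengths in bytes.  */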
110
111 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
112
113 static stringop_algs ix86_size_memcpy[2] = {
114 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
115 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
116 static stringop_algs ix86_size_memset[2] = {
117 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
119
120 const
121 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
122 COSTS_N_BYTES (2), /* cost of an add instruction */
123 COSTS_N_BYTES (3), /* cost of a lea instruction */
124 COSTS_N_BYTES (2), /* variable shift costs */
125 COSTS_N_BYTES (3), /* constant shift costs */
126 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
127 COSTS_N_BYTES (3), /* HI */
128 COSTS_N_BYTES (3), /* SI */
129 COSTS_N_BYTES (3), /* DI */
130 COSTS_N_BYTES (5)}, /* other */
131 0, /* cost of multiply per each bit set */
132 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
133 COSTS_N_BYTES (3), /* HI */
134 COSTS_N_BYTES (3), /* SI */
135 COSTS_N_BYTES (3), /* DI */
136 COSTS_N_BYTES (5)}, /* other */
137 COSTS_N_BYTES (3), /* cost of movsx */
138 COSTS_N_BYTES (3), /* cost of movzx */
139 0, /* "large" insn */
140 2, /* MOVE_RATIO */
141 2, /* cost for loading QImode using movzbl */
142 {2, 2, 2}, /* cost of loading integer registers
143 in QImode, HImode and SImode.
144 Relative to reg-reg move (2). */
145 {2, 2, 2}, /* cost of storing integer registers */
146 2, /* cost of reg,reg fld/fst */
147 {2, 2, 2}, /* cost of loading fp registers
148 in SFmode, DFmode and XFmode */
149 {2, 2, 2}, /* cost of storing fp registers
150 in SFmode, DFmode and XFmode */
151 3, /* cost of moving MMX register */
152 {3, 3}, /* cost of loading MMX registers
153 in SImode and DImode */
154 {3, 3}, /* cost of storing MMX registers
155 in SImode and DImode */
156 3, /* cost of moving SSE register */
157 {3, 3, 3}, /* cost of loading SSE registers
158 in SImode, DImode and TImode */
159 {3, 3, 3}, /* cost of storing SSE registers
160 in SImode, DImode and TImode */
161 3, /* MMX or SSE register to integer */
162 0, /* size of l1 cache */
163 0, /* size of l2 cache */
164 0, /* size of prefetch block */
165 0, /* number of parallel prefetches */
166 2, /* Branch cost */
167 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
168 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
169 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
170 COSTS_N_BYTES (2), /* cost of FABS instruction. */
171 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
172 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
173 ix86_size_memcpy,
174 ix86_size_memset,
175 1, /* scalar_stmt_cost. */
176 1, /* scalar load_cost. */
177 1, /* scalar_store_cost. */
178 1, /* vec_stmt_cost. */
179 1, /* vec_to_scalar_cost. */
180 1, /* scalar_to_vec_cost. */
181 1, /* vec_align_load_cost. */
182 1, /* vec_unalign_load_cost. */
183 1, /* vec_store_cost. */
184 1, /* cond_taken_branch_cost. */
185 1, /* cond_not_taken_branch_cost. */
186 };
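/* Illustrative sketch (not from the original file): the cost tables in this
   file are consumed through a pointer such as ix86_cost, and ix86_size_cost
   above is the table intended for code optimized for size.  Under that
   assumption, selection would look roughly like

     ix86_cost = optimize_function_for_size_p (cfun)
                 ? &ix86_size_cost : ix86_tune_cost;

   The real selection logic lives later in this file; the names used here are
   assumptions for illustration only.  */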
187
188 /* Processor costs (relative to an add) */
189 static stringop_algs i386_memcpy[2] = {
190 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
191 DUMMY_STRINGOP_ALGS};
192 static stringop_algs i386_memset[2] = {
193 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
194 DUMMY_STRINGOP_ALGS};
195
196 static const
197 struct processor_costs i386_cost = { /* 386 specific costs */
198 COSTS_N_INSNS (1), /* cost of an add instruction */
199 COSTS_N_INSNS (1), /* cost of a lea instruction */
200 COSTS_N_INSNS (3), /* variable shift costs */
201 COSTS_N_INSNS (2), /* constant shift costs */
202 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
203 COSTS_N_INSNS (6), /* HI */
204 COSTS_N_INSNS (6), /* SI */
205 COSTS_N_INSNS (6), /* DI */
206 COSTS_N_INSNS (6)}, /* other */
207 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
208 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
209 COSTS_N_INSNS (23), /* HI */
210 COSTS_N_INSNS (23), /* SI */
211 COSTS_N_INSNS (23), /* DI */
212 COSTS_N_INSNS (23)}, /* other */
213 COSTS_N_INSNS (3), /* cost of movsx */
214 COSTS_N_INSNS (2), /* cost of movzx */
215 15, /* "large" insn */
216 3, /* MOVE_RATIO */
217 4, /* cost for loading QImode using movzbl */
218 {2, 4, 2}, /* cost of loading integer registers
219 in QImode, HImode and SImode.
220 Relative to reg-reg move (2). */
221 {2, 4, 2}, /* cost of storing integer registers */
222 2, /* cost of reg,reg fld/fst */
223 {8, 8, 8}, /* cost of loading fp registers
224 in SFmode, DFmode and XFmode */
225 {8, 8, 8}, /* cost of storing fp registers
226 in SFmode, DFmode and XFmode */
227 2, /* cost of moving MMX register */
228 {4, 8}, /* cost of loading MMX registers
229 in SImode and DImode */
230 {4, 8}, /* cost of storing MMX registers
231 in SImode and DImode */
232 2, /* cost of moving SSE register */
233 {4, 8, 16}, /* cost of loading SSE registers
234 in SImode, DImode and TImode */
235 {4, 8, 16}, /* cost of storing SSE registers
236 in SImode, DImode and TImode */
237 3, /* MMX or SSE register to integer */
238 0, /* size of l1 cache */
239 0, /* size of l2 cache */
240 0, /* size of prefetch block */
241 0, /* number of parallel prefetches */
242 1, /* Branch cost */
243 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
244 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
245 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
246 COSTS_N_INSNS (22), /* cost of FABS instruction. */
247 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
248 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
249 i386_memcpy,
250 i386_memset,
251 1, /* scalar_stmt_cost. */
252 1, /* scalar load_cost. */
253 1, /* scalar_store_cost. */
254 1, /* vec_stmt_cost. */
255 1, /* vec_to_scalar_cost. */
256 1, /* scalar_to_vec_cost. */
257 1, /* vec_align_load_cost. */
258 2, /* vec_unalign_load_cost. */
259 1, /* vec_store_cost. */
260 3, /* cond_taken_branch_cost. */
261 1, /* cond_not_taken_branch_cost. */
262 };
263
264 static stringop_algs i486_memcpy[2] = {
265 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
266 DUMMY_STRINGOP_ALGS};
267 static stringop_algs i486_memset[2] = {
268 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
269 DUMMY_STRINGOP_ALGS};
270
271 static const
272 struct processor_costs i486_cost = { /* 486 specific costs */
273 COSTS_N_INSNS (1), /* cost of an add instruction */
274 COSTS_N_INSNS (1), /* cost of a lea instruction */
275 COSTS_N_INSNS (3), /* variable shift costs */
276 COSTS_N_INSNS (2), /* constant shift costs */
277 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
278 COSTS_N_INSNS (12), /* HI */
279 COSTS_N_INSNS (12), /* SI */
280 COSTS_N_INSNS (12), /* DI */
281 COSTS_N_INSNS (12)}, /* other */
282 1, /* cost of multiply per each bit set */
283 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
284 COSTS_N_INSNS (40), /* HI */
285 COSTS_N_INSNS (40), /* SI */
286 COSTS_N_INSNS (40), /* DI */
287 COSTS_N_INSNS (40)}, /* other */
288 COSTS_N_INSNS (3), /* cost of movsx */
289 COSTS_N_INSNS (2), /* cost of movzx */
290 15, /* "large" insn */
291 3, /* MOVE_RATIO */
292 4, /* cost for loading QImode using movzbl */
293 {2, 4, 2}, /* cost of loading integer registers
294 in QImode, HImode and SImode.
295 Relative to reg-reg move (2). */
296 {2, 4, 2}, /* cost of storing integer registers */
297 2, /* cost of reg,reg fld/fst */
298 {8, 8, 8}, /* cost of loading fp registers
299 in SFmode, DFmode and XFmode */
300 {8, 8, 8}, /* cost of storing fp registers
301 in SFmode, DFmode and XFmode */
302 2, /* cost of moving MMX register */
303 {4, 8}, /* cost of loading MMX registers
304 in SImode and DImode */
305 {4, 8}, /* cost of storing MMX registers
306 in SImode and DImode */
307 2, /* cost of moving SSE register */
308 {4, 8, 16}, /* cost of loading SSE registers
309 in SImode, DImode and TImode */
310 {4, 8, 16}, /* cost of storing SSE registers
311 in SImode, DImode and TImode */
312 3, /* MMX or SSE register to integer */
313 4, /* size of l1 cache. 486 has 8kB cache
314 shared for code and data, so 4kB is
315 not really precise. */
316 4, /* size of l2 cache */
317 0, /* size of prefetch block */
318 0, /* number of parallel prefetches */
319 1, /* Branch cost */
320 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
321 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
322 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
323 COSTS_N_INSNS (3), /* cost of FABS instruction. */
324 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
325 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
326 i486_memcpy,
327 i486_memset,
328 1, /* scalar_stmt_cost. */
329 1, /* scalar load_cost. */
330 1, /* scalar_store_cost. */
331 1, /* vec_stmt_cost. */
332 1, /* vec_to_scalar_cost. */
333 1, /* scalar_to_vec_cost. */
334 1, /* vec_align_load_cost. */
335 2, /* vec_unalign_load_cost. */
336 1, /* vec_store_cost. */
337 3, /* cond_taken_branch_cost. */
338 1, /* cond_not_taken_branch_cost. */
339 };
340
341 static stringop_algs pentium_memcpy[2] = {
342 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
343 DUMMY_STRINGOP_ALGS};
344 static stringop_algs pentium_memset[2] = {
345 {libcall, {{-1, rep_prefix_4_byte, false}}},
346 DUMMY_STRINGOP_ALGS};
347
348 static const
349 struct processor_costs pentium_cost = {
350 COSTS_N_INSNS (1), /* cost of an add instruction */
351 COSTS_N_INSNS (1), /* cost of a lea instruction */
352 COSTS_N_INSNS (4), /* variable shift costs */
353 COSTS_N_INSNS (1), /* constant shift costs */
354 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
355 COSTS_N_INSNS (11), /* HI */
356 COSTS_N_INSNS (11), /* SI */
357 COSTS_N_INSNS (11), /* DI */
358 COSTS_N_INSNS (11)}, /* other */
359 0, /* cost of multiply per each bit set */
360 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
361 COSTS_N_INSNS (25), /* HI */
362 COSTS_N_INSNS (25), /* SI */
363 COSTS_N_INSNS (25), /* DI */
364 COSTS_N_INSNS (25)}, /* other */
365 COSTS_N_INSNS (3), /* cost of movsx */
366 COSTS_N_INSNS (2), /* cost of movzx */
367 8, /* "large" insn */
368 6, /* MOVE_RATIO */
369 6, /* cost for loading QImode using movzbl */
370 {2, 4, 2}, /* cost of loading integer registers
371 in QImode, HImode and SImode.
372 Relative to reg-reg move (2). */
373 {2, 4, 2}, /* cost of storing integer registers */
374 2, /* cost of reg,reg fld/fst */
375 {2, 2, 6}, /* cost of loading fp registers
376 in SFmode, DFmode and XFmode */
377 {4, 4, 6}, /* cost of storing fp registers
378 in SFmode, DFmode and XFmode */
379 8, /* cost of moving MMX register */
380 {8, 8}, /* cost of loading MMX registers
381 in SImode and DImode */
382 {8, 8}, /* cost of storing MMX registers
383 in SImode and DImode */
384 2, /* cost of moving SSE register */
385 {4, 8, 16}, /* cost of loading SSE registers
386 in SImode, DImode and TImode */
387 {4, 8, 16}, /* cost of storing SSE registers
388 in SImode, DImode and TImode */
389 3, /* MMX or SSE register to integer */
390 8, /* size of l1 cache. */
391 8, /* size of l2 cache */
392 0, /* size of prefetch block */
393 0, /* number of parallel prefetches */
394 2, /* Branch cost */
395 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
396 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
397 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
398 COSTS_N_INSNS (1), /* cost of FABS instruction. */
399 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
400 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
401 pentium_memcpy,
402 pentium_memset,
403 1, /* scalar_stmt_cost. */
404 1, /* scalar load_cost. */
405 1, /* scalar_store_cost. */
406 1, /* vec_stmt_cost. */
407 1, /* vec_to_scalar_cost. */
408 1, /* scalar_to_vec_cost. */
409 1, /* vec_align_load_cost. */
410 2, /* vec_unalign_load_cost. */
411 1, /* vec_store_cost. */
412 3, /* cond_taken_branch_cost. */
413 1, /* cond_not_taken_branch_cost. */
414 };
415
416 static const
417 struct processor_costs lakemont_cost = {
418 COSTS_N_INSNS (1), /* cost of an add instruction */
419 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
420 COSTS_N_INSNS (1), /* variable shift costs */
421 COSTS_N_INSNS (1), /* constant shift costs */
422 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
423 COSTS_N_INSNS (11), /* HI */
424 COSTS_N_INSNS (11), /* SI */
425 COSTS_N_INSNS (11), /* DI */
426 COSTS_N_INSNS (11)}, /* other */
427 0, /* cost of multiply per each bit set */
428 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
429 COSTS_N_INSNS (25), /* HI */
430 COSTS_N_INSNS (25), /* SI */
431 COSTS_N_INSNS (25), /* DI */
432 COSTS_N_INSNS (25)}, /* other */
433 COSTS_N_INSNS (3), /* cost of movsx */
434 COSTS_N_INSNS (2), /* cost of movzx */
435 8, /* "large" insn */
436 17, /* MOVE_RATIO */
437 6, /* cost for loading QImode using movzbl */
438 {2, 4, 2}, /* cost of loading integer registers
439 in QImode, HImode and SImode.
440 Relative to reg-reg move (2). */
441 {2, 4, 2}, /* cost of storing integer registers */
442 2, /* cost of reg,reg fld/fst */
443 {2, 2, 6}, /* cost of loading fp registers
444 in SFmode, DFmode and XFmode */
445 {4, 4, 6}, /* cost of storing fp registers
446 in SFmode, DFmode and XFmode */
447 8, /* cost of moving MMX register */
448 {8, 8}, /* cost of loading MMX registers
449 in SImode and DImode */
450 {8, 8}, /* cost of storing MMX registers
451 in SImode and DImode */
452 2, /* cost of moving SSE register */
453 {4, 8, 16}, /* cost of loading SSE registers
454 in SImode, DImode and TImode */
455 {4, 8, 16}, /* cost of storing SSE registers
456 in SImode, DImode and TImode */
457 3, /* MMX or SSE register to integer */
458 8, /* size of l1 cache. */
459 8, /* size of l2 cache */
460 0, /* size of prefetch block */
461 0, /* number of parallel prefetches */
462 2, /* Branch cost */
463 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
464 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
465 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
466 COSTS_N_INSNS (1), /* cost of FABS instruction. */
467 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
468 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
469 pentium_memcpy,
470 pentium_memset,
471 1, /* scalar_stmt_cost. */
472 1, /* scalar load_cost. */
473 1, /* scalar_store_cost. */
474 1, /* vec_stmt_cost. */
475 1, /* vec_to_scalar_cost. */
476 1, /* scalar_to_vec_cost. */
477 1, /* vec_align_load_cost. */
478 2, /* vec_unalign_load_cost. */
479 1, /* vec_store_cost. */
480 3, /* cond_taken_branch_cost. */
481 1, /* cond_not_taken_branch_cost. */
482 };
483
484 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
485    (we ensure the alignment). For small blocks an inline loop is still a
486    noticeable win; for bigger blocks either rep movsl or rep movsb is the
487    way to go. Rep movsb apparently has a more expensive startup time in the
488    CPU, but after 4K the difference is down in the noise. */
489 static stringop_algs pentiumpro_memcpy[2] = {
490 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
491 {8192, rep_prefix_4_byte, false},
492 {-1, rep_prefix_1_byte, false}}},
493 DUMMY_STRINGOP_ALGS};
494 static stringop_algs pentiumpro_memset[2] = {
495 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
496 {8192, rep_prefix_4_byte, false},
497 {-1, libcall, false}}},
498 DUMMY_STRINGOP_ALGS};
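/* Illustrative sketch of how these stringop tables are read, assuming the
   usual interpretation of stringop_algs: each entry pairs a fallback
   algorithm for unknown block sizes with a list of
   {max_size, algorithm, noalign} thresholds, where max_size == -1 means
   "no upper bound", and the two array elements are the 32-bit and 64-bit
   variants.  Reading pentiumpro_memcpy[0] above:

     up to  128 bytes  -> inline loop
     up to 1024 bytes  -> unrolled loop
     up to 8192 bytes  -> rep movsl  (rep_prefix_4_byte)
     larger            -> rep movsb  (rep_prefix_1_byte)

   which matches the prose comment before the tables.  */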
499 static const
500 struct processor_costs pentiumpro_cost = {
501 COSTS_N_INSNS (1), /* cost of an add instruction */
502 COSTS_N_INSNS (1), /* cost of a lea instruction */
503 COSTS_N_INSNS (1), /* variable shift costs */
504 COSTS_N_INSNS (1), /* constant shift costs */
505 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
506 COSTS_N_INSNS (4), /* HI */
507 COSTS_N_INSNS (4), /* SI */
508 COSTS_N_INSNS (4), /* DI */
509 COSTS_N_INSNS (4)}, /* other */
510 0, /* cost of multiply per each bit set */
511 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
512 COSTS_N_INSNS (17), /* HI */
513 COSTS_N_INSNS (17), /* SI */
514 COSTS_N_INSNS (17), /* DI */
515 COSTS_N_INSNS (17)}, /* other */
516 COSTS_N_INSNS (1), /* cost of movsx */
517 COSTS_N_INSNS (1), /* cost of movzx */
518 8, /* "large" insn */
519 6, /* MOVE_RATIO */
520 2, /* cost for loading QImode using movzbl */
521 {4, 4, 4}, /* cost of loading integer registers
522 in QImode, HImode and SImode.
523 Relative to reg-reg move (2). */
524 {2, 2, 2}, /* cost of storing integer registers */
525 2, /* cost of reg,reg fld/fst */
526 {2, 2, 6}, /* cost of loading fp registers
527 in SFmode, DFmode and XFmode */
528 {4, 4, 6}, /* cost of storing fp registers
529 in SFmode, DFmode and XFmode */
530 2, /* cost of moving MMX register */
531 {2, 2}, /* cost of loading MMX registers
532 in SImode and DImode */
533 {2, 2}, /* cost of storing MMX registers
534 in SImode and DImode */
535 2, /* cost of moving SSE register */
536 {2, 2, 8}, /* cost of loading SSE registers
537 in SImode, DImode and TImode */
538 {2, 2, 8}, /* cost of storing SSE registers
539 in SImode, DImode and TImode */
540 3, /* MMX or SSE register to integer */
541 8, /* size of l1 cache. */
542 256, /* size of l2 cache */
543 32, /* size of prefetch block */
544 6, /* number of parallel prefetches */
545 2, /* Branch cost */
546 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
547 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
548 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
549 COSTS_N_INSNS (2), /* cost of FABS instruction. */
550 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
551 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
552 pentiumpro_memcpy,
553 pentiumpro_memset,
554 1, /* scalar_stmt_cost. */
555 1, /* scalar load_cost. */
556 1, /* scalar_store_cost. */
557 1, /* vec_stmt_cost. */
558 1, /* vec_to_scalar_cost. */
559 1, /* scalar_to_vec_cost. */
560 1, /* vec_align_load_cost. */
561 2, /* vec_unalign_load_cost. */
562 1, /* vec_store_cost. */
563 3, /* cond_taken_branch_cost. */
564 1, /* cond_not_taken_branch_cost. */
565 };
566
567 static stringop_algs geode_memcpy[2] = {
568 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
569 DUMMY_STRINGOP_ALGS};
570 static stringop_algs geode_memset[2] = {
571 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
572 DUMMY_STRINGOP_ALGS};
573 static const
574 struct processor_costs geode_cost = {
575 COSTS_N_INSNS (1), /* cost of an add instruction */
576 COSTS_N_INSNS (1), /* cost of a lea instruction */
577 COSTS_N_INSNS (2), /* variable shift costs */
578 COSTS_N_INSNS (1), /* constant shift costs */
579 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
580 COSTS_N_INSNS (4), /* HI */
581 COSTS_N_INSNS (7), /* SI */
582 COSTS_N_INSNS (7), /* DI */
583 COSTS_N_INSNS (7)}, /* other */
584 0, /* cost of multiply per each bit set */
585 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
586 COSTS_N_INSNS (23), /* HI */
587 COSTS_N_INSNS (39), /* SI */
588 COSTS_N_INSNS (39), /* DI */
589 COSTS_N_INSNS (39)}, /* other */
590 COSTS_N_INSNS (1), /* cost of movsx */
591 COSTS_N_INSNS (1), /* cost of movzx */
592 8, /* "large" insn */
593 4, /* MOVE_RATIO */
594 1, /* cost for loading QImode using movzbl */
595 {1, 1, 1}, /* cost of loading integer registers
596 in QImode, HImode and SImode.
597 Relative to reg-reg move (2). */
598 {1, 1, 1}, /* cost of storing integer registers */
599 1, /* cost of reg,reg fld/fst */
600 {1, 1, 1}, /* cost of loading fp registers
601 in SFmode, DFmode and XFmode */
602 {4, 6, 6}, /* cost of storing fp registers
603 in SFmode, DFmode and XFmode */
604
605 2, /* cost of moving MMX register */
606 {2, 2}, /* cost of loading MMX registers
607 in SImode and DImode */
608 {2, 2}, /* cost of storing MMX registers
609 in SImode and DImode */
610 2, /* cost of moving SSE register */
611 {2, 2, 8}, /* cost of loading SSE registers
612 in SImode, DImode and TImode */
613 {2, 2, 8}, /* cost of storing SSE registers
614 in SImode, DImode and TImode */
615 3, /* MMX or SSE register to integer */
616 64, /* size of l1 cache. */
617 128, /* size of l2 cache. */
618 32, /* size of prefetch block */
619 1, /* number of parallel prefetches */
620 1, /* Branch cost */
621 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
622 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
623 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
624 COSTS_N_INSNS (1), /* cost of FABS instruction. */
625 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
626 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
627 geode_memcpy,
628 geode_memset,
629 1, /* scalar_stmt_cost. */
630 1, /* scalar load_cost. */
631 1, /* scalar_store_cost. */
632 1, /* vec_stmt_cost. */
633 1, /* vec_to_scalar_cost. */
634 1, /* scalar_to_vec_cost. */
635 1, /* vec_align_load_cost. */
636 2, /* vec_unalign_load_cost. */
637 1, /* vec_store_cost. */
638 3, /* cond_taken_branch_cost. */
639 1, /* cond_not_taken_branch_cost. */
640 };
641
642 static stringop_algs k6_memcpy[2] = {
643 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
644 DUMMY_STRINGOP_ALGS};
645 static stringop_algs k6_memset[2] = {
646 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
647 DUMMY_STRINGOP_ALGS};
648 static const
649 struct processor_costs k6_cost = {
650 COSTS_N_INSNS (1), /* cost of an add instruction */
651 COSTS_N_INSNS (2), /* cost of a lea instruction */
652 COSTS_N_INSNS (1), /* variable shift costs */
653 COSTS_N_INSNS (1), /* constant shift costs */
654 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
655 COSTS_N_INSNS (3), /* HI */
656 COSTS_N_INSNS (3), /* SI */
657 COSTS_N_INSNS (3), /* DI */
658 COSTS_N_INSNS (3)}, /* other */
659 0, /* cost of multiply per each bit set */
660 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
661 COSTS_N_INSNS (18), /* HI */
662 COSTS_N_INSNS (18), /* SI */
663 COSTS_N_INSNS (18), /* DI */
664 COSTS_N_INSNS (18)}, /* other */
665 COSTS_N_INSNS (2), /* cost of movsx */
666 COSTS_N_INSNS (2), /* cost of movzx */
667 8, /* "large" insn */
668 4, /* MOVE_RATIO */
669 3, /* cost for loading QImode using movzbl */
670 {4, 5, 4}, /* cost of loading integer registers
671 in QImode, HImode and SImode.
672 Relative to reg-reg move (2). */
673 {2, 3, 2}, /* cost of storing integer registers */
674 4, /* cost of reg,reg fld/fst */
675 {6, 6, 6}, /* cost of loading fp registers
676 in SFmode, DFmode and XFmode */
677 {4, 4, 4}, /* cost of storing fp registers
678 in SFmode, DFmode and XFmode */
679 2, /* cost of moving MMX register */
680 {2, 2}, /* cost of loading MMX registers
681 in SImode and DImode */
682 {2, 2}, /* cost of storing MMX registers
683 in SImode and DImode */
684 2, /* cost of moving SSE register */
685 {2, 2, 8}, /* cost of loading SSE registers
686 in SImode, DImode and TImode */
687 {2, 2, 8}, /* cost of storing SSE registers
688 in SImode, DImode and TImode */
689 6, /* MMX or SSE register to integer */
690 32, /* size of l1 cache. */
691 32, /* size of l2 cache. Some models
692 have integrated l2 cache, but
693 optimizing for k6 is not important
694 enough to worry about that. */
695 32, /* size of prefetch block */
696 1, /* number of parallel prefetches */
697 1, /* Branch cost */
698 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
699 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
700 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
701 COSTS_N_INSNS (2), /* cost of FABS instruction. */
702 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
703 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
704 k6_memcpy,
705 k6_memset,
706 1, /* scalar_stmt_cost. */
707 1, /* scalar load_cost. */
708 1, /* scalar_store_cost. */
709 1, /* vec_stmt_cost. */
710 1, /* vec_to_scalar_cost. */
711 1, /* scalar_to_vec_cost. */
712 1, /* vec_align_load_cost. */
713 2, /* vec_unalign_load_cost. */
714 1, /* vec_store_cost. */
715 3, /* cond_taken_branch_cost. */
716 1, /* cond_not_taken_branch_cost. */
717 };
718
719 /* For some reason, Athlon deals better with the REP prefix (relative to
720    loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
721    and 128 bytes for memset. */
722 static stringop_algs athlon_memcpy[2] = {
723 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
724 DUMMY_STRINGOP_ALGS};
725 static stringop_algs athlon_memset[2] = {
726 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
727 DUMMY_STRINGOP_ALGS};
728 static const
729 struct processor_costs athlon_cost = {
730 COSTS_N_INSNS (1), /* cost of an add instruction */
731 COSTS_N_INSNS (2), /* cost of a lea instruction */
732 COSTS_N_INSNS (1), /* variable shift costs */
733 COSTS_N_INSNS (1), /* constant shift costs */
734 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
735 COSTS_N_INSNS (5), /* HI */
736 COSTS_N_INSNS (5), /* SI */
737 COSTS_N_INSNS (5), /* DI */
738 COSTS_N_INSNS (5)}, /* other */
739 0, /* cost of multiply per each bit set */
740 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
741 COSTS_N_INSNS (26), /* HI */
742 COSTS_N_INSNS (42), /* SI */
743 COSTS_N_INSNS (74), /* DI */
744 COSTS_N_INSNS (74)}, /* other */
745 COSTS_N_INSNS (1), /* cost of movsx */
746 COSTS_N_INSNS (1), /* cost of movzx */
747 8, /* "large" insn */
748 9, /* MOVE_RATIO */
749 4, /* cost for loading QImode using movzbl */
750 {3, 4, 3}, /* cost of loading integer registers
751 in QImode, HImode and SImode.
752 Relative to reg-reg move (2). */
753 {3, 4, 3}, /* cost of storing integer registers */
754 4, /* cost of reg,reg fld/fst */
755 {4, 4, 12}, /* cost of loading fp registers
756 in SFmode, DFmode and XFmode */
757 {6, 6, 8}, /* cost of storing fp registers
758 in SFmode, DFmode and XFmode */
759 2, /* cost of moving MMX register */
760 {4, 4}, /* cost of loading MMX registers
761 in SImode and DImode */
762 {4, 4}, /* cost of storing MMX registers
763 in SImode and DImode */
764 2, /* cost of moving SSE register */
765 {4, 4, 6}, /* cost of loading SSE registers
766 in SImode, DImode and TImode */
767 {4, 4, 5}, /* cost of storing SSE registers
768 in SImode, DImode and TImode */
769 5, /* MMX or SSE register to integer */
770 64, /* size of l1 cache. */
771 256, /* size of l2 cache. */
772 64, /* size of prefetch block */
773 6, /* number of parallel prefetches */
774 5, /* Branch cost */
775 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (2), /* cost of FABS instruction. */
779 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
781 athlon_memcpy,
782 athlon_memset,
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
797    small blocks it is better to use a loop. For large blocks, a libcall can
798    do nontemporal accesses and beat inline code considerably. */
799 static stringop_algs k8_memcpy[2] = {
800 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
801 {-1, rep_prefix_4_byte, false}}},
802 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
803 {-1, libcall, false}}}};
804 static stringop_algs k8_memset[2] = {
805 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
806 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
807 {libcall, {{48, unrolled_loop, false},
808 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
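/* Worked reading of k8_memcpy's 64-bit entry, illustrating the comment above
   (thresholds interpreted as in the sketch following pentiumpro_memset):

     up to   16 bytes -> inline loop        (very small blocks)
     up to 8192 bytes -> rep movsq          (rep_prefix_8_byte)
     larger           -> memcpy libcall     (can use nontemporal stores)  */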
809 static const
810 struct processor_costs k8_cost = {
811 COSTS_N_INSNS (1), /* cost of an add instruction */
812 COSTS_N_INSNS (2), /* cost of a lea instruction */
813 COSTS_N_INSNS (1), /* variable shift costs */
814 COSTS_N_INSNS (1), /* constant shift costs */
815 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
816 COSTS_N_INSNS (4), /* HI */
817 COSTS_N_INSNS (3), /* SI */
818 COSTS_N_INSNS (4), /* DI */
819 COSTS_N_INSNS (5)}, /* other */
820 0, /* cost of multiply per each bit set */
821 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
822 COSTS_N_INSNS (26), /* HI */
823 COSTS_N_INSNS (42), /* SI */
824 COSTS_N_INSNS (74), /* DI */
825 COSTS_N_INSNS (74)}, /* other */
826 COSTS_N_INSNS (1), /* cost of movsx */
827 COSTS_N_INSNS (1), /* cost of movzx */
828 8, /* "large" insn */
829 9, /* MOVE_RATIO */
830 4, /* cost for loading QImode using movzbl */
831 {3, 4, 3}, /* cost of loading integer registers
832 in QImode, HImode and SImode.
833 Relative to reg-reg move (2). */
834 {3, 4, 3}, /* cost of storing integer registers */
835 4, /* cost of reg,reg fld/fst */
836 {4, 4, 12}, /* cost of loading fp registers
837 in SFmode, DFmode and XFmode */
838 {6, 6, 8}, /* cost of storing fp registers
839 in SFmode, DFmode and XFmode */
840 2, /* cost of moving MMX register */
841 {3, 3}, /* cost of loading MMX registers
842 in SImode and DImode */
843 {4, 4}, /* cost of storing MMX registers
844 in SImode and DImode */
845 2, /* cost of moving SSE register */
846 {4, 3, 6}, /* cost of loading SSE registers
847 in SImode, DImode and TImode */
848 {4, 4, 5}, /* cost of storing SSE registers
849 in SImode, DImode and TImode */
850 5, /* MMX or SSE register to integer */
851 64, /* size of l1 cache. */
852 512, /* size of l2 cache. */
853 64, /* size of prefetch block */
854 /* New AMD processors never drop prefetches; if they cannot be performed
855    immediately, they are queued. We set the number of simultaneous prefetches
856    to a large constant to reflect this (leaving the number of prefetches
857    entirely unlimited is probably not a good idea either, as their execution
858    also takes some time). */
859 100, /* number of parallel prefetches */
860 3, /* Branch cost */
861 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
862 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
863 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
864 COSTS_N_INSNS (2), /* cost of FABS instruction. */
865 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
866 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
867
868 k8_memcpy,
869 k8_memset,
870 4, /* scalar_stmt_cost. */
871 2, /* scalar load_cost. */
872 2, /* scalar_store_cost. */
873 5, /* vec_stmt_cost. */
874 0, /* vec_to_scalar_cost. */
875 2, /* scalar_to_vec_cost. */
876 2, /* vec_align_load_cost. */
877 3, /* vec_unalign_load_cost. */
878 3, /* vec_store_cost. */
879 3, /* cond_taken_branch_cost. */
880 2, /* cond_not_taken_branch_cost. */
881 };
882
883 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
884    very small blocks it is better to use a loop. For large blocks, a libcall
885    can do nontemporal accesses and beat inline code considerably. */
886 static stringop_algs amdfam10_memcpy[2] = {
887 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
888 {-1, rep_prefix_4_byte, false}}},
889 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
890 {-1, libcall, false}}}};
891 static stringop_algs amdfam10_memset[2] = {
892 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
893 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
894 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
895 {-1, libcall, false}}}};
896 struct processor_costs amdfam10_cost = {
897 COSTS_N_INSNS (1), /* cost of an add instruction */
898 COSTS_N_INSNS (2), /* cost of a lea instruction */
899 COSTS_N_INSNS (1), /* variable shift costs */
900 COSTS_N_INSNS (1), /* constant shift costs */
901 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
902 COSTS_N_INSNS (4), /* HI */
903 COSTS_N_INSNS (3), /* SI */
904 COSTS_N_INSNS (4), /* DI */
905 COSTS_N_INSNS (5)}, /* other */
906 0, /* cost of multiply per each bit set */
907 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
908 COSTS_N_INSNS (35), /* HI */
909 COSTS_N_INSNS (51), /* SI */
910 COSTS_N_INSNS (83), /* DI */
911 COSTS_N_INSNS (83)}, /* other */
912 COSTS_N_INSNS (1), /* cost of movsx */
913 COSTS_N_INSNS (1), /* cost of movzx */
914 8, /* "large" insn */
915 9, /* MOVE_RATIO */
916 4, /* cost for loading QImode using movzbl */
917 {3, 4, 3}, /* cost of loading integer registers
918 in QImode, HImode and SImode.
919 Relative to reg-reg move (2). */
920 {3, 4, 3}, /* cost of storing integer registers */
921 4, /* cost of reg,reg fld/fst */
922 {4, 4, 12}, /* cost of loading fp registers
923 in SFmode, DFmode and XFmode */
924 {6, 6, 8}, /* cost of storing fp registers
925 in SFmode, DFmode and XFmode */
926 2, /* cost of moving MMX register */
927 {3, 3}, /* cost of loading MMX registers
928 in SImode and DImode */
929 {4, 4}, /* cost of storing MMX registers
930 in SImode and DImode */
931 2, /* cost of moving SSE register */
932 {4, 4, 3}, /* cost of loading SSE registers
933 in SImode, DImode and TImode */
934 {4, 4, 5}, /* cost of storing SSE registers
935 in SImode, DImode and TImode */
936 3, /* MMX or SSE register to integer */
937 /* On K8:
938 MOVD reg64, xmmreg Double FSTORE 4
939 MOVD reg32, xmmreg Double FSTORE 4
940 On AMDFAM10:
941 MOVD reg64, xmmreg Double FADD 3
942 1/1 1/1
943 MOVD reg32, xmmreg Double FADD 3
944 1/1 1/1 */
945 64, /* size of l1 cache. */
946 512, /* size of l2 cache. */
947 64, /* size of prefetch block */
948    /* New AMD processors never drop prefetches; if they cannot be performed
949       immediately, they are queued. We set the number of simultaneous prefetches
950       to a large constant to reflect this (leaving the number of prefetches
951       entirely unlimited is probably not a good idea either, as their execution
952       also takes some time). */
953 100, /* number of parallel prefetches */
954 2, /* Branch cost */
955 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
956 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
957 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
958 COSTS_N_INSNS (2), /* cost of FABS instruction. */
959 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
960 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
961
962 amdfam10_memcpy,
963 amdfam10_memset,
964 4, /* scalar_stmt_cost. */
965 2, /* scalar load_cost. */
966 2, /* scalar_store_cost. */
967 6, /* vec_stmt_cost. */
968 0, /* vec_to_scalar_cost. */
969 2, /* scalar_to_vec_cost. */
970 2, /* vec_align_load_cost. */
971 2, /* vec_unalign_load_cost. */
972 2, /* vec_store_cost. */
973 2, /* cond_taken_branch_cost. */
974 1, /* cond_not_taken_branch_cost. */
975 };
976
977 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
978    very small blocks it is better to use a loop. For large blocks, a libcall
979    can do nontemporal accesses and beat inline code considerably. */
980 static stringop_algs bdver1_memcpy[2] = {
981 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
982 {-1, rep_prefix_4_byte, false}}},
983 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
984 {-1, libcall, false}}}};
985 static stringop_algs bdver1_memset[2] = {
986 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
987 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
988 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990
991 const struct processor_costs bdver1_cost = {
992 COSTS_N_INSNS (1), /* cost of an add instruction */
993 COSTS_N_INSNS (1), /* cost of a lea instruction */
994 COSTS_N_INSNS (1), /* variable shift costs */
995 COSTS_N_INSNS (1), /* constant shift costs */
996 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
997 COSTS_N_INSNS (4), /* HI */
998 COSTS_N_INSNS (4), /* SI */
999 COSTS_N_INSNS (6), /* DI */
1000 COSTS_N_INSNS (6)}, /* other */
1001 0, /* cost of multiply per each bit set */
1002 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1003 COSTS_N_INSNS (35), /* HI */
1004 COSTS_N_INSNS (51), /* SI */
1005 COSTS_N_INSNS (83), /* DI */
1006 COSTS_N_INSNS (83)}, /* other */
1007 COSTS_N_INSNS (1), /* cost of movsx */
1008 COSTS_N_INSNS (1), /* cost of movzx */
1009 8, /* "large" insn */
1010 9, /* MOVE_RATIO */
1011 4, /* cost for loading QImode using movzbl */
1012 {5, 5, 4}, /* cost of loading integer registers
1013 in QImode, HImode and SImode.
1014 Relative to reg-reg move (2). */
1015 {4, 4, 4}, /* cost of storing integer registers */
1016 2, /* cost of reg,reg fld/fst */
1017 {5, 5, 12}, /* cost of loading fp registers
1018 in SFmode, DFmode and XFmode */
1019 {4, 4, 8}, /* cost of storing fp registers
1020 in SFmode, DFmode and XFmode */
1021 2, /* cost of moving MMX register */
1022 {4, 4}, /* cost of loading MMX registers
1023 in SImode and DImode */
1024 {4, 4}, /* cost of storing MMX registers
1025 in SImode and DImode */
1026 2, /* cost of moving SSE register */
1027 {4, 4, 4}, /* cost of loading SSE registers
1028 in SImode, DImode and TImode */
1029 {4, 4, 4}, /* cost of storing SSE registers
1030 in SImode, DImode and TImode */
1031 2, /* MMX or SSE register to integer */
1032 /* On K8:
1033 MOVD reg64, xmmreg Double FSTORE 4
1034 MOVD reg32, xmmreg Double FSTORE 4
1035 On AMDFAM10:
1036 MOVD reg64, xmmreg Double FADD 3
1037 1/1 1/1
1038 MOVD reg32, xmmreg Double FADD 3
1039 1/1 1/1 */
1040 16, /* size of l1 cache. */
1041 2048, /* size of l2 cache. */
1042 64, /* size of prefetch block */
1043   /* New AMD processors never drop prefetches; if they cannot be performed
1044      immediately, they are queued. We set the number of simultaneous prefetches
1045      to a large constant to reflect this (leaving the number of prefetches
1046      entirely unlimited is probably not a good idea either, as their execution
1047      also takes some time). */
1048 100, /* number of parallel prefetches */
1049 2, /* Branch cost */
1050 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1051 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1052 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1053 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1054 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1055 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1056
1057 bdver1_memcpy,
1058 bdver1_memset,
1059 6, /* scalar_stmt_cost. */
1060 4, /* scalar load_cost. */
1061 4, /* scalar_store_cost. */
1062 6, /* vec_stmt_cost. */
1063 0, /* vec_to_scalar_cost. */
1064 2, /* scalar_to_vec_cost. */
1065 4, /* vec_align_load_cost. */
1066 4, /* vec_unalign_load_cost. */
1067 4, /* vec_store_cost. */
1068 4, /* cond_taken_branch_cost. */
1069 2, /* cond_not_taken_branch_cost. */
1070 };
1071
1072 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1073    very small blocks it is better to use a loop. For large blocks, a libcall
1074    can do nontemporal accesses and beat inline code considerably. */
1075
1076 static stringop_algs bdver2_memcpy[2] = {
1077 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1078 {-1, rep_prefix_4_byte, false}}},
1079 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1080 {-1, libcall, false}}}};
1081 static stringop_algs bdver2_memset[2] = {
1082 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086
1087 const struct processor_costs bdver2_cost = {
1088 COSTS_N_INSNS (1), /* cost of an add instruction */
1089 COSTS_N_INSNS (1), /* cost of a lea instruction */
1090 COSTS_N_INSNS (1), /* variable shift costs */
1091 COSTS_N_INSNS (1), /* constant shift costs */
1092 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1093 COSTS_N_INSNS (4), /* HI */
1094 COSTS_N_INSNS (4), /* SI */
1095 COSTS_N_INSNS (6), /* DI */
1096 COSTS_N_INSNS (6)}, /* other */
1097 0, /* cost of multiply per each bit set */
1098 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1099 COSTS_N_INSNS (35), /* HI */
1100 COSTS_N_INSNS (51), /* SI */
1101 COSTS_N_INSNS (83), /* DI */
1102 COSTS_N_INSNS (83)}, /* other */
1103 COSTS_N_INSNS (1), /* cost of movsx */
1104 COSTS_N_INSNS (1), /* cost of movzx */
1105 8, /* "large" insn */
1106 9, /* MOVE_RATIO */
1107 4, /* cost for loading QImode using movzbl */
1108 {5, 5, 4}, /* cost of loading integer registers
1109 in QImode, HImode and SImode.
1110 Relative to reg-reg move (2). */
1111 {4, 4, 4}, /* cost of storing integer registers */
1112 2, /* cost of reg,reg fld/fst */
1113 {5, 5, 12}, /* cost of loading fp registers
1114 in SFmode, DFmode and XFmode */
1115 {4, 4, 8}, /* cost of storing fp registers
1116 in SFmode, DFmode and XFmode */
1117 2, /* cost of moving MMX register */
1118 {4, 4}, /* cost of loading MMX registers
1119 in SImode and DImode */
1120 {4, 4}, /* cost of storing MMX registers
1121 in SImode and DImode */
1122 2, /* cost of moving SSE register */
1123 {4, 4, 4}, /* cost of loading SSE registers
1124 in SImode, DImode and TImode */
1125 {4, 4, 4}, /* cost of storing SSE registers
1126 in SImode, DImode and TImode */
1127 2, /* MMX or SSE register to integer */
1128 /* On K8:
1129 MOVD reg64, xmmreg Double FSTORE 4
1130 MOVD reg32, xmmreg Double FSTORE 4
1131 On AMDFAM10:
1132 MOVD reg64, xmmreg Double FADD 3
1133 1/1 1/1
1134 MOVD reg32, xmmreg Double FADD 3
1135 1/1 1/1 */
1136 16, /* size of l1 cache. */
1137 2048, /* size of l2 cache. */
1138 64, /* size of prefetch block */
1139   /* New AMD processors never drop prefetches; if they cannot be performed
1140      immediately, they are queued. We set the number of simultaneous prefetches
1141      to a large constant to reflect this (leaving the number of prefetches
1142      entirely unlimited is probably not a good idea either, as their execution
1143      also takes some time). */
1144 100, /* number of parallel prefetches */
1145 2, /* Branch cost */
1146 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1147 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1148 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1149 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1150 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1151 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1152
1153 bdver2_memcpy,
1154 bdver2_memset,
1155 6, /* scalar_stmt_cost. */
1156 4, /* scalar load_cost. */
1157 4, /* scalar_store_cost. */
1158 6, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 4, /* vec_align_load_cost. */
1162 4, /* vec_unalign_load_cost. */
1163 4, /* vec_store_cost. */
1164 4, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168
1169 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1170    very small blocks it is better to use a loop. For large blocks, a libcall
1171    can do nontemporal accesses and beat inline code considerably. */
1172 static stringop_algs bdver3_memcpy[2] = {
1173 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1174 {-1, rep_prefix_4_byte, false}}},
1175 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 static stringop_algs bdver3_memset[2] = {
1178 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1179 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1180 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1181 {-1, libcall, false}}}};
1182 struct processor_costs bdver3_cost = {
1183 COSTS_N_INSNS (1), /* cost of an add instruction */
1184 COSTS_N_INSNS (1), /* cost of a lea instruction */
1185 COSTS_N_INSNS (1), /* variable shift costs */
1186 COSTS_N_INSNS (1), /* constant shift costs */
1187 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1188 COSTS_N_INSNS (4), /* HI */
1189 COSTS_N_INSNS (4), /* SI */
1190 COSTS_N_INSNS (6), /* DI */
1191 COSTS_N_INSNS (6)}, /* other */
1192 0, /* cost of multiply per each bit set */
1193 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1194 COSTS_N_INSNS (35), /* HI */
1195 COSTS_N_INSNS (51), /* SI */
1196 COSTS_N_INSNS (83), /* DI */
1197 COSTS_N_INSNS (83)}, /* other */
1198 COSTS_N_INSNS (1), /* cost of movsx */
1199 COSTS_N_INSNS (1), /* cost of movzx */
1200 8, /* "large" insn */
1201 9, /* MOVE_RATIO */
1202 4, /* cost for loading QImode using movzbl */
1203 {5, 5, 4}, /* cost of loading integer registers
1204 in QImode, HImode and SImode.
1205 Relative to reg-reg move (2). */
1206 {4, 4, 4}, /* cost of storing integer registers */
1207 2, /* cost of reg,reg fld/fst */
1208 {5, 5, 12}, /* cost of loading fp registers
1209 in SFmode, DFmode and XFmode */
1210 {4, 4, 8}, /* cost of storing fp registers
1211 in SFmode, DFmode and XFmode */
1212 2, /* cost of moving MMX register */
1213 {4, 4}, /* cost of loading MMX registers
1214 in SImode and DImode */
1215 {4, 4}, /* cost of storing MMX registers
1216 in SImode and DImode */
1217 2, /* cost of moving SSE register */
1218 {4, 4, 4}, /* cost of loading SSE registers
1219 in SImode, DImode and TImode */
1220 {4, 4, 4}, /* cost of storing SSE registers
1221 in SImode, DImode and TImode */
1222 2, /* MMX or SSE register to integer */
1223 16, /* size of l1 cache. */
1224 2048, /* size of l2 cache. */
1225 64, /* size of prefetch block */
1226   /* New AMD processors never drop prefetches; if they cannot be performed
1227      immediately, they are queued. We set the number of simultaneous prefetches
1228      to a large constant to reflect this (leaving the number of prefetches
1229      entirely unlimited is probably not a good idea either, as their execution
1230      also takes some time). */
1231 100, /* number of parallel prefetches */
1232 2, /* Branch cost */
1233 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1234 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1235 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1236 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1237 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1238 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1239
1240 bdver3_memcpy,
1241 bdver3_memset,
1242 6, /* scalar_stmt_cost. */
1243 4, /* scalar load_cost. */
1244 4, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 4, /* vec_align_load_cost. */
1249 4, /* vec_unalign_load_cost. */
1250 4, /* vec_store_cost. */
1251 4, /* cond_taken_branch_cost. */
1252 2, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1256    very small blocks it is better to use a loop. For large blocks, a libcall
1257    can do nontemporal accesses and beat inline code considerably. */
1258 static stringop_algs bdver4_memcpy[2] = {
1259 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1260 {-1, rep_prefix_4_byte, false}}},
1261 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 static stringop_algs bdver4_memset[2] = {
1264 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1265 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1266 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1267 {-1, libcall, false}}}};
1268 struct processor_costs bdver4_cost = {
1269 COSTS_N_INSNS (1), /* cost of an add instruction */
1270 COSTS_N_INSNS (1), /* cost of a lea instruction */
1271 COSTS_N_INSNS (1), /* variable shift costs */
1272 COSTS_N_INSNS (1), /* constant shift costs */
1273 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1274 COSTS_N_INSNS (4), /* HI */
1275 COSTS_N_INSNS (4), /* SI */
1276 COSTS_N_INSNS (6), /* DI */
1277 COSTS_N_INSNS (6)}, /* other */
1278 0, /* cost of multiply per each bit set */
1279 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1280 COSTS_N_INSNS (35), /* HI */
1281 COSTS_N_INSNS (51), /* SI */
1282 COSTS_N_INSNS (83), /* DI */
1283 COSTS_N_INSNS (83)}, /* other */
1284 COSTS_N_INSNS (1), /* cost of movsx */
1285 COSTS_N_INSNS (1), /* cost of movzx */
1286 8, /* "large" insn */
1287 9, /* MOVE_RATIO */
1288 4, /* cost for loading QImode using movzbl */
1289 {5, 5, 4}, /* cost of loading integer registers
1290 in QImode, HImode and SImode.
1291 Relative to reg-reg move (2). */
1292 {4, 4, 4}, /* cost of storing integer registers */
1293 2, /* cost of reg,reg fld/fst */
1294 {5, 5, 12}, /* cost of loading fp registers
1295 in SFmode, DFmode and XFmode */
1296 {4, 4, 8}, /* cost of storing fp registers
1297 in SFmode, DFmode and XFmode */
1298 2, /* cost of moving MMX register */
1299 {4, 4}, /* cost of loading MMX registers
1300 in SImode and DImode */
1301 {4, 4}, /* cost of storing MMX registers
1302 in SImode and DImode */
1303 2, /* cost of moving SSE register */
1304 {4, 4, 4}, /* cost of loading SSE registers
1305 in SImode, DImode and TImode */
1306 {4, 4, 4}, /* cost of storing SSE registers
1307 in SImode, DImode and TImode */
1308 2, /* MMX or SSE register to integer */
1309 16, /* size of l1 cache. */
1310 2048, /* size of l2 cache. */
1311 64, /* size of prefetch block */
1312   /* New AMD processors never drop prefetches; if they cannot be performed
1313      immediately, they are queued. We set the number of simultaneous prefetches
1314      to a large constant to reflect this (leaving the number of prefetches
1315      entirely unlimited is probably not a good idea either, as their execution
1316      also takes some time). */
1317 100, /* number of parallel prefetches */
1318 2, /* Branch cost */
1319 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1320 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1321 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1322 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1323 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1324 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1325
1326 bdver4_memcpy,
1327 bdver4_memset,
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 4, /* cond_taken_branch_cost. */
1338 2, /* cond_not_taken_branch_cost. */
1339 };
1340
1341
1342 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1343    very small blocks it is better to use a loop. For large blocks, a libcall
1344    can do nontemporal accesses and beat inline code considerably. */
1345 static stringop_algs znver1_memcpy[2] = {
1346 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1347 {-1, rep_prefix_4_byte, false}}},
1348 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1349 {-1, libcall, false}}}};
1350 static stringop_algs znver1_memset[2] = {
1351 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1352 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1353 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1354 {-1, libcall, false}}}};
1355 struct processor_costs znver1_cost = {
1356 COSTS_N_INSNS (1), /* cost of an add instruction. */
1357 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1358 COSTS_N_INSNS (1), /* variable shift costs. */
1359 COSTS_N_INSNS (1), /* constant shift costs. */
1360 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1361 COSTS_N_INSNS (3), /* HI. */
1362 COSTS_N_INSNS (3), /* SI. */
1363 COSTS_N_INSNS (4), /* DI. */
1364 COSTS_N_INSNS (4)}, /* other. */
1365 0, /* cost of multiply per each bit
1366 set. */
1367 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1368 COSTS_N_INSNS (35), /* HI. */
1369 COSTS_N_INSNS (51), /* SI. */
1370 COSTS_N_INSNS (83), /* DI. */
1371 COSTS_N_INSNS (83)}, /* other. */
1372 COSTS_N_INSNS (1), /* cost of movsx. */
1373 COSTS_N_INSNS (1), /* cost of movzx. */
1374 8, /* "large" insn. */
1375 9, /* MOVE_RATIO. */
1376 4, /* cost for loading QImode using
1377 movzbl. */
1378 {5, 5, 4}, /* cost of loading integer registers
1379 in QImode, HImode and SImode.
1380 Relative to reg-reg move (2). */
1381 {4, 4, 4}, /* cost of storing integer
1382 registers. */
1383 2, /* cost of reg,reg fld/fst. */
1384 {5, 5, 12}, /* cost of loading fp registers
1385 in SFmode, DFmode and XFmode. */
1386 {4, 4, 8}, /* cost of storing fp registers
1387 in SFmode, DFmode and XFmode. */
1388 2, /* cost of moving MMX register. */
1389 {4, 4}, /* cost of loading MMX registers
1390 in SImode and DImode. */
1391 {4, 4}, /* cost of storing MMX registers
1392 in SImode and DImode. */
1393 2, /* cost of moving SSE register. */
1394 {4, 4, 4}, /* cost of loading SSE registers
1395 in SImode, DImode and TImode. */
1396 {4, 4, 4}, /* cost of storing SSE registers
1397 in SImode, DImode and TImode. */
1398 2, /* MMX or SSE register to integer. */
1399 32, /* size of l1 cache. */
1400 512, /* size of l2 cache. */
1401 64, /* size of prefetch block. */
1402   /* New AMD processors never drop prefetches; if they cannot be performed
1403      immediately, they are queued. We set the number of simultaneous prefetches
1404      to a large constant to reflect this (leaving the number of prefetches
1405      entirely unlimited is probably not a good idea either, as their execution
1406      also takes some time). */
1407 100, /* number of parallel prefetches. */
1408 2, /* Branch cost. */
1409 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1410 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1411 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1412 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1413 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1414 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1415
1416 znver1_memcpy,
1417 znver1_memset,
1418 6, /* scalar_stmt_cost. */
1419 4, /* scalar load_cost. */
1420 4, /* scalar_store_cost. */
1421 6, /* vec_stmt_cost. */
1422 0, /* vec_to_scalar_cost. */
1423 2, /* scalar_to_vec_cost. */
1424 4, /* vec_align_load_cost. */
1425 4, /* vec_unalign_load_cost. */
1426 4, /* vec_store_cost. */
1427 4, /* cond_taken_branch_cost. */
1428 2, /* cond_not_taken_branch_cost. */
1429 };
1430
1431 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1432    very small blocks it is better to use a loop.  For large blocks, a libcall
1433    can do nontemporal accesses and beat inline code considerably. */
1434 static stringop_algs btver1_memcpy[2] = {
1435 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1436 {-1, rep_prefix_4_byte, false}}},
1437 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1438 {-1, libcall, false}}}};
1439 static stringop_algs btver1_memset[2] = {
1440 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1441 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1442 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1443 {-1, libcall, false}}}};
1444 const struct processor_costs btver1_cost = {
1445 COSTS_N_INSNS (1), /* cost of an add instruction */
1446 COSTS_N_INSNS (2), /* cost of a lea instruction */
1447 COSTS_N_INSNS (1), /* variable shift costs */
1448 COSTS_N_INSNS (1), /* constant shift costs */
1449 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1450 COSTS_N_INSNS (4), /* HI */
1451 COSTS_N_INSNS (3), /* SI */
1452 COSTS_N_INSNS (4), /* DI */
1453 COSTS_N_INSNS (5)}, /* other */
1454 0, /* cost of multiply per each bit set */
1455 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1456 COSTS_N_INSNS (35), /* HI */
1457 COSTS_N_INSNS (51), /* SI */
1458 COSTS_N_INSNS (83), /* DI */
1459 COSTS_N_INSNS (83)}, /* other */
1460 COSTS_N_INSNS (1), /* cost of movsx */
1461 COSTS_N_INSNS (1), /* cost of movzx */
1462 8, /* "large" insn */
1463 9, /* MOVE_RATIO */
1464 4, /* cost for loading QImode using movzbl */
1465 {3, 4, 3}, /* cost of loading integer registers
1466 in QImode, HImode and SImode.
1467 Relative to reg-reg move (2). */
1468 {3, 4, 3}, /* cost of storing integer registers */
1469 4, /* cost of reg,reg fld/fst */
1470 {4, 4, 12}, /* cost of loading fp registers
1471 in SFmode, DFmode and XFmode */
1472 {6, 6, 8}, /* cost of storing fp registers
1473 in SFmode, DFmode and XFmode */
1474 2, /* cost of moving MMX register */
1475 {3, 3}, /* cost of loading MMX registers
1476 in SImode and DImode */
1477 {4, 4}, /* cost of storing MMX registers
1478 in SImode and DImode */
1479 2, /* cost of moving SSE register */
1480 {4, 4, 3}, /* cost of loading SSE registers
1481 in SImode, DImode and TImode */
1482 {4, 4, 5}, /* cost of storing SSE registers
1483 in SImode, DImode and TImode */
1484 3, /* MMX or SSE register to integer */
1485 /* On K8:
1486 MOVD reg64, xmmreg Double FSTORE 4
1487 MOVD reg32, xmmreg Double FSTORE 4
1488 On AMDFAM10:
1489 MOVD reg64, xmmreg Double FADD 3
1490 1/1 1/1
1491 MOVD reg32, xmmreg Double FADD 3
1492 1/1 1/1 */
1493 32, /* size of l1 cache. */
1494 512, /* size of l2 cache. */
1495 64, /* size of prefetch block */
1496 100, /* number of parallel prefetches */
1497 2, /* Branch cost */
1498 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1499 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1500 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1501 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1502 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1503 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1504
1505 btver1_memcpy,
1506 btver1_memset,
1507 4, /* scalar_stmt_cost. */
1508 2, /* scalar load_cost. */
1509 2, /* scalar_store_cost. */
1510 6, /* vec_stmt_cost. */
1511 0, /* vec_to_scalar_cost. */
1512 2, /* scalar_to_vec_cost. */
1513 2, /* vec_align_load_cost. */
1514 2, /* vec_unalign_load_cost. */
1515 2, /* vec_store_cost. */
1516 2, /* cond_taken_branch_cost. */
1517 1, /* cond_not_taken_branch_cost. */
1518 };
1519
1520 static stringop_algs btver2_memcpy[2] = {
1521 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1522 {-1, rep_prefix_4_byte, false}}},
1523 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1524 {-1, libcall, false}}}};
1525 static stringop_algs btver2_memset[2] = {
1526 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1527 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1528 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1529 {-1, libcall, false}}}};
1530 const struct processor_costs btver2_cost = {
1531 COSTS_N_INSNS (1), /* cost of an add instruction */
1532 COSTS_N_INSNS (2), /* cost of a lea instruction */
1533 COSTS_N_INSNS (1), /* variable shift costs */
1534 COSTS_N_INSNS (1), /* constant shift costs */
1535 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1536 COSTS_N_INSNS (4), /* HI */
1537 COSTS_N_INSNS (3), /* SI */
1538 COSTS_N_INSNS (4), /* DI */
1539 COSTS_N_INSNS (5)}, /* other */
1540 0, /* cost of multiply per each bit set */
1541 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1542 COSTS_N_INSNS (35), /* HI */
1543 COSTS_N_INSNS (51), /* SI */
1544 COSTS_N_INSNS (83), /* DI */
1545 COSTS_N_INSNS (83)}, /* other */
1546 COSTS_N_INSNS (1), /* cost of movsx */
1547 COSTS_N_INSNS (1), /* cost of movzx */
1548 8, /* "large" insn */
1549 9, /* MOVE_RATIO */
1550 4, /* cost for loading QImode using movzbl */
1551 {3, 4, 3}, /* cost of loading integer registers
1552 in QImode, HImode and SImode.
1553 Relative to reg-reg move (2). */
1554 {3, 4, 3}, /* cost of storing integer registers */
1555 4, /* cost of reg,reg fld/fst */
1556 {4, 4, 12}, /* cost of loading fp registers
1557 in SFmode, DFmode and XFmode */
1558 {6, 6, 8}, /* cost of storing fp registers
1559 in SFmode, DFmode and XFmode */
1560 2, /* cost of moving MMX register */
1561 {3, 3}, /* cost of loading MMX registers
1562 in SImode and DImode */
1563 {4, 4}, /* cost of storing MMX registers
1564 in SImode and DImode */
1565 2, /* cost of moving SSE register */
1566 {4, 4, 3}, /* cost of loading SSE registers
1567 in SImode, DImode and TImode */
1568 {4, 4, 5}, /* cost of storing SSE registers
1569 in SImode, DImode and TImode */
1570 3, /* MMX or SSE register to integer */
1571 /* On K8:
1572 MOVD reg64, xmmreg Double FSTORE 4
1573 MOVD reg32, xmmreg Double FSTORE 4
1574 On AMDFAM10:
1575 MOVD reg64, xmmreg Double FADD 3
1576 1/1 1/1
1577 MOVD reg32, xmmreg Double FADD 3
1578 1/1 1/1 */
1579 32, /* size of l1 cache. */
1580 2048, /* size of l2 cache. */
1581 64, /* size of prefetch block */
1582 100, /* number of parallel prefetches */
1583 2, /* Branch cost */
1584 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1585 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1586 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1587 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1588 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1589 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1590 btver2_memcpy,
1591 btver2_memset,
1592 4, /* scalar_stmt_cost. */
1593 2, /* scalar load_cost. */
1594 2, /* scalar_store_cost. */
1595 6, /* vec_stmt_cost. */
1596 0, /* vec_to_scalar_cost. */
1597 2, /* scalar_to_vec_cost. */
1598 2, /* vec_align_load_cost. */
1599 2, /* vec_unalign_load_cost. */
1600 2, /* vec_store_cost. */
1601 2, /* cond_taken_branch_cost. */
1602 1, /* cond_not_taken_branch_cost. */
1603 };
1604
1605 static stringop_algs pentium4_memcpy[2] = {
1606 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1607 DUMMY_STRINGOP_ALGS};
1608 static stringop_algs pentium4_memset[2] = {
1609 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1610 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1611 DUMMY_STRINGOP_ALGS};
1612
1613 static const
1614 struct processor_costs pentium4_cost = {
1615 COSTS_N_INSNS (1), /* cost of an add instruction */
1616 COSTS_N_INSNS (3), /* cost of a lea instruction */
1617 COSTS_N_INSNS (4), /* variable shift costs */
1618 COSTS_N_INSNS (4), /* constant shift costs */
1619 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1620 COSTS_N_INSNS (15), /* HI */
1621 COSTS_N_INSNS (15), /* SI */
1622 COSTS_N_INSNS (15), /* DI */
1623 COSTS_N_INSNS (15)}, /* other */
1624 0, /* cost of multiply per each bit set */
1625 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1626 COSTS_N_INSNS (56), /* HI */
1627 COSTS_N_INSNS (56), /* SI */
1628 COSTS_N_INSNS (56), /* DI */
1629 COSTS_N_INSNS (56)}, /* other */
1630 COSTS_N_INSNS (1), /* cost of movsx */
1631 COSTS_N_INSNS (1), /* cost of movzx */
1632 16, /* "large" insn */
1633 6, /* MOVE_RATIO */
1634 2, /* cost for loading QImode using movzbl */
1635 {4, 5, 4}, /* cost of loading integer registers
1636 in QImode, HImode and SImode.
1637 Relative to reg-reg move (2). */
1638 {2, 3, 2}, /* cost of storing integer registers */
1639 2, /* cost of reg,reg fld/fst */
1640 {2, 2, 6}, /* cost of loading fp registers
1641 in SFmode, DFmode and XFmode */
1642 {4, 4, 6}, /* cost of storing fp registers
1643 in SFmode, DFmode and XFmode */
1644 2, /* cost of moving MMX register */
1645 {2, 2}, /* cost of loading MMX registers
1646 in SImode and DImode */
1647 {2, 2}, /* cost of storing MMX registers
1648 in SImode and DImode */
1649 12, /* cost of moving SSE register */
1650 {12, 12, 12}, /* cost of loading SSE registers
1651 in SImode, DImode and TImode */
1652 {2, 2, 8}, /* cost of storing SSE registers
1653 in SImode, DImode and TImode */
1654 10, /* MMX or SSE register to integer */
1655 8, /* size of l1 cache. */
1656 256, /* size of l2 cache. */
1657 64, /* size of prefetch block */
1658 6, /* number of parallel prefetches */
1659 2, /* Branch cost */
1660 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1661 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1662 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1663 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1664 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1665 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1666 pentium4_memcpy,
1667 pentium4_memset,
1668 1, /* scalar_stmt_cost. */
1669 1, /* scalar load_cost. */
1670 1, /* scalar_store_cost. */
1671 1, /* vec_stmt_cost. */
1672 1, /* vec_to_scalar_cost. */
1673 1, /* scalar_to_vec_cost. */
1674 1, /* vec_align_load_cost. */
1675 2, /* vec_unalign_load_cost. */
1676 1, /* vec_store_cost. */
1677 3, /* cond_taken_branch_cost. */
1678 1, /* cond_not_taken_branch_cost. */
1679 };
1680
1681 static stringop_algs nocona_memcpy[2] = {
1682 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1683 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1684 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1685
1686 static stringop_algs nocona_memset[2] = {
1687 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1690 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1691
1692 static const
1693 struct processor_costs nocona_cost = {
1694 COSTS_N_INSNS (1), /* cost of an add instruction */
1695 COSTS_N_INSNS (1), /* cost of a lea instruction */
1696 COSTS_N_INSNS (1), /* variable shift costs */
1697 COSTS_N_INSNS (1), /* constant shift costs */
1698 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1699 COSTS_N_INSNS (10), /* HI */
1700 COSTS_N_INSNS (10), /* SI */
1701 COSTS_N_INSNS (10), /* DI */
1702 COSTS_N_INSNS (10)}, /* other */
1703 0, /* cost of multiply per each bit set */
1704 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1705 COSTS_N_INSNS (66), /* HI */
1706 COSTS_N_INSNS (66), /* SI */
1707 COSTS_N_INSNS (66), /* DI */
1708 COSTS_N_INSNS (66)}, /* other */
1709 COSTS_N_INSNS (1), /* cost of movsx */
1710 COSTS_N_INSNS (1), /* cost of movzx */
1711 16, /* "large" insn */
1712 17, /* MOVE_RATIO */
1713 4, /* cost for loading QImode using movzbl */
1714 {4, 4, 4}, /* cost of loading integer registers
1715 in QImode, HImode and SImode.
1716 Relative to reg-reg move (2). */
1717 {4, 4, 4}, /* cost of storing integer registers */
1718 3, /* cost of reg,reg fld/fst */
1719 {12, 12, 12}, /* cost of loading fp registers
1720 in SFmode, DFmode and XFmode */
1721 {4, 4, 4}, /* cost of storing fp registers
1722 in SFmode, DFmode and XFmode */
1723 6, /* cost of moving MMX register */
1724 {12, 12}, /* cost of loading MMX registers
1725 in SImode and DImode */
1726 {12, 12}, /* cost of storing MMX registers
1727 in SImode and DImode */
1728 6, /* cost of moving SSE register */
1729 {12, 12, 12}, /* cost of loading SSE registers
1730 in SImode, DImode and TImode */
1731 {12, 12, 12}, /* cost of storing SSE registers
1732 in SImode, DImode and TImode */
1733 8, /* MMX or SSE register to integer */
1734 8, /* size of l1 cache. */
1735 1024, /* size of l2 cache. */
1736 64, /* size of prefetch block */
1737 8, /* number of parallel prefetches */
1738 1, /* Branch cost */
1739 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1740 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1741 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1742 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1743 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1744 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1745 nocona_memcpy,
1746 nocona_memset,
1747 1, /* scalar_stmt_cost. */
1748 1, /* scalar load_cost. */
1749 1, /* scalar_store_cost. */
1750 1, /* vec_stmt_cost. */
1751 1, /* vec_to_scalar_cost. */
1752 1, /* scalar_to_vec_cost. */
1753 1, /* vec_align_load_cost. */
1754 2, /* vec_unalign_load_cost. */
1755 1, /* vec_store_cost. */
1756 3, /* cond_taken_branch_cost. */
1757 1, /* cond_not_taken_branch_cost. */
1758 };
1759
1760 static stringop_algs atom_memcpy[2] = {
1761 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1762 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1763 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1764 static stringop_algs atom_memset[2] = {
1765 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1766 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1767 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1768 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1769 static const
1770 struct processor_costs atom_cost = {
1771 COSTS_N_INSNS (1), /* cost of an add instruction */
1772 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1773 COSTS_N_INSNS (1), /* variable shift costs */
1774 COSTS_N_INSNS (1), /* constant shift costs */
1775 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1776 COSTS_N_INSNS (4), /* HI */
1777 COSTS_N_INSNS (3), /* SI */
1778 COSTS_N_INSNS (4), /* DI */
1779 COSTS_N_INSNS (2)}, /* other */
1780 0, /* cost of multiply per each bit set */
1781 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1782 COSTS_N_INSNS (26), /* HI */
1783 COSTS_N_INSNS (42), /* SI */
1784 COSTS_N_INSNS (74), /* DI */
1785 COSTS_N_INSNS (74)}, /* other */
1786 COSTS_N_INSNS (1), /* cost of movsx */
1787 COSTS_N_INSNS (1), /* cost of movzx */
1788 8, /* "large" insn */
1789 17, /* MOVE_RATIO */
1790 4, /* cost for loading QImode using movzbl */
1791 {4, 4, 4}, /* cost of loading integer registers
1792 in QImode, HImode and SImode.
1793 Relative to reg-reg move (2). */
1794 {4, 4, 4}, /* cost of storing integer registers */
1795 4, /* cost of reg,reg fld/fst */
1796 {12, 12, 12}, /* cost of loading fp registers
1797 in SFmode, DFmode and XFmode */
1798 {6, 6, 8}, /* cost of storing fp registers
1799 in SFmode, DFmode and XFmode */
1800 2, /* cost of moving MMX register */
1801 {8, 8}, /* cost of loading MMX registers
1802 in SImode and DImode */
1803 {8, 8}, /* cost of storing MMX registers
1804 in SImode and DImode */
1805 2, /* cost of moving SSE register */
1806 {8, 8, 8}, /* cost of loading SSE registers
1807 in SImode, DImode and TImode */
1808 {8, 8, 8}, /* cost of storing SSE registers
1809 in SImode, DImode and TImode */
1810 5, /* MMX or SSE register to integer */
1811 32, /* size of l1 cache. */
1812 256, /* size of l2 cache. */
1813 64, /* size of prefetch block */
1814 6, /* number of parallel prefetches */
1815 3, /* Branch cost */
1816 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1817 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1818 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1819 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1820 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1821 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1822 atom_memcpy,
1823 atom_memset,
1824 1, /* scalar_stmt_cost. */
1825 1, /* scalar load_cost. */
1826 1, /* scalar_store_cost. */
1827 1, /* vec_stmt_cost. */
1828 1, /* vec_to_scalar_cost. */
1829 1, /* scalar_to_vec_cost. */
1830 1, /* vec_align_load_cost. */
1831 2, /* vec_unalign_load_cost. */
1832 1, /* vec_store_cost. */
1833 3, /* cond_taken_branch_cost. */
1834 1, /* cond_not_taken_branch_cost. */
1835 };
1836
1837 static stringop_algs slm_memcpy[2] = {
1838 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1839 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1840 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1841 static stringop_algs slm_memset[2] = {
1842 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1843 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1844 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1845 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1846 static const
1847 struct processor_costs slm_cost = {
1848 COSTS_N_INSNS (1), /* cost of an add instruction */
1849 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1850 COSTS_N_INSNS (1), /* variable shift costs */
1851 COSTS_N_INSNS (1), /* constant shift costs */
1852 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1853 COSTS_N_INSNS (3), /* HI */
1854 COSTS_N_INSNS (3), /* SI */
1855 COSTS_N_INSNS (4), /* DI */
1856 COSTS_N_INSNS (2)}, /* other */
1857 0, /* cost of multiply per each bit set */
1858 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1859 COSTS_N_INSNS (26), /* HI */
1860 COSTS_N_INSNS (42), /* SI */
1861 COSTS_N_INSNS (74), /* DI */
1862 COSTS_N_INSNS (74)}, /* other */
1863 COSTS_N_INSNS (1), /* cost of movsx */
1864 COSTS_N_INSNS (1), /* cost of movzx */
1865 8, /* "large" insn */
1866 17, /* MOVE_RATIO */
1867 4, /* cost for loading QImode using movzbl */
1868 {4, 4, 4}, /* cost of loading integer registers
1869 in QImode, HImode and SImode.
1870 Relative to reg-reg move (2). */
1871 {4, 4, 4}, /* cost of storing integer registers */
1872 4, /* cost of reg,reg fld/fst */
1873 {12, 12, 12}, /* cost of loading fp registers
1874 in SFmode, DFmode and XFmode */
1875 {6, 6, 8}, /* cost of storing fp registers
1876 in SFmode, DFmode and XFmode */
1877 2, /* cost of moving MMX register */
1878 {8, 8}, /* cost of loading MMX registers
1879 in SImode and DImode */
1880 {8, 8}, /* cost of storing MMX registers
1881 in SImode and DImode */
1882 2, /* cost of moving SSE register */
1883 {8, 8, 8}, /* cost of loading SSE registers
1884 in SImode, DImode and TImode */
1885 {8, 8, 8}, /* cost of storing SSE registers
1886 in SImode, DImode and TImode */
1887 5, /* MMX or SSE register to integer */
1888 32, /* size of l1 cache. */
1889 256, /* size of l2 cache. */
1890 64, /* size of prefetch block */
1891 6, /* number of parallel prefetches */
1892 3, /* Branch cost */
1893 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1894 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1895 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1896 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1897 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1898 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1899 slm_memcpy,
1900 slm_memset,
1901 1, /* scalar_stmt_cost. */
1902 1, /* scalar load_cost. */
1903 1, /* scalar_store_cost. */
1904 1, /* vec_stmt_cost. */
1905 4, /* vec_to_scalar_cost. */
1906 1, /* scalar_to_vec_cost. */
1907 1, /* vec_align_load_cost. */
1908 2, /* vec_unalign_load_cost. */
1909 1, /* vec_store_cost. */
1910 3, /* cond_taken_branch_cost. */
1911 1, /* cond_not_taken_branch_cost. */
1912 };
1913
1914 static stringop_algs intel_memcpy[2] = {
1915 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1916 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1917 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1918 static stringop_algs intel_memset[2] = {
1919 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1920 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1921 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1922 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1923 static const
1924 struct processor_costs intel_cost = {
1925 COSTS_N_INSNS (1), /* cost of an add instruction */
1926 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1927 COSTS_N_INSNS (1), /* variable shift costs */
1928 COSTS_N_INSNS (1), /* constant shift costs */
1929 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1930 COSTS_N_INSNS (3), /* HI */
1931 COSTS_N_INSNS (3), /* SI */
1932 COSTS_N_INSNS (4), /* DI */
1933 COSTS_N_INSNS (2)}, /* other */
1934 0, /* cost of multiply per each bit set */
1935 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1936 COSTS_N_INSNS (26), /* HI */
1937 COSTS_N_INSNS (42), /* SI */
1938 COSTS_N_INSNS (74), /* DI */
1939 COSTS_N_INSNS (74)}, /* other */
1940 COSTS_N_INSNS (1), /* cost of movsx */
1941 COSTS_N_INSNS (1), /* cost of movzx */
1942 8, /* "large" insn */
1943 17, /* MOVE_RATIO */
1944 4, /* cost for loading QImode using movzbl */
1945 {4, 4, 4}, /* cost of loading integer registers
1946 in QImode, HImode and SImode.
1947 Relative to reg-reg move (2). */
1948 {4, 4, 4}, /* cost of storing integer registers */
1949 4, /* cost of reg,reg fld/fst */
1950 {12, 12, 12}, /* cost of loading fp registers
1951 in SFmode, DFmode and XFmode */
1952 {6, 6, 8}, /* cost of storing fp registers
1953 in SFmode, DFmode and XFmode */
1954 2, /* cost of moving MMX register */
1955 {8, 8}, /* cost of loading MMX registers
1956 in SImode and DImode */
1957 {8, 8}, /* cost of storing MMX registers
1958 in SImode and DImode */
1959 2, /* cost of moving SSE register */
1960 {8, 8, 8}, /* cost of loading SSE registers
1961 in SImode, DImode and TImode */
1962 {8, 8, 8}, /* cost of storing SSE registers
1963 in SImode, DImode and TImode */
1964 5, /* MMX or SSE register to integer */
1965 32, /* size of l1 cache. */
1966 256, /* size of l2 cache. */
1967 64, /* size of prefetch block */
1968 6, /* number of parallel prefetches */
1969 3, /* Branch cost */
1970 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1971 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1972 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1973 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1974 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1975 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1976 intel_memcpy,
1977 intel_memset,
1978 1, /* scalar_stmt_cost. */
1979 1, /* scalar load_cost. */
1980 1, /* scalar_store_cost. */
1981 1, /* vec_stmt_cost. */
1982 4, /* vec_to_scalar_cost. */
1983 1, /* scalar_to_vec_cost. */
1984 1, /* vec_align_load_cost. */
1985 2, /* vec_unalign_load_cost. */
1986 1, /* vec_store_cost. */
1987 3, /* cond_taken_branch_cost. */
1988 1, /* cond_not_taken_branch_cost. */
1989 };
1990
1991 /* Generic should produce code tuned for Core-i7 (and newer chips)
1992 and btver1 (and newer chips). */
1993
1994 static stringop_algs generic_memcpy[2] = {
1995 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1996 {-1, libcall, false}}},
1997 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1998 {-1, libcall, false}}}};
1999 static stringop_algs generic_memset[2] = {
2000 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2001 {-1, libcall, false}}},
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2003 {-1, libcall, false}}}};
2004 static const
2005 struct processor_costs generic_cost = {
2006 COSTS_N_INSNS (1), /* cost of an add instruction */
2007   /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2008      this cost, however, our current implementation of synth_mult results in
2009      the use of unnecessary temporary registers, causing regressions on several
2010      SPECfp benchmarks. */
2011 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2012 COSTS_N_INSNS (1), /* variable shift costs */
2013 COSTS_N_INSNS (1), /* constant shift costs */
2014 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2015 COSTS_N_INSNS (4), /* HI */
2016 COSTS_N_INSNS (3), /* SI */
2017 COSTS_N_INSNS (4), /* DI */
2018 COSTS_N_INSNS (2)}, /* other */
2019 0, /* cost of multiply per each bit set */
2020 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2021 COSTS_N_INSNS (26), /* HI */
2022 COSTS_N_INSNS (42), /* SI */
2023 COSTS_N_INSNS (74), /* DI */
2024 COSTS_N_INSNS (74)}, /* other */
2025 COSTS_N_INSNS (1), /* cost of movsx */
2026 COSTS_N_INSNS (1), /* cost of movzx */
2027 8, /* "large" insn */
2028 17, /* MOVE_RATIO */
2029 4, /* cost for loading QImode using movzbl */
2030 {4, 4, 4}, /* cost of loading integer registers
2031 in QImode, HImode and SImode.
2032 Relative to reg-reg move (2). */
2033 {4, 4, 4}, /* cost of storing integer registers */
2034 4, /* cost of reg,reg fld/fst */
2035 {12, 12, 12}, /* cost of loading fp registers
2036 in SFmode, DFmode and XFmode */
2037 {6, 6, 8}, /* cost of storing fp registers
2038 in SFmode, DFmode and XFmode */
2039 2, /* cost of moving MMX register */
2040 {8, 8}, /* cost of loading MMX registers
2041 in SImode and DImode */
2042 {8, 8}, /* cost of storing MMX registers
2043 in SImode and DImode */
2044 2, /* cost of moving SSE register */
2045 {8, 8, 8}, /* cost of loading SSE registers
2046 in SImode, DImode and TImode */
2047 {8, 8, 8}, /* cost of storing SSE registers
2048 in SImode, DImode and TImode */
2049 5, /* MMX or SSE register to integer */
2050 32, /* size of l1 cache. */
2051 512, /* size of l2 cache. */
2052 64, /* size of prefetch block */
2053 6, /* number of parallel prefetches */
2054   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2055      value is increased to the perhaps more appropriate value of 5. */
2056 3, /* Branch cost */
2057 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2058 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2059 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2060 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2061 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2062 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2063 generic_memcpy,
2064 generic_memset,
2065 1, /* scalar_stmt_cost. */
2066 1, /* scalar load_cost. */
2067 1, /* scalar_store_cost. */
2068 1, /* vec_stmt_cost. */
2069 1, /* vec_to_scalar_cost. */
2070 1, /* scalar_to_vec_cost. */
2071 1, /* vec_align_load_cost. */
2072 2, /* vec_unalign_load_cost. */
2073 1, /* vec_store_cost. */
2074 3, /* cond_taken_branch_cost. */
2075 1, /* cond_not_taken_branch_cost. */
2076 };
2077
2078 /* core_cost should produce code tuned for the Core family of CPUs. */
2079 static stringop_algs core_memcpy[2] = {
2080 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2081 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2082 {-1, libcall, false}}}};
2083 static stringop_algs core_memset[2] = {
2084 {libcall, {{6, loop_1_byte, true},
2085 {24, loop, true},
2086 {8192, rep_prefix_4_byte, true},
2087 {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2090
2091 static const
2092 struct processor_costs core_cost = {
2093 COSTS_N_INSNS (1), /* cost of an add instruction */
2094   /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2095      this cost, however, our current implementation of synth_mult results in
2096      the use of unnecessary temporary registers, causing regressions on several
2097      SPECfp benchmarks. */
2098 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2099 COSTS_N_INSNS (1), /* variable shift costs */
2100 COSTS_N_INSNS (1), /* constant shift costs */
2101 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2102 COSTS_N_INSNS (4), /* HI */
2103 COSTS_N_INSNS (3), /* SI */
2104 COSTS_N_INSNS (4), /* DI */
2105 COSTS_N_INSNS (2)}, /* other */
2106 0, /* cost of multiply per each bit set */
2107 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2108 COSTS_N_INSNS (26), /* HI */
2109 COSTS_N_INSNS (42), /* SI */
2110 COSTS_N_INSNS (74), /* DI */
2111 COSTS_N_INSNS (74)}, /* other */
2112 COSTS_N_INSNS (1), /* cost of movsx */
2113 COSTS_N_INSNS (1), /* cost of movzx */
2114 8, /* "large" insn */
2115 17, /* MOVE_RATIO */
2116 4, /* cost for loading QImode using movzbl */
2117 {4, 4, 4}, /* cost of loading integer registers
2118 in QImode, HImode and SImode.
2119 Relative to reg-reg move (2). */
2120 {4, 4, 4}, /* cost of storing integer registers */
2121 4, /* cost of reg,reg fld/fst */
2122 {12, 12, 12}, /* cost of loading fp registers
2123 in SFmode, DFmode and XFmode */
2124 {6, 6, 8}, /* cost of storing fp registers
2125 in SFmode, DFmode and XFmode */
2126 2, /* cost of moving MMX register */
2127 {8, 8}, /* cost of loading MMX registers
2128 in SImode and DImode */
2129 {8, 8}, /* cost of storing MMX registers
2130 in SImode and DImode */
2131 2, /* cost of moving SSE register */
2132 {8, 8, 8}, /* cost of loading SSE registers
2133 in SImode, DImode and TImode */
2134 {8, 8, 8}, /* cost of storing SSE registers
2135 in SImode, DImode and TImode */
2136 5, /* MMX or SSE register to integer */
2137 64, /* size of l1 cache. */
2138 512, /* size of l2 cache. */
2139 64, /* size of prefetch block */
2140 6, /* number of parallel prefetches */
2141 /* FIXME perhaps more appropriate value is 5. */
2142 3, /* Branch cost */
2143 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2144 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2145 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2146 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2147 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2148 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2149 core_memcpy,
2150 core_memset,
2151 1, /* scalar_stmt_cost. */
2152 1, /* scalar load_cost. */
2153 1, /* scalar_store_cost. */
2154 1, /* vec_stmt_cost. */
2155 1, /* vec_to_scalar_cost. */
2156 1, /* scalar_to_vec_cost. */
2157 1, /* vec_align_load_cost. */
2158 2, /* vec_unalign_load_cost. */
2159 1, /* vec_store_cost. */
2160 3, /* cond_taken_branch_cost. */
2161 1, /* cond_not_taken_branch_cost. */
2162 };
2163
2164
2165 /* Set by -mtune. */
2166 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2167
2168 /* Set by -mtune or -Os. */
2169 const struct processor_costs *ix86_cost = &pentium_cost;
2170
2171 /* Processor feature/optimization bitmasks. */
2172 #define m_386 (1U<<PROCESSOR_I386)
2173 #define m_486 (1U<<PROCESSOR_I486)
2174 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2175 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2176 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2177 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2178 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2179 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2180 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2181 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2182 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2183 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2184 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2185 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2186 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2187 #define m_KNL (1U<<PROCESSOR_KNL)
2188 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2189 #define m_INTEL (1U<<PROCESSOR_INTEL)
2190
2191 #define m_GEODE (1U<<PROCESSOR_GEODE)
2192 #define m_K6 (1U<<PROCESSOR_K6)
2193 #define m_K6_GEODE (m_K6 | m_GEODE)
2194 #define m_K8 (1U<<PROCESSOR_K8)
2195 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2196 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2197 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2198 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2199 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2200 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2201 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2202 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2203 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2204 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2205 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2206 #define m_BTVER (m_BTVER1 | m_BTVER2)
2207 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2208 | m_ZNVER1)
2209
2210 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2211
2212 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2213 #undef DEF_TUNE
2214 #define DEF_TUNE(tune, name, selector) name,
2215 #include "x86-tune.def"
2216 #undef DEF_TUNE
2217 };
2218
2219 /* Feature tests against the various tunings. */
2220 unsigned char ix86_tune_features[X86_TUNE_LAST];
2221
2222 /* Feature tests against the various tunings used to create ix86_tune_features
2223 based on the processor mask. */
2224 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2225 #undef DEF_TUNE
2226 #define DEF_TUNE(tune, name, selector) selector,
2227 #include "x86-tune.def"
2228 #undef DEF_TUNE
2229 };
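/* A minimal sketch of how these selector masks become the boolean
   ix86_tune_features[] array; assume ix86_tune has already been set from
   -mtune (the exact placement of this loop in GCC's option-override code
   is not shown here and is an assumption).  */
#if 0
static void
set_tune_features_sketch (void)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  for (unsigned int i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
}
#endif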
2230
2231 /* Feature tests against the various architecture variations. */
2232 unsigned char ix86_arch_features[X86_ARCH_LAST];
2233
2234 /* Feature tests against the various architecture variations, used to create
2235 ix86_arch_features based on the processor mask. */
2236 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2237 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2238 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2239
2240 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2241 ~m_386,
2242
2243 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2244 ~(m_386 | m_486),
2245
2246 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2247 ~m_386,
2248
2249 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2250 ~m_386,
2251 };
2252
2253 /* In case the average insn count for a single function invocation is
2254    lower than this constant, emit fast (but longer) prologue and
2255    epilogue code. */
2256 #define FAST_PROLOGUE_INSN_COUNT 20
2257
2258 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2259 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2260 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2261 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2262
2263 /* Array of the smallest class containing reg number REGNO, indexed by
2264 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2265
2266 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2267 {
2268 /* ax, dx, cx, bx */
2269 AREG, DREG, CREG, BREG,
2270 /* si, di, bp, sp */
2271 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2272 /* FP registers */
2273 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2274 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2275 /* arg pointer */
2276 NON_Q_REGS,
2277 /* flags, fpsr, fpcr, frame */
2278 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2279 /* SSE registers */
2280 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2281 SSE_REGS, SSE_REGS,
2282 /* MMX registers */
2283 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2284 MMX_REGS, MMX_REGS,
2285 /* REX registers */
2286 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2287 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2288 /* SSE REX registers */
2289 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2290 SSE_REGS, SSE_REGS,
2291 /* AVX-512 SSE registers */
2292 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2293 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2294 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2295 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2296 /* Mask registers. */
2297 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2298 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2299 /* MPX bound registers */
2300 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2301 };
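/* For reference: i386.h consults this table via REGNO_REG_CLASS, roughly
   (the exact macro text is an assumption here):

     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])

   so e.g. register 0 (%eax/%rax) maps to AREG and register 7 (%esp/%rsp)
   to NON_Q_REGS.  */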
2302
2303 /* The "default" register map used in 32bit mode. */
2304
2305 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2306 {
2307 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2308 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2309 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2310 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2311 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2312 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2313 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2314 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2315 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2316 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2317 101, 102, 103, 104, /* bound registers */
2318 };
2319
2320 /* The "default" register map used in 64bit mode. */
2321
2322 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2323 {
2324 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2325 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2326 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2327 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2328 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2329 8,9,10,11,12,13,14,15, /* extended integer registers */
2330 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2331 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2332 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2333 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2334 126, 127, 128, 129, /* bound registers */
2335 };
2336
2337 /* Define the register numbers to be used in Dwarf debugging information.
2338 The SVR4 reference port C compiler uses the following register numbers
2339 in its Dwarf output code:
2340 0 for %eax (gcc regno = 0)
2341 1 for %ecx (gcc regno = 2)
2342 2 for %edx (gcc regno = 1)
2343 3 for %ebx (gcc regno = 3)
2344 4 for %esp (gcc regno = 7)
2345 5 for %ebp (gcc regno = 6)
2346 6 for %esi (gcc regno = 4)
2347 7 for %edi (gcc regno = 5)
2348 The following three DWARF register numbers are never generated by
2349 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2350 believes these numbers have these meanings.
2351 8 for %eip (no gcc equivalent)
2352 9 for %eflags (gcc regno = 17)
2353 10 for %trapno (no gcc equivalent)
2354 It is not at all clear how we should number the FP stack registers
2355 for the x86 architecture. If the version of SDB on x86/svr4 were
2356 a bit less brain dead with respect to floating-point then we would
2357 have a precedent to follow with respect to DWARF register numbers
2358 for x86 FP registers, but the SDB on x86/svr4 is so completely
2359 broken with respect to FP registers that it is hardly worth thinking
2360 of it as something to strive for compatibility with.
2361 The version of x86/svr4 SDB I have at the moment does (partially)
2362 seem to believe that DWARF register number 11 is associated with
2363 the x86 register %st(0), but that's about all. Higher DWARF
2364 register numbers don't seem to be associated with anything in
2365 particular, and even for DWARF regno 11, SDB only seems to under-
2366 stand that it should say that a variable lives in %st(0) (when
2367 asked via an `=' command) if we said it was in DWARF regno 11,
2368 but SDB still prints garbage when asked for the value of the
2369 variable in question (via a `/' command).
2370 (Also note that the labels SDB prints for various FP stack regs
2371 when doing an `x' command are all wrong.)
2372 Note that these problems generally don't affect the native SVR4
2373 C compiler because it doesn't allow the use of -O with -g and
2374 because when it is *not* optimizing, it allocates a memory
2375 location for each floating-point variable, and the memory
2376 location is what gets described in the DWARF AT_location
2377 attribute for the variable in question.
2378 Regardless of the severe mental illness of the x86/svr4 SDB, we
2379 do something sensible here and we use the following DWARF
2380 register numbers. Note that these are all stack-top-relative
2381 numbers.
2382 11 for %st(0) (gcc regno = 8)
2383 12 for %st(1) (gcc regno = 9)
2384 13 for %st(2) (gcc regno = 10)
2385 14 for %st(3) (gcc regno = 11)
2386 15 for %st(4) (gcc regno = 12)
2387 16 for %st(5) (gcc regno = 13)
2388 17 for %st(6) (gcc regno = 14)
2389 18 for %st(7) (gcc regno = 15)
2390 */
2391 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2392 {
2393 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2394 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2395 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2396 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2397 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2398 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2399 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2400 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2401 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2402 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2403 101, 102, 103, 104, /* bound registers */
2404 };
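/* A couple of concrete lookups in the table above, matching the numbering
   described in the preceding comment (index = gcc regno, value = DWARF
   regno):

     svr4_dbx_register_map[0] == 0    %eax
     svr4_dbx_register_map[7] == 4    %esp
     svr4_dbx_register_map[6] == 5    %ebp

   i.e. the array is indexed by gcc register number and yields the DWARF
   register number emitted in debug info.  */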
2405
2406 /* Define parameter passing and return registers. */
2407
2408 static int const x86_64_int_parameter_registers[6] =
2409 {
2410 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2411 };
2412
2413 static int const x86_64_ms_abi_int_parameter_registers[4] =
2414 {
2415 CX_REG, DX_REG, R8_REG, R9_REG
2416 };
2417
2418 static int const x86_64_int_return_registers[4] =
2419 {
2420 AX_REG, DX_REG, DI_REG, SI_REG
2421 };
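/* Illustrative example (standard SysV x86-64 and MS x64 calling
   conventions, not code from this file): for a call such as

     extern long f (long a, long b, long c, long d, long e, long g);
     f (1, 2, 3, 4, 5, 6);

   the SysV ABI passes a..g in DI, SI, DX, CX, R8, R9 (the order of
   x86_64_int_parameter_registers above), while the MS ABI passes only the
   first four integer arguments in registers, in CX, DX, R8, R9.  */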
2422
2423 /* Additional registers that are clobbered by SYSV calls. */
2424
2425 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2426 {
2427 SI_REG, DI_REG,
2428 XMM6_REG, XMM7_REG,
2429 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2430 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2431 };
2432
2433 /* Define the structure for the machine field in struct function. */
2434
2435 struct GTY(()) stack_local_entry {
2436 unsigned short mode;
2437 unsigned short n;
2438 rtx rtl;
2439 struct stack_local_entry *next;
2440 };
2441
2442 /* Structure describing stack frame layout.
2443 Stack grows downward:
2444
2445 [arguments]
2446 <- ARG_POINTER
2447 saved pc
2448
2449 saved static chain if ix86_static_chain_on_stack
2450
2451 saved frame pointer if frame_pointer_needed
2452 <- HARD_FRAME_POINTER
2453 [saved regs]
2454 <- regs_save_offset
2455 [padding0]
2456
2457 [saved SSE regs]
2458 <- sse_regs_save_offset
2459 [padding1] |
2460 | <- FRAME_POINTER
2461 [va_arg registers] |
2462 |
2463 [frame] |
2464 |
2465 [padding2] | = to_allocate
2466 <- STACK_POINTER
2467 */
2468 struct ix86_frame
2469 {
2470 int nsseregs;
2471 int nregs;
2472 int va_arg_size;
2473 int red_zone_size;
2474 int outgoing_arguments_size;
2475
2476 /* The offsets relative to ARG_POINTER. */
2477 HOST_WIDE_INT frame_pointer_offset;
2478 HOST_WIDE_INT hard_frame_pointer_offset;
2479 HOST_WIDE_INT stack_pointer_offset;
2480 HOST_WIDE_INT hfp_save_offset;
2481 HOST_WIDE_INT reg_save_offset;
2482 HOST_WIDE_INT sse_reg_save_offset;
2483
2484 /* When save_regs_using_mov is set, emit prologue using
2485 move instead of push instructions. */
2486 bool save_regs_using_mov;
2487 };
2488
2489 /* Which cpu are we scheduling for. */
2490 enum attr_cpu ix86_schedule;
2491
2492 /* Which cpu are we optimizing for. */
2493 enum processor_type ix86_tune;
2494
2495 /* Which instruction set architecture to use. */
2496 enum processor_type ix86_arch;
2497
2498 /* True if processor has SSE prefetch instruction. */
2499 unsigned char x86_prefetch_sse;
2500
2501 /* -mstackrealign option */
2502 static const char ix86_force_align_arg_pointer_string[]
2503 = "force_align_arg_pointer";
2504
2505 static rtx (*ix86_gen_leave) (void);
2506 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2507 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2509 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2510 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_clzero) (rtx);
2513 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2514 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2515 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2516 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2517 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2518 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2519
2520 /* Preferred alignment for stack boundary in bits. */
2521 unsigned int ix86_preferred_stack_boundary;
2522
2523 /* Alignment for incoming stack boundary in bits specified at
2524 command line. */
2525 static unsigned int ix86_user_incoming_stack_boundary;
2526
2527 /* Default alignment for incoming stack boundary in bits. */
2528 static unsigned int ix86_default_incoming_stack_boundary;
2529
2530 /* Alignment for incoming stack boundary in bits. */
2531 unsigned int ix86_incoming_stack_boundary;
2532
2533 /* Calling-ABI-specific va_list type nodes. */
2534 static GTY(()) tree sysv_va_list_type_node;
2535 static GTY(()) tree ms_va_list_type_node;
2536
2537 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2538 char internal_label_prefix[16];
2539 int internal_label_prefix_len;
2540
2541 /* Fence to use after loop using movnt. */
2542 tree x86_mfence;
2543
2544 /* Register class used for passing a given 64bit part of an argument.
2545    These represent the classes documented by the psABI, with the exception
2546    of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2547    just uses SFmode or DFmode moves instead of DImode to avoid reformatting
2548    penalties.
2549    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2550    whenever possible (when the upper half contains only padding). */
2551 enum x86_64_reg_class
2552 {
2553 X86_64_NO_CLASS,
2554 X86_64_INTEGER_CLASS,
2555 X86_64_INTEGERSI_CLASS,
2556 X86_64_SSE_CLASS,
2557 X86_64_SSESF_CLASS,
2558 X86_64_SSEDF_CLASS,
2559 X86_64_SSEUP_CLASS,
2560 X86_64_X87_CLASS,
2561 X86_64_X87UP_CLASS,
2562 X86_64_COMPLEX_X87_CLASS,
2563 X86_64_MEMORY_CLASS
2564 };
2565
2566 #define MAX_CLASSES 8
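/* A hedged example of how these classes are used (per the psABI rules this
   file implements; the actual classification is performed later in this
   file by classify_argument and friends): a value of type

     struct s { long l; double d; };

   passed by value occupies two eightbytes; the first is classified
   X86_64_INTEGER_CLASS and goes in an integer register, while the second
   is SSE class -- refined here to X86_64_SSEDF_CLASS so a DFmode move can
   be used -- and goes in an SSE register.  */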
2567
2568 /* Table of constants used by fldpi, fldln2, etc.... */
2569 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2570 static bool ext_80387_constants_init = 0;
2571
2572 \f
2573 static struct machine_function * ix86_init_machine_status (void);
2574 static rtx ix86_function_value (const_tree, const_tree, bool);
2575 static bool ix86_function_value_regno_p (const unsigned int);
2576 static unsigned int ix86_function_arg_boundary (machine_mode,
2577 const_tree);
2578 static rtx ix86_static_chain (const_tree, bool);
2579 static int ix86_function_regparm (const_tree, const_tree);
2580 static void ix86_compute_frame_layout (struct ix86_frame *);
2581 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2582 rtx, rtx, int);
2583 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2584 static tree ix86_canonical_va_list_type (tree);
2585 static void predict_jump (int);
2586 static unsigned int split_stack_prologue_scratch_regno (void);
2587 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2588
2589 enum ix86_function_specific_strings
2590 {
2591 IX86_FUNCTION_SPECIFIC_ARCH,
2592 IX86_FUNCTION_SPECIFIC_TUNE,
2593 IX86_FUNCTION_SPECIFIC_MAX
2594 };
2595
2596 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2597 const char *, const char *, enum fpmath_unit,
2598 bool);
2599 static void ix86_function_specific_save (struct cl_target_option *,
2600 struct gcc_options *opts);
2601 static void ix86_function_specific_restore (struct gcc_options *opts,
2602 struct cl_target_option *);
2603 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2604 static void ix86_function_specific_print (FILE *, int,
2605 struct cl_target_option *);
2606 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2607 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2608 struct gcc_options *,
2609 struct gcc_options *,
2610 struct gcc_options *);
2611 static bool ix86_can_inline_p (tree, tree);
2612 static void ix86_set_current_function (tree);
2613 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2614
2615 static enum calling_abi ix86_function_abi (const_tree);
2616
2617 \f
2618 #ifndef SUBTARGET32_DEFAULT_CPU
2619 #define SUBTARGET32_DEFAULT_CPU "i386"
2620 #endif
2621
2622 /* Whether -mtune= or -march= were specified */
2623 static int ix86_tune_defaulted;
2624 static int ix86_arch_specified;
2625
2626 /* Vectorization library interface and handlers. */
2627 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2628
2629 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2630 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2631
2632 /* Processor target table, indexed by processor number */
2633 struct ptt
2634 {
2635 const char *const name; /* processor name */
2636 const struct processor_costs *cost; /* Processor costs */
2637 const int align_loop; /* Default alignments. */
2638 const int align_loop_max_skip;
2639 const int align_jump;
2640 const int align_jump_max_skip;
2641 const int align_func;
2642 };
2643
2644 /* This table must be in sync with enum processor_type in i386.h. */
2645 static const struct ptt processor_target_table[PROCESSOR_max] =
2646 {
2647 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2648 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2649 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2650 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2651 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2652 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2653 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2654 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2655 {"core2", &core_cost, 16, 10, 16, 10, 16},
2656 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2657 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2658 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2659 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2660 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2661 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2662 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2663 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2664 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2665 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2666 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2667 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2668 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2669 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2670 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2671 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2672 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2673 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2674 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2675 {"znver1", &znver1_cost, 16, 10, 16, 7, 11}
2676 };
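/* A minimal sketch (the real assignments live in the option-override code
   later in this file; the exact placement and the -Os handling shown here
   are assumptions) of how this table feeds the cost pointers declared
   earlier:

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     ix86_cost = optimize_size ? &ix86_size_cost : ix86_tune_cost;
*/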
2677 \f
2678 static unsigned int
2679 rest_of_handle_insert_vzeroupper (void)
2680 {
2681 int i;
2682
2683   /* vzeroupper instructions are inserted immediately after reload to
2684      account for possible spills from 256bit registers.  The pass
2685      reuses the mode switching infrastructure by re-running the mode
2686      insertion pass, so disable entities that have already been processed. */
2687 for (i = 0; i < MAX_386_ENTITIES; i++)
2688 ix86_optimize_mode_switching[i] = 0;
2689
2690 ix86_optimize_mode_switching[AVX_U128] = 1;
2691
2692 /* Call optimize_mode_switching. */
2693 g->get_passes ()->execute_pass_mode_switching ();
2694 return 0;
2695 }
2696
2697 /* Return 1 if INSN uses or defines a hard register.
2698 Hard register uses in a memory address are ignored.
2699 Clobbers and flags definitions are ignored. */
2700
2701 static bool
2702 has_non_address_hard_reg (rtx_insn *insn)
2703 {
2704 df_ref ref;
2705 FOR_EACH_INSN_DEF (ref, insn)
2706 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2707 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2708 && DF_REF_REGNO (ref) != FLAGS_REG)
2709 return true;
2710
2711 FOR_EACH_INSN_USE (ref, insn)
2712 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2713 return true;
2714
2715 return false;
2716 }
2717
2718 /* Check whether comparison INSN may be transformed into a vector
2719    comparison.  Currently we transform only zero checks which look
2720    like:
2721
2722 (set (reg:CCZ 17 flags)
2723 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2724 (subreg:SI (reg:DI x) 0))
2725 (const_int 0 [0]))) */
2726
2727 static bool
2728 convertible_comparison_p (rtx_insn *insn)
2729 {
2730 if (!TARGET_SSE4_1)
2731 return false;
2732
2733 rtx def_set = single_set (insn);
2734
2735 gcc_assert (def_set);
2736
2737 rtx src = SET_SRC (def_set);
2738 rtx dst = SET_DEST (def_set);
2739
2740 gcc_assert (GET_CODE (src) == COMPARE);
2741
2742 if (GET_CODE (dst) != REG
2743 || REGNO (dst) != FLAGS_REG
2744 || GET_MODE (dst) != CCZmode)
2745 return false;
2746
2747 rtx op1 = XEXP (src, 0);
2748 rtx op2 = XEXP (src, 1);
2749
2750 if (op2 != CONST0_RTX (GET_MODE (op2)))
2751 return false;
2752
2753 if (GET_CODE (op1) != IOR)
2754 return false;
2755
2756 op2 = XEXP (op1, 1);
2757 op1 = XEXP (op1, 0);
2758
2759 if (!SUBREG_P (op1)
2760 || !SUBREG_P (op2)
2761 || GET_MODE (op1) != SImode
2762 || GET_MODE (op2) != SImode
2763 || ((SUBREG_BYTE (op1) != 0
2764 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2765 && (SUBREG_BYTE (op2) != 0
2766 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2767 return false;
2768
2769 op1 = SUBREG_REG (op1);
2770 op2 = SUBREG_REG (op2);
2771
2772 if (op1 != op2
2773 || !REG_P (op1)
2774 || GET_MODE (op1) != DImode)
2775 return false;
2776
2777 return true;
2778 }
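/* For example (an illustration, not part of GCC): on a 32-bit target a
   64-bit equality test against zero such as

     int
     is_zero (unsigned long long x)
     {
       return x == 0;
     }

   is expanded to an IOR of the two SImode halves of X compared against
   zero, which is exactly the CCZ pattern matched above; with SSE4.1
   available it can then be carried out as a vector comparison (ptest) of
   the 64-bit value.  */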
2779
2780 /* The DImode version of scalar_to_vector_candidate_p. */
2781
2782 static bool
2783 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2784 {
2785 rtx def_set = single_set (insn);
2786
2787 if (!def_set)
2788 return false;
2789
2790 if (has_non_address_hard_reg (insn))
2791 return false;
2792
2793 rtx src = SET_SRC (def_set);
2794 rtx dst = SET_DEST (def_set);
2795
2796 if (GET_CODE (src) == COMPARE)
2797 return convertible_comparison_p (insn);
2798
2799 /* We are interested in DImode promotion only. */
2800 if ((GET_MODE (src) != DImode
2801 && !CONST_INT_P (src))
2802 || GET_MODE (dst) != DImode)
2803 return false;
2804
2805 if (!REG_P (dst) && !MEM_P (dst))
2806 return false;
2807
2808 switch (GET_CODE (src))
2809 {
2810 case ASHIFT:
2811 case LSHIFTRT:
2812 /* FIXME: consider also variable shifts. */
2813 if (!CONST_INT_P (XEXP (src, 1))
2814 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))
2815 return false;
2816 break;
2817
2818 case PLUS:
2819 case MINUS:
2820 case IOR:
2821 case XOR:
2822 case AND:
2823 if (!REG_P (XEXP (src, 1))
2824 && !MEM_P (XEXP (src, 1))
2825 && !CONST_INT_P (XEXP (src, 1)))
2826 return false;
2827 break;
2828
2829 case NEG:
2830 case NOT:
2831 break;
2832
2833 case REG:
2834 return true;
2835
2836 case MEM:
2837 case CONST_INT:
2838 return REG_P (dst);
2839
2840 default:
2841 return false;
2842 }
2843
2844 if (!REG_P (XEXP (src, 0))
2845 && !MEM_P (XEXP (src, 0))
2846 && !CONST_INT_P (XEXP (src, 0))
2847 /* Check for andnot case. */
2848 && (GET_CODE (src) != AND
2849 || GET_CODE (XEXP (src, 0)) != NOT
2850 || !REG_P (XEXP (XEXP (src, 0), 0))))
2851 return false;
2852
2853 if ((GET_MODE (XEXP (src, 0)) != DImode
2854 && !CONST_INT_P (XEXP (src, 0)))
2855 || (GET_CODE (src) != NEG
2856 && GET_CODE (src) != NOT
2857 && GET_MODE (XEXP (src, 1)) != DImode
2858 && !CONST_INT_P (XEXP (src, 1))))
2859 return false;
2860
2861 return true;
2862 }
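/* For example (illustration only): when compiling for 32-bit x86, a
   function like

     unsigned long long
     and64 (unsigned long long a, unsigned long long b)
     {
       return a & b;
     }

   performs a DImode AND that would otherwise be split into a pair of
   32-bit ANDs; the test above accepts it as a candidate so the STV pass
   can instead perform the operation in an SSE register (pand).  */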
2863
2864 /* The TImode version of scalar_to_vector_candidate_p. */
2865
2866 static bool
2867 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2868 {
2869 rtx def_set = single_set (insn);
2870
2871 if (!def_set)
2872 return false;
2873
2874 if (has_non_address_hard_reg (insn))
2875 return false;
2876
2877 rtx src = SET_SRC (def_set);
2878 rtx dst = SET_DEST (def_set);
2879
2880 /* Only TImode load and store are allowed. */
2881 if (GET_MODE (dst) != TImode)
2882 return false;
2883
2884 if (MEM_P (dst))
2885 {
2886       /* Check for a store.  The memory must be aligned, or an unaligned
2887 	 store must be optimal.  Only stores from a register, a standard SSE
2888 	 constant or a CONST_WIDE_INT generated from a piecewise store are
2889 	 supported.
2890 	 ??? Verify the performance impact before enabling CONST_INT for
2891 	 __int128 stores. */
2892 if (misaligned_operand (dst, TImode)
2893 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2894 return false;
2895
2896 switch (GET_CODE (src))
2897 {
2898 default:
2899 return false;
2900
2901 case REG:
2902 case CONST_WIDE_INT:
2903 return true;
2904
2905 case CONST_INT:
2906 return standard_sse_constant_p (src, TImode);
2907 }
2908 }
2909 else if (MEM_P (src))
2910 {
2911       /* Check for a load.  Memory must be aligned, or an unaligned load
2912          must be optimal. */
2913 return (REG_P (dst)
2914 && (!misaligned_operand (src, TImode)
2915 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2916 }
2917
2918 return false;
2919 }
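
/* Illustrative examples (made-up RTL): a TImode load

     (set (reg:TI 110) (mem:TI (reg:DI 111)))

   is a candidate when the memory is aligned or unaligned loads are
   optimal; a store is a candidate only from a register, a standard
   SSE constant or a CONST_WIDE_INT, under the analogous alignment
   condition.  */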
2920
2921 /* Return true if INSN may be converted into a vector
2922    instruction. */
2923
2924 static bool
2925 scalar_to_vector_candidate_p (rtx_insn *insn)
2926 {
2927 if (TARGET_64BIT)
2928 return timode_scalar_to_vector_candidate_p (insn);
2929 else
2930 return dimode_scalar_to_vector_candidate_p (insn);
2931 }
2932
2933 /* The DImode version of remove_non_convertible_regs. */
2934
2935 static void
2936 dimode_remove_non_convertible_regs (bitmap candidates)
2937 {
2938 bitmap_iterator bi;
2939 unsigned id;
2940 bitmap regs = BITMAP_ALLOC (NULL);
2941
2942 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2943 {
2944 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2945 rtx reg = SET_DEST (def_set);
2946
2947 if (!REG_P (reg)
2948 || bitmap_bit_p (regs, REGNO (reg))
2949 || HARD_REGISTER_P (reg))
2950 continue;
2951
2952 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2953 def;
2954 def = DF_REF_NEXT_REG (def))
2955 {
2956 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2957 {
2958 if (dump_file)
2959 fprintf (dump_file,
2960 "r%d has non convertible definition in insn %d\n",
2961 REGNO (reg), DF_REF_INSN_UID (def));
2962
2963 bitmap_set_bit (regs, REGNO (reg));
2964 break;
2965 }
2966 }
2967 }
2968
2969 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2970 {
2971 for (df_ref def = DF_REG_DEF_CHAIN (id);
2972 def;
2973 def = DF_REF_NEXT_REG (def))
2974 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2975 {
2976 if (dump_file)
2977 fprintf (dump_file, "Removing insn %d from candidates list\n",
2978 DF_REF_INSN_UID (def));
2979
2980 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2981 }
2982 }
2983
2984 BITMAP_FREE (regs);
2985 }
2986
2987 /* For a register REGNO, scan instructions for its defs and uses.
2988 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2989
2990 static void
2991 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2992 unsigned int regno)
2993 {
2994 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2995 def;
2996 def = DF_REF_NEXT_REG (def))
2997 {
2998 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2999 {
3000 if (dump_file)
3001 fprintf (dump_file,
3002 "r%d has non convertible def in insn %d\n",
3003 regno, DF_REF_INSN_UID (def));
3004
3005 bitmap_set_bit (regs, regno);
3006 break;
3007 }
3008 }
3009
3010 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3011 ref;
3012 ref = DF_REF_NEXT_REG (ref))
3013 {
3014 /* Debug instructions are skipped. */
3015 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3016 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3017 {
3018 if (dump_file)
3019 fprintf (dump_file,
3020 "r%d has non convertible use in insn %d\n",
3021 regno, DF_REF_INSN_UID (ref));
3022
3023 bitmap_set_bit (regs, regno);
3024 break;
3025 }
3026 }
3027 }
3028
3029 /* The TImode version of remove_non_convertible_regs. */
3030
3031 static void
3032 timode_remove_non_convertible_regs (bitmap candidates)
3033 {
3034 bitmap_iterator bi;
3035 unsigned id;
3036 bitmap regs = BITMAP_ALLOC (NULL);
3037
3038 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3039 {
3040 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3041 rtx dest = SET_DEST (def_set);
3042 rtx src = SET_SRC (def_set);
3043
3044 if ((!REG_P (dest)
3045 || bitmap_bit_p (regs, REGNO (dest))
3046 || HARD_REGISTER_P (dest))
3047 && (!REG_P (src)
3048 || bitmap_bit_p (regs, REGNO (src))
3049 || HARD_REGISTER_P (src)))
3050 continue;
3051
3052 if (REG_P (dest))
3053 timode_check_non_convertible_regs (candidates, regs,
3054 REGNO (dest));
3055
3056 if (REG_P (src))
3057 timode_check_non_convertible_regs (candidates, regs,
3058 REGNO (src));
3059 }
3060
3061 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3062 {
3063 for (df_ref def = DF_REG_DEF_CHAIN (id);
3064 def;
3065 def = DF_REF_NEXT_REG (def))
3066 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3067 {
3068 if (dump_file)
3069 fprintf (dump_file, "Removing insn %d from candidates list\n",
3070 DF_REF_INSN_UID (def));
3071
3072 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3073 }
3074
3075 for (df_ref ref = DF_REG_USE_CHAIN (id);
3076 ref;
3077 ref = DF_REF_NEXT_REG (ref))
3078 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3079 {
3080 if (dump_file)
3081 fprintf (dump_file, "Removing insn %d from candidates list\n",
3082 DF_REF_INSN_UID (ref));
3083
3084 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3085 }
3086 }
3087
3088 BITMAP_FREE (regs);
3089 }
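
/* For example (insn and register numbers are illustrative only): if
   candidate insns 10 and 11 both set r90, but r90 is also used by a
   non-candidate insn, timode_check_non_convertible_regs puts r90 into
   REGS and the second loop above then drops insns 10 and 11 from
   CANDIDATES.  */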
3090
3091 /* For a given bitmap of insn UIDs, scan all instructions and
3092    remove an insn from CANDIDATES in case it has both convertible
3093    and non-convertible definitions.
3094
3095 All insns in a bitmap are conversion candidates according to
3096 scalar_to_vector_candidate_p. Currently it implies all insns
3097 are single_set. */
3098
3099 static void
3100 remove_non_convertible_regs (bitmap candidates)
3101 {
3102 if (TARGET_64BIT)
3103 timode_remove_non_convertible_regs (candidates);
3104 else
3105 dimode_remove_non_convertible_regs (candidates);
3106 }
3107
3108 class scalar_chain
3109 {
3110 public:
3111 scalar_chain ();
3112 virtual ~scalar_chain ();
3113
3114 static unsigned max_id;
3115
3116 /* ID of a chain. */
3117 unsigned int chain_id;
3118 /* A queue of instructions to be included into a chain. */
3119 bitmap queue;
3120 /* Instructions included into a chain. */
3121 bitmap insns;
3122 /* All registers defined by a chain. */
3123 bitmap defs;
3124   /* Registers used in both vector and scalar modes. */
3125 bitmap defs_conv;
3126
3127 void build (bitmap candidates, unsigned insn_uid);
3128 virtual int compute_convert_gain () = 0;
3129 int convert ();
3130
3131 protected:
3132 void add_to_queue (unsigned insn_uid);
3133 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3134
3135 private:
3136 void add_insn (bitmap candidates, unsigned insn_uid);
3137 void analyze_register_chain (bitmap candidates, df_ref ref);
3138 virtual void mark_dual_mode_def (df_ref def) = 0;
3139 virtual void convert_insn (rtx_insn *insn) = 0;
3140 virtual void convert_registers () = 0;
3141 };
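
/* A chain is driven roughly as in convert_scalars_to_vector below
   (sketch only; the timode variant is chosen for TARGET_64BIT):

     scalar_chain *chain = new dimode_scalar_chain;
     chain->build (candidates, uid);
     if (chain->compute_convert_gain () > 0)
       chain->convert ();
     delete chain;  */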
3142
3143 class dimode_scalar_chain : public scalar_chain
3144 {
3145 public:
3146 int compute_convert_gain ();
3147 private:
3148 void mark_dual_mode_def (df_ref def);
3149 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3150 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3151 void convert_insn (rtx_insn *insn);
3152 void convert_op (rtx *op, rtx_insn *insn);
3153 void convert_reg (unsigned regno);
3154 void make_vector_copies (unsigned regno);
3155 void convert_registers ();
3156 int vector_const_cost (rtx exp);
3157 };
3158
3159 class timode_scalar_chain : public scalar_chain
3160 {
3161 public:
3162   /* Converting from TImode to V1TImode is always faster. */
3163 int compute_convert_gain () { return 1; }
3164
3165 private:
3166 void mark_dual_mode_def (df_ref def);
3167 void fix_debug_reg_uses (rtx reg);
3168 void convert_insn (rtx_insn *insn);
3169   /* We don't convert registers to a different size. */
3170 void convert_registers () {}
3171 };
3172
3173 unsigned scalar_chain::max_id = 0;
3174
3175 /* Initialize new chain. */
3176
3177 scalar_chain::scalar_chain ()
3178 {
3179 chain_id = ++max_id;
3180
3181 if (dump_file)
3182 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3183
3184 bitmap_obstack_initialize (NULL);
3185 insns = BITMAP_ALLOC (NULL);
3186 defs = BITMAP_ALLOC (NULL);
3187 defs_conv = BITMAP_ALLOC (NULL);
3188 queue = NULL;
3189 }
3190
3191 /* Free chain's data. */
3192
3193 scalar_chain::~scalar_chain ()
3194 {
3195 BITMAP_FREE (insns);
3196 BITMAP_FREE (defs);
3197 BITMAP_FREE (defs_conv);
3198 bitmap_obstack_release (NULL);
3199 }
3200
3201 /* Add an instruction into the chain's queue. */
3202
3203 void
3204 scalar_chain::add_to_queue (unsigned insn_uid)
3205 {
3206 if (bitmap_bit_p (insns, insn_uid)
3207 || bitmap_bit_p (queue, insn_uid))
3208 return;
3209
3210 if (dump_file)
3211 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3212 insn_uid, chain_id);
3213 bitmap_set_bit (queue, insn_uid);
3214 }
3215
3216 /* For DImode conversion, mark register defined by DEF as requiring
3217 conversion. */
3218
3219 void
3220 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3221 {
3222 gcc_assert (DF_REF_REG_DEF_P (def));
3223
3224 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3225 return;
3226
3227 if (dump_file)
3228 fprintf (dump_file,
3229 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3230 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3231
3232 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3233 }
3234
3235 /* For TImode conversion, it is unused. */
3236
3237 void
3238 timode_scalar_chain::mark_dual_mode_def (df_ref)
3239 {
3240 gcc_unreachable ();
3241 }
3242
3243 /* Check REF's chain to add new insns into a queue
3244 and find registers requiring conversion. */
3245
3246 void
3247 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3248 {
3249 df_link *chain;
3250
3251 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3252 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3253 add_to_queue (DF_REF_INSN_UID (ref));
3254
3255 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3256 {
3257 unsigned uid = DF_REF_INSN_UID (chain->ref);
3258
3259 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3260 continue;
3261
3262 if (!DF_REF_REG_MEM_P (chain->ref))
3263 {
3264 if (bitmap_bit_p (insns, uid))
3265 continue;
3266
3267 if (bitmap_bit_p (candidates, uid))
3268 {
3269 add_to_queue (uid);
3270 continue;
3271 }
3272 }
3273
3274 if (DF_REF_REG_DEF_P (chain->ref))
3275 {
3276 if (dump_file)
3277 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3278 DF_REF_REGNO (chain->ref), uid);
3279 mark_dual_mode_def (chain->ref);
3280 }
3281 else
3282 {
3283 if (dump_file)
3284 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3285 DF_REF_REGNO (chain->ref), uid);
3286 mark_dual_mode_def (ref);
3287 }
3288 }
3289 }
3290
3291 /* Add instruction into a chain. */
3292
3293 void
3294 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3295 {
3296 if (bitmap_bit_p (insns, insn_uid))
3297 return;
3298
3299 if (dump_file)
3300 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3301
3302 bitmap_set_bit (insns, insn_uid);
3303
3304 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3305 rtx def_set = single_set (insn);
3306 if (def_set && REG_P (SET_DEST (def_set))
3307 && !HARD_REGISTER_P (SET_DEST (def_set)))
3308 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3309
3310 df_ref ref;
3311 df_ref def;
3312 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3313 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3314 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3315 def;
3316 def = DF_REF_NEXT_REG (def))
3317 analyze_register_chain (candidates, def);
3318 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3319 if (!DF_REF_REG_MEM_P (ref))
3320 analyze_register_chain (candidates, ref);
3321 }
3322
3323 /* Build a new chain starting from insn INSN_UID, recursively
3324    adding all dependent uses and definitions. */
3325
3326 void
3327 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3328 {
3329 queue = BITMAP_ALLOC (NULL);
3330 bitmap_set_bit (queue, insn_uid);
3331
3332 if (dump_file)
3333 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3334
3335 while (!bitmap_empty_p (queue))
3336 {
3337 insn_uid = bitmap_first_set_bit (queue);
3338 bitmap_clear_bit (queue, insn_uid);
3339 bitmap_clear_bit (candidates, insn_uid);
3340 add_insn (candidates, insn_uid);
3341 }
3342
3343 if (dump_file)
3344 {
3345 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3346 fprintf (dump_file, " insns: ");
3347 dump_bitmap (dump_file, insns);
3348 if (!bitmap_empty_p (defs_conv))
3349 {
3350 bitmap_iterator bi;
3351 unsigned id;
3352 const char *comma = "";
3353 fprintf (dump_file, " defs to convert: ");
3354 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3355 {
3356 fprintf (dump_file, "%sr%d", comma, id);
3357 comma = ", ";
3358 }
3359 fprintf (dump_file, "\n");
3360 }
3361 }
3362
3363 BITMAP_FREE (queue);
3364 }
3365
3366 /* Return the cost of building a vector constant
3367    instead of using a scalar one. */
3368
3369 int
3370 dimode_scalar_chain::vector_const_cost (rtx exp)
3371 {
3372 gcc_assert (CONST_INT_P (exp));
3373
3374 if (standard_sse_constant_p (exp, V2DImode))
3375 return COSTS_N_INSNS (1);
3376 return ix86_cost->sse_load[1];
3377 }
3378
3379 /* Compute a gain for chain conversion. */
3380
3381 int
3382 dimode_scalar_chain::compute_convert_gain ()
3383 {
3384 bitmap_iterator bi;
3385 unsigned insn_uid;
3386 int gain = 0;
3387 int cost = 0;
3388
3389 if (dump_file)
3390 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3391
3392 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3393 {
3394 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3395 rtx def_set = single_set (insn);
3396 rtx src = SET_SRC (def_set);
3397 rtx dst = SET_DEST (def_set);
3398
3399 if (REG_P (src) && REG_P (dst))
3400 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3401 else if (REG_P (src) && MEM_P (dst))
3402 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3403 else if (MEM_P (src) && REG_P (dst))
3404 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3405 else if (GET_CODE (src) == ASHIFT
3406 || GET_CODE (src) == LSHIFTRT)
3407 {
3408 gain += ix86_cost->add;
3409 if (CONST_INT_P (XEXP (src, 0)))
3410 gain -= vector_const_cost (XEXP (src, 0));
3411 if (CONST_INT_P (XEXP (src, 1))
3412 && INTVAL (XEXP (src, 1)) >= 32)
3413 gain -= COSTS_N_INSNS (1);
3414 }
3415 else if (GET_CODE (src) == PLUS
3416 || GET_CODE (src) == MINUS
3417 || GET_CODE (src) == IOR
3418 || GET_CODE (src) == XOR
3419 || GET_CODE (src) == AND)
3420 {
3421 gain += ix86_cost->add;
3422 /* Additional gain for andnot for targets without BMI. */
3423 if (GET_CODE (XEXP (src, 0)) == NOT
3424 && !TARGET_BMI)
3425 gain += 2 * ix86_cost->add;
3426
3427 if (CONST_INT_P (XEXP (src, 0)))
3428 gain -= vector_const_cost (XEXP (src, 0));
3429 if (CONST_INT_P (XEXP (src, 1)))
3430 gain -= vector_const_cost (XEXP (src, 1));
3431 }
3432 else if (GET_CODE (src) == NEG
3433 || GET_CODE (src) == NOT)
3434 gain += ix86_cost->add - COSTS_N_INSNS (1);
3435 else if (GET_CODE (src) == COMPARE)
3436 {
3437 /* Assume comparison cost is the same. */
3438 }
3439 else if (CONST_INT_P (src))
3440 {
3441 if (REG_P (dst))
3442 gain += COSTS_N_INSNS (2);
3443 else if (MEM_P (dst))
3444 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3445 gain -= vector_const_cost (src);
3446 }
3447 else
3448 gcc_unreachable ();
3449 }
3450
3451 if (dump_file)
3452 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3453
3454 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3455 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3456
3457 if (dump_file)
3458 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3459
3460 gain -= cost;
3461
3462 if (dump_file)
3463 fprintf (dump_file, " Total gain: %d\n", gain);
3464
3465 return gain;
3466 }
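
/* A worked example with hypothetical costs (real values come from the
   active ix86_cost table): assuming sse_move == 2,
   mmxsse_to_integer == 4 and COSTS_N_INSNS (2) == 8, a chain of two
   DImode reg-reg moves whose single dual-mode register has one
   definition gets gain = 2 * (8 - 2) - 1 * 4 = 8 > 0, so it would be
   converted.  */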
3467
3468 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3469
3470 rtx
3471 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3472 {
3473 if (x == reg)
3474 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3475
3476 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3477 int i, j;
3478 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3479 {
3480 if (fmt[i] == 'e')
3481 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3482 else if (fmt[i] == 'E')
3483 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3484 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3485 reg, new_reg);
3486 }
3487
3488 return x;
3489 }
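
/* For instance (illustrative), with REG being (reg:DI 100) and
   NEW_REG being (reg:DI 200), the expression

     (plus:DI (reg:DI 100) (mem:DI (reg:SI 102)))

   becomes

     (plus:DI (subreg:V2DI (reg:DI 200) 0) (mem:DI (reg:SI 102))).  */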
3490
3491 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3492
3493 void
3494 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3495 rtx reg, rtx new_reg)
3496 {
3497 replace_with_subreg (single_set (insn), reg, new_reg);
3498 }
3499
3500 /* Insert the generated conversion instruction sequence INSNS
3501    after instruction AFTER.  A new BB may be required when the
3502    instruction has an EH region attached. */
3503
3504 void
3505 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3506 {
3507 if (!control_flow_insn_p (after))
3508 {
3509 emit_insn_after (insns, after);
3510 return;
3511 }
3512
3513 basic_block bb = BLOCK_FOR_INSN (after);
3514 edge e = find_fallthru_edge (bb->succs);
3515 gcc_assert (e);
3516
3517 basic_block new_bb = split_edge (e);
3518 emit_insn_after (insns, BB_HEAD (new_bb));
3519 }
3520
3521 /* Make vector copies for all definitions of register REGNO
3522    and replace its uses in the chain. */
3523
3524 void
3525 dimode_scalar_chain::make_vector_copies (unsigned regno)
3526 {
3527 rtx reg = regno_reg_rtx[regno];
3528 rtx vreg = gen_reg_rtx (DImode);
3529 df_ref ref;
3530
3531 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3532 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3533 {
3534 rtx_insn *insn = DF_REF_INSN (ref);
3535
3536 start_sequence ();
3537 if (TARGET_SSE4_1)
3538 {
3539 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3540 CONST0_RTX (V4SImode),
3541 gen_rtx_SUBREG (SImode, reg, 0)));
3542 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3543 gen_rtx_SUBREG (V4SImode, vreg, 0),
3544 gen_rtx_SUBREG (SImode, reg, 4),
3545 GEN_INT (2)));
3546 }
3547 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3548 {
3549 rtx tmp = gen_reg_rtx (DImode);
3550 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3551 CONST0_RTX (V4SImode),
3552 gen_rtx_SUBREG (SImode, reg, 0)));
3553 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3554 CONST0_RTX (V4SImode),
3555 gen_rtx_SUBREG (SImode, reg, 4)));
3556 emit_insn (gen_vec_interleave_lowv4si
3557 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3558 gen_rtx_SUBREG (V4SImode, vreg, 0),
3559 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3560 }
3561 else
3562 {
3563 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3564 emit_move_insn (adjust_address (tmp, SImode, 0),
3565 gen_rtx_SUBREG (SImode, reg, 0));
3566 emit_move_insn (adjust_address (tmp, SImode, 4),
3567 gen_rtx_SUBREG (SImode, reg, 4));
3568 emit_move_insn (vreg, tmp);
3569 }
3570 rtx_insn *seq = get_insns ();
3571 end_sequence ();
3572 emit_conversion_insns (seq, insn);
3573
3574 if (dump_file)
3575 fprintf (dump_file,
3576 " Copied r%d to a vector register r%d for insn %d\n",
3577 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3578 }
3579
3580 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3581 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3582 {
3583 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3584
3585 if (dump_file)
3586 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3587 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3588 }
3589 }
3590
3591 /* Convert all definitions of register REGNO
3592 and fix its uses. Scalar copies may be created
3593    in case the register is used in a non-convertible insn. */
3594
3595 void
3596 dimode_scalar_chain::convert_reg (unsigned regno)
3597 {
3598 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3599 rtx reg = regno_reg_rtx[regno];
3600 rtx scopy = NULL_RTX;
3601 df_ref ref;
3602 bitmap conv;
3603
3604 conv = BITMAP_ALLOC (NULL);
3605 bitmap_copy (conv, insns);
3606
3607 if (scalar_copy)
3608 scopy = gen_reg_rtx (DImode);
3609
3610 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3611 {
3612 rtx_insn *insn = DF_REF_INSN (ref);
3613 rtx def_set = single_set (insn);
3614 rtx src = SET_SRC (def_set);
3615 rtx reg = DF_REF_REG (ref);
3616
3617 if (!MEM_P (src))
3618 {
3619 replace_with_subreg_in_insn (insn, reg, reg);
3620 bitmap_clear_bit (conv, INSN_UID (insn));
3621 }
3622
3623 if (scalar_copy)
3624 {
3625 rtx vcopy = gen_reg_rtx (V2DImode);
3626
3627 start_sequence ();
3628 if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3629 {
3630 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3631 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3632 gen_rtx_SUBREG (SImode, vcopy, 0));
3633 emit_move_insn (vcopy,
3634 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3635 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3636 gen_rtx_SUBREG (SImode, vcopy, 0));
3637 }
3638 else
3639 {
3640 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3641 emit_move_insn (tmp, reg);
3642 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3643 adjust_address (tmp, SImode, 0));
3644 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3645 adjust_address (tmp, SImode, 4));
3646 }
3647 rtx_insn *seq = get_insns ();
3648 end_sequence ();
3649 emit_conversion_insns (seq, insn);
3650
3651 if (dump_file)
3652 fprintf (dump_file,
3653 " Copied r%d to a scalar register r%d for insn %d\n",
3654 regno, REGNO (scopy), INSN_UID (insn));
3655 }
3656 }
3657
3658 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3659 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3660 {
3661 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3662 {
3663 rtx def_set = single_set (DF_REF_INSN (ref));
3664 if (!MEM_P (SET_DEST (def_set))
3665 || !REG_P (SET_SRC (def_set)))
3666 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3667 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3668 }
3669 }
3670 /* Skip debug insns and uninitialized uses. */
3671 else if (DF_REF_CHAIN (ref)
3672 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3673 {
3674 gcc_assert (scopy);
3675 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3676 df_insn_rescan (DF_REF_INSN (ref));
3677 }
3678
3679 BITMAP_FREE (conv);
3680 }
3681
3682 /* Convert operand OP in INSN. We should handle
3683 memory operands and uninitialized registers.
3684 All other register uses are converted during
3685    register conversion. */
3686
3687 void
3688 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3689 {
3690 *op = copy_rtx_if_shared (*op);
3691
3692 if (GET_CODE (*op) == NOT)
3693 {
3694 convert_op (&XEXP (*op, 0), insn);
3695 PUT_MODE (*op, V2DImode);
3696 }
3697 else if (MEM_P (*op))
3698 {
3699 rtx tmp = gen_reg_rtx (DImode);
3700
3701 emit_insn_before (gen_move_insn (tmp, *op), insn);
3702 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3703
3704 if (dump_file)
3705 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3706 INSN_UID (insn), REGNO (tmp));
3707 }
3708 else if (REG_P (*op))
3709 {
3710       /* We may not have converted the register use in case
3711          this register has no definition.  Otherwise it
3712          should have been converted in convert_reg. */
3713 df_ref ref;
3714 FOR_EACH_INSN_USE (ref, insn)
3715 if (DF_REF_REGNO (ref) == REGNO (*op))
3716 {
3717 gcc_assert (!DF_REF_CHAIN (ref));
3718 break;
3719 }
3720 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3721 }
3722 else if (CONST_INT_P (*op))
3723 {
3724 rtx vec_cst;
3725 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3726
3727 /* Prefer all ones vector in case of -1. */
3728 if (constm1_operand (*op, GET_MODE (*op)))
3729 vec_cst = CONSTM1_RTX (V2DImode);
3730 else
3731 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3732 gen_rtvec (2, *op, const0_rtx));
3733
3734 if (!standard_sse_constant_p (vec_cst, V2DImode))
3735 {
3736 start_sequence ();
3737 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3738 rtx_insn *seq = get_insns ();
3739 end_sequence ();
3740 emit_insn_before (seq, insn);
3741 }
3742
3743 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
3744 *op = tmp;
3745 }
3746 else
3747 {
3748 gcc_assert (SUBREG_P (*op));
3749 gcc_assert (GET_MODE (*op) == V2DImode);
3750 }
3751 }
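
/* To illustrate the CONST_INT case above: a -1 operand becomes
   CONSTM1_RTX (V2DImode) directly, while e.g. (const_int 42) becomes
   (const_vector:V2DI [42 0]) and, not being a standard SSE constant,
   is forced into the constant pool before the move into the temporary
   subreg is emitted.  */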
3752
3753 /* Convert INSN to vector mode. */
3754
3755 void
3756 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3757 {
3758 rtx def_set = single_set (insn);
3759 rtx src = SET_SRC (def_set);
3760 rtx dst = SET_DEST (def_set);
3761 rtx subreg;
3762
3763 if (MEM_P (dst) && !REG_P (src))
3764 {
3765 /* There are no scalar integer instructions and therefore
3766 temporary register usage is required. */
3767 rtx tmp = gen_reg_rtx (DImode);
3768 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3769 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3770 }
3771
3772 switch (GET_CODE (src))
3773 {
3774 case ASHIFT:
3775 case LSHIFTRT:
3776 convert_op (&XEXP (src, 0), insn);
3777 PUT_MODE (src, V2DImode);
3778 break;
3779
3780 case PLUS:
3781 case MINUS:
3782 case IOR:
3783 case XOR:
3784 case AND:
3785 convert_op (&XEXP (src, 0), insn);
3786 convert_op (&XEXP (src, 1), insn);
3787 PUT_MODE (src, V2DImode);
3788 break;
3789
3790 case NEG:
3791 src = XEXP (src, 0);
3792 convert_op (&src, insn);
3793 subreg = gen_reg_rtx (V2DImode);
3794 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
3795 src = gen_rtx_MINUS (V2DImode, subreg, src);
3796 break;
3797
3798 case NOT:
3799 src = XEXP (src, 0);
3800 convert_op (&src, insn);
3801 subreg = gen_reg_rtx (V2DImode);
3802 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
3803 src = gen_rtx_XOR (V2DImode, src, subreg);
3804 break;
3805
3806 case MEM:
3807 if (!REG_P (dst))
3808 convert_op (&src, insn);
3809 break;
3810
3811 case REG:
3812 if (!MEM_P (dst))
3813 convert_op (&src, insn);
3814 break;
3815
3816 case SUBREG:
3817 gcc_assert (GET_MODE (src) == V2DImode);
3818 break;
3819
3820 case COMPARE:
3821 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3822
3823 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3824 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3825
3826 if (REG_P (src))
3827 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3828 else
3829 subreg = copy_rtx_if_shared (src);
3830 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3831 copy_rtx_if_shared (subreg),
3832 copy_rtx_if_shared (subreg)),
3833 insn);
3834 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3835 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3836 copy_rtx_if_shared (src)),
3837 UNSPEC_PTEST);
3838 break;
3839
3840 case CONST_INT:
3841 convert_op (&src, insn);
3842 break;
3843
3844 default:
3845 gcc_unreachable ();
3846 }
3847
3848 SET_SRC (def_set) = src;
3849 SET_DEST (def_set) = dst;
3850
3851 /* Drop possible dead definitions. */
3852 PATTERN (insn) = def_set;
3853
3854 INSN_CODE (insn) = -1;
3855 recog_memoized (insn);
3856 df_insn_rescan (insn);
3857 }
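
/* Two of the rewrites above, shown on made-up RTL: a DImode NOT

     (set (reg:DI 100) (not:DI (reg:DI 101)))

   is rewritten as an XOR with an all-ones V2DImode register, and a
   NEG as a subtraction from a zero V2DImode register, since the SSE
   integer units have no direct NOT/NEG of this form.  */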
3858
3859 /* Fix uses of converted REG in debug insns. */
3860
3861 void
3862 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3863 {
3864 if (!flag_var_tracking)
3865 return;
3866
3867 df_ref ref, next;
3868 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
3869 {
3870 rtx_insn *insn = DF_REF_INSN (ref);
3871 /* Make sure the next ref is for a different instruction,
3872 so that we're not affected by the rescan. */
3873 next = DF_REF_NEXT_REG (ref);
3874 while (next && DF_REF_INSN (next) == insn)
3875 next = DF_REF_NEXT_REG (next);
3876
3877 if (DEBUG_INSN_P (insn))
3878 {
3879 /* It may be a debug insn with a TImode variable in
3880 register. */
3881 bool changed = false;
3882 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
3883 {
3884 rtx *loc = DF_REF_LOC (ref);
3885 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
3886 {
3887 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
3888 changed = true;
3889 }
3890 }
3891 if (changed)
3892 df_insn_rescan (insn);
3893 }
3894 }
3895 }
3896
3897 /* Convert INSN from TImode to V1TImode. */
3898
3899 void
3900 timode_scalar_chain::convert_insn (rtx_insn *insn)
3901 {
3902 rtx def_set = single_set (insn);
3903 rtx src = SET_SRC (def_set);
3904 rtx dst = SET_DEST (def_set);
3905
3906 switch (GET_CODE (dst))
3907 {
3908 case REG:
3909 {
3910 rtx tmp = find_reg_equal_equiv_note (insn);
3911 if (tmp)
3912 PUT_MODE (XEXP (tmp, 0), V1TImode);
3913 PUT_MODE (dst, V1TImode);
3914 fix_debug_reg_uses (dst);
3915 }
3916 break;
3917 case MEM:
3918 PUT_MODE (dst, V1TImode);
3919 break;
3920
3921 default:
3922 gcc_unreachable ();
3923 }
3924
3925 switch (GET_CODE (src))
3926 {
3927 case REG:
3928 PUT_MODE (src, V1TImode);
3929 /* Call fix_debug_reg_uses only if SRC is never defined. */
3930 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3931 fix_debug_reg_uses (src);
3932 break;
3933
3934 case MEM:
3935 PUT_MODE (src, V1TImode);
3936 break;
3937
3938 case CONST_WIDE_INT:
3939 if (NONDEBUG_INSN_P (insn))
3940 {
3941           /* Since there are no instructions to store a 128-bit constant,
3942              a temporary register is required. */
3943 rtx tmp = gen_reg_rtx (V1TImode);
3944 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3945 src = validize_mem (force_const_mem (V1TImode, src));
3946 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3947 dst = tmp;
3948 }
3949 break;
3950
3951 case CONST_INT:
3952 switch (standard_sse_constant_p (src, TImode))
3953 {
3954 case 1:
3955 src = CONST0_RTX (GET_MODE (dst));
3956 break;
3957 case 2:
3958 src = CONSTM1_RTX (GET_MODE (dst));
3959 break;
3960 default:
3961 gcc_unreachable ();
3962 }
3963 if (NONDEBUG_INSN_P (insn))
3964 {
3965 rtx tmp = gen_reg_rtx (V1TImode);
3966           /* Since there are no instructions to store a standard SSE
3967              constant, a temporary register is required. */
3968 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3969 dst = tmp;
3970 }
3971 break;
3972
3973 default:
3974 gcc_unreachable ();
3975 }
3976
3977 SET_SRC (def_set) = src;
3978 SET_DEST (def_set) = dst;
3979
3980 /* Drop possible dead definitions. */
3981 PATTERN (insn) = def_set;
3982
3983 INSN_CODE (insn) = -1;
3984 recog_memoized (insn);
3985 df_insn_rescan (insn);
3986 }
3987
3988 void
3989 dimode_scalar_chain::convert_registers ()
3990 {
3991 bitmap_iterator bi;
3992 unsigned id;
3993
3994 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
3995 convert_reg (id);
3996
3997 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
3998 make_vector_copies (id);
3999 }
4000
4001 /* Convert whole chain creating required register
4002 conversions and copies. */
4003
4004 int
4005 scalar_chain::convert ()
4006 {
4007 bitmap_iterator bi;
4008 unsigned id;
4009 int converted_insns = 0;
4010
4011 if (!dbg_cnt (stv_conversion))
4012 return 0;
4013
4014 if (dump_file)
4015 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4016
4017 convert_registers ();
4018
4019 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4020 {
4021 convert_insn (DF_INSN_UID_GET (id)->insn);
4022 converted_insns++;
4023 }
4024
4025 return converted_insns;
4026 }
4027
4028 /* Main STV pass function. Find and convert scalar
4029 instructions into vector mode when profitable. */
4030
4031 static unsigned int
4032 convert_scalars_to_vector ()
4033 {
4034 basic_block bb;
4035 bitmap candidates;
4036 int converted_insns = 0;
4037
4038 bitmap_obstack_initialize (NULL);
4039 candidates = BITMAP_ALLOC (NULL);
4040
4041 calculate_dominance_info (CDI_DOMINATORS);
4042 df_set_flags (DF_DEFER_INSN_RESCAN);
4043 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4044 df_md_add_problem ();
4045 df_analyze ();
4046
4047 /* Find all instructions we want to convert into vector mode. */
4048 if (dump_file)
4049 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4050
4051 FOR_EACH_BB_FN (bb, cfun)
4052 {
4053 rtx_insn *insn;
4054 FOR_BB_INSNS (bb, insn)
4055 if (scalar_to_vector_candidate_p (insn))
4056 {
4057 if (dump_file)
4058 fprintf (dump_file, " insn %d is marked as a candidate\n",
4059 INSN_UID (insn));
4060
4061 bitmap_set_bit (candidates, INSN_UID (insn));
4062 }
4063 }
4064
4065 remove_non_convertible_regs (candidates);
4066
4067 if (bitmap_empty_p (candidates))
4068 if (dump_file)
4069 fprintf (dump_file, "There are no candidates for optimization.\n");
4070
4071 while (!bitmap_empty_p (candidates))
4072 {
4073 unsigned uid = bitmap_first_set_bit (candidates);
4074 scalar_chain *chain;
4075
4076 if (TARGET_64BIT)
4077 chain = new timode_scalar_chain;
4078 else
4079 chain = new dimode_scalar_chain;
4080
4081       /* Find the instruction chain we want to convert to vector mode.
4082 Check all uses and definitions to estimate all required
4083 conversions. */
4084 chain->build (candidates, uid);
4085
4086 if (chain->compute_convert_gain () > 0)
4087 converted_insns += chain->convert ();
4088 else
4089 if (dump_file)
4090 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4091 chain->chain_id);
4092
4093 delete chain;
4094 }
4095
4096 if (dump_file)
4097 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4098
4099 BITMAP_FREE (candidates);
4100 bitmap_obstack_release (NULL);
4101 df_process_deferred_rescans ();
4102
4103   /* Conversion means we may have 128-bit register spills/fills
4104      which require an aligned stack. */
4105 if (converted_insns)
4106 {
4107 if (crtl->stack_alignment_needed < 128)
4108 crtl->stack_alignment_needed = 128;
4109 if (crtl->stack_alignment_estimated < 128)
4110 crtl->stack_alignment_estimated = 128;
4111 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4112 if (TARGET_64BIT)
4113 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4114 parm; parm = DECL_CHAIN (parm))
4115 {
4116 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4117 continue;
4118 if (DECL_RTL_SET_P (parm)
4119 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4120 {
4121 rtx r = DECL_RTL (parm);
4122 if (REG_P (r))
4123 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4124 }
4125 if (DECL_INCOMING_RTL (parm)
4126 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4127 {
4128 rtx r = DECL_INCOMING_RTL (parm);
4129 if (REG_P (r))
4130 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4131 }
4132 }
4133 }
4134
4135 return 0;
4136 }
4137
4138 namespace {
4139
4140 const pass_data pass_data_insert_vzeroupper =
4141 {
4142 RTL_PASS, /* type */
4143 "vzeroupper", /* name */
4144 OPTGROUP_NONE, /* optinfo_flags */
4145 TV_MACH_DEP, /* tv_id */
4146 0, /* properties_required */
4147 0, /* properties_provided */
4148 0, /* properties_destroyed */
4149 0, /* todo_flags_start */
4150 TODO_df_finish, /* todo_flags_finish */
4151 };
4152
4153 class pass_insert_vzeroupper : public rtl_opt_pass
4154 {
4155 public:
4156 pass_insert_vzeroupper(gcc::context *ctxt)
4157 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4158 {}
4159
4160 /* opt_pass methods: */
4161 virtual bool gate (function *)
4162 {
4163 return TARGET_AVX && !TARGET_AVX512F
4164 && TARGET_VZEROUPPER && flag_expensive_optimizations
4165 && !optimize_size;
4166 }
4167
4168 virtual unsigned int execute (function *)
4169 {
4170 return rest_of_handle_insert_vzeroupper ();
4171 }
4172
4173 }; // class pass_insert_vzeroupper
4174
4175 const pass_data pass_data_stv =
4176 {
4177 RTL_PASS, /* type */
4178 "stv", /* name */
4179 OPTGROUP_NONE, /* optinfo_flags */
4180 TV_MACH_DEP, /* tv_id */
4181 0, /* properties_required */
4182 0, /* properties_provided */
4183 0, /* properties_destroyed */
4184 0, /* todo_flags_start */
4185 TODO_df_finish, /* todo_flags_finish */
4186 };
4187
4188 class pass_stv : public rtl_opt_pass
4189 {
4190 public:
4191 pass_stv (gcc::context *ctxt)
4192 : rtl_opt_pass (pass_data_stv, ctxt),
4193 timode_p (false)
4194 {}
4195
4196 /* opt_pass methods: */
4197 virtual bool gate (function *)
4198 {
4199 return (timode_p == !!TARGET_64BIT
4200 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4201 }
4202
4203 virtual unsigned int execute (function *)
4204 {
4205 return convert_scalars_to_vector ();
4206 }
4207
4208 opt_pass *clone ()
4209 {
4210 return new pass_stv (m_ctxt);
4211 }
4212
4213 void set_pass_param (unsigned int n, bool param)
4214 {
4215 gcc_assert (n == 0);
4216 timode_p = param;
4217 }
4218
4219 private:
4220 bool timode_p;
4221 }; // class pass_stv
4222
4223 } // anon namespace
4224
4225 rtl_opt_pass *
4226 make_pass_insert_vzeroupper (gcc::context *ctxt)
4227 {
4228 return new pass_insert_vzeroupper (ctxt);
4229 }
4230
4231 rtl_opt_pass *
4232 make_pass_stv (gcc::context *ctxt)
4233 {
4234 return new pass_stv (ctxt);
4235 }
4236
4237 /* Return true if a red-zone is in use. */
4238
4239 bool
4240 ix86_using_red_zone (void)
4241 {
4242 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4243 }
4244 \f
4245 /* Return a string that documents the current -m options. The caller is
4246 responsible for freeing the string. */
4247
4248 static char *
4249 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, int flags,
4250 int ix86_flags, const char *arch, const char *tune,
4251 enum fpmath_unit fpmath, bool add_nl_p)
4252 {
4253 struct ix86_target_opts
4254 {
4255 const char *option; /* option string */
4256 HOST_WIDE_INT mask; /* isa mask options */
4257 };
4258
4259   /* This table is ordered so that options like -msse4.2, which imply
4260      preceding options, are matched first. */
4261 static struct ix86_target_opts isa_opts[] =
4262 {
4263 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4264 { "-mfma", OPTION_MASK_ISA_FMA },
4265 { "-mxop", OPTION_MASK_ISA_XOP },
4266 { "-mlwp", OPTION_MASK_ISA_LWP },
4267 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4268 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4269 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4270 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4271 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4272 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4273 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4274 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4275 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4276 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4277 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4278 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4279 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4280 { "-msse3", OPTION_MASK_ISA_SSE3 },
4281 { "-msse2", OPTION_MASK_ISA_SSE2 },
4282 { "-msse", OPTION_MASK_ISA_SSE },
4283 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4284 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4285 { "-mmmx", OPTION_MASK_ISA_MMX },
4286 { "-mabm", OPTION_MASK_ISA_ABM },
4287 { "-mbmi", OPTION_MASK_ISA_BMI },
4288 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4289 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4290 { "-mhle", OPTION_MASK_ISA_HLE },
4291 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4292 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4293 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4294 { "-madx", OPTION_MASK_ISA_ADX },
4295 { "-mtbm", OPTION_MASK_ISA_TBM },
4296 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4297 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4298 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4299 { "-maes", OPTION_MASK_ISA_AES },
4300 { "-msha", OPTION_MASK_ISA_SHA },
4301 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4302 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4303 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4304 { "-mf16c", OPTION_MASK_ISA_F16C },
4305 { "-mrtm", OPTION_MASK_ISA_RTM },
4306 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4307 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4308 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4309 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4310 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4311 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4312 { "-mmpx", OPTION_MASK_ISA_MPX },
4313 { "-mclwb", OPTION_MASK_ISA_CLWB },
4314 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4315 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4316 { "-mpku", OPTION_MASK_ISA_PKU },
4317 };
4318 /* Additional structure for isa flags. */
4319 static struct ix86_target_opts isa_opts2[] =
4320 {
4321 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4322 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4323 };
4324 /* Flag options. */
4325 static struct ix86_target_opts flag_opts[] =
4326 {
4327 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4328 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4329 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4330 { "-m80387", MASK_80387 },
4331 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4332 { "-malign-double", MASK_ALIGN_DOUBLE },
4333 { "-mcld", MASK_CLD },
4334 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4335 { "-mieee-fp", MASK_IEEE_FP },
4336 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4337 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4338 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4339 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4340 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4341 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4342 { "-mno-red-zone", MASK_NO_RED_ZONE },
4343 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4344 { "-mrecip", MASK_RECIP },
4345 { "-mrtd", MASK_RTD },
4346 { "-msseregparm", MASK_SSEREGPARM },
4347 { "-mstack-arg-probe", MASK_STACK_PROBE },
4348 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4349 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4350 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4351 { "-mvzeroupper", MASK_VZEROUPPER },
4352 { "-mstv", MASK_STV},
4353 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
4354 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
4355 { "-mprefer-avx128", MASK_PREFER_AVX128},
4356 };
4357
4358 /* Additional flag options. */
4359 static struct ix86_target_opts ix86_flag_opts[] =
4360 {
4361 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4362 };
4363
4364 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa_opts2)
4365 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (ix86_flag_opts) + 6][2];
4366
4367 char isa_other[40];
4368 char target_other[40];
4369 char ix86_target_other[40];
4370 unsigned num = 0;
4371 unsigned i, j;
4372 char *ret;
4373 char *ptr;
4374 size_t len;
4375 size_t line_len;
4376 size_t sep_len;
4377 const char *abi;
4378
4379 memset (opts, '\0', sizeof (opts));
4380
4381 /* Add -march= option. */
4382 if (arch)
4383 {
4384 opts[num][0] = "-march=";
4385 opts[num++][1] = arch;
4386 }
4387
4388 /* Add -mtune= option. */
4389 if (tune)
4390 {
4391 opts[num][0] = "-mtune=";
4392 opts[num++][1] = tune;
4393 }
4394
4395 /* Add -m32/-m64/-mx32. */
4396 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4397 {
4398 if ((isa & OPTION_MASK_ABI_64) != 0)
4399 abi = "-m64";
4400 else
4401 abi = "-mx32";
4402 isa &= ~ (OPTION_MASK_ISA_64BIT
4403 | OPTION_MASK_ABI_64
4404 | OPTION_MASK_ABI_X32);
4405 }
4406 else
4407 abi = "-m32";
4408 opts[num++][0] = abi;
4409
4410 /* Pick out the options in isa options. */
4411 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4412 {
4413 if ((isa & isa_opts[i].mask) != 0)
4414 {
4415 opts[num++][0] = isa_opts[i].option;
4416 isa &= ~ isa_opts[i].mask;
4417 }
4418 }
4419
4420 if (isa && add_nl_p)
4421 {
4422 opts[num++][0] = isa_other;
4423 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
4424 isa);
4425 }
4426
4427 /* Pick out the options in isa2 options. */
4428 for (i = 0; i < ARRAY_SIZE (isa_opts2); i++)
4429 {
4430 if ((isa2 & isa_opts2[i].mask) != 0)
4431 {
4432 opts[num++][0] = isa_opts2[i].option;
4433           isa2 &= ~ isa_opts2[i].mask;
4434 }
4435 }
4436
4437 /* Add flag options. */
4438 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4439 {
4440 if ((flags & flag_opts[i].mask) != 0)
4441 {
4442 opts[num++][0] = flag_opts[i].option;
4443 flags &= ~ flag_opts[i].mask;
4444 }
4445 }
4446
4447 if (flags && add_nl_p)
4448 {
4449 opts[num++][0] = target_other;
4450 sprintf (target_other, "(other flags: %#x)", flags);
4451 }
4452
4453 /* Add additional flag options. */
4454 for (i = 0; i < ARRAY_SIZE (ix86_flag_opts); i++)
4455 {
4456 if ((ix86_flags & ix86_flag_opts[i].mask) != 0)
4457 {
4458 opts[num++][0] = ix86_flag_opts[i].option;
4459 ix86_flags &= ~ ix86_flag_opts[i].mask;
4460 }
4461 }
4462
4463 if (ix86_flags && add_nl_p)
4464 {
4465 opts[num++][0] = ix86_target_other;
4466 sprintf (ix86_target_other, "(other flags: %#x)", ix86_flags);
4467 }
4468
4469 /* Add -fpmath= option. */
4470 if (fpmath)
4471 {
4472 opts[num][0] = "-mfpmath=";
4473 switch ((int) fpmath)
4474 {
4475 case FPMATH_387:
4476 opts[num++][1] = "387";
4477 break;
4478
4479 case FPMATH_SSE:
4480 opts[num++][1] = "sse";
4481 break;
4482
4483 case FPMATH_387 | FPMATH_SSE:
4484 opts[num++][1] = "sse+387";
4485 break;
4486
4487 default:
4488 gcc_unreachable ();
4489 }
4490 }
4491
4492 /* Any options? */
4493 if (num == 0)
4494 return NULL;
4495
4496 gcc_assert (num < ARRAY_SIZE (opts));
4497
4498 /* Size the string. */
4499 len = 0;
4500 sep_len = (add_nl_p) ? 3 : 1;
4501 for (i = 0; i < num; i++)
4502 {
4503 len += sep_len;
4504 for (j = 0; j < 2; j++)
4505 if (opts[i][j])
4506 len += strlen (opts[i][j]);
4507 }
4508
4509 /* Build the string. */
4510 ret = ptr = (char *) xmalloc (len);
4511 line_len = 0;
4512
4513 for (i = 0; i < num; i++)
4514 {
4515 size_t len2[2];
4516
4517 for (j = 0; j < 2; j++)
4518 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4519
4520 if (i != 0)
4521 {
4522 *ptr++ = ' ';
4523 line_len++;
4524
4525 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4526 {
4527 *ptr++ = '\\';
4528 *ptr++ = '\n';
4529 line_len = 0;
4530 }
4531 }
4532
4533 for (j = 0; j < 2; j++)
4534 if (opts[i][j])
4535 {
4536 memcpy (ptr, opts[i][j], len2[j]);
4537 ptr += len2[j];
4538 line_len += len2[j];
4539 }
4540 }
4541
4542 *ptr = '\0';
4543 gcc_assert (ret + len >= ptr);
4544
4545 return ret;
4546 }
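
/* The returned string looks, for instance, something like

     "-m64 -mmmx -msse -msse2 -mfpmath=sse"

   (illustrative only; the exact contents depend on the enabled ISA
   masks, target flags and the ARCH/TUNE strings passed in).  */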
4547
4548 /* Return true if profiling code should be emitted before the
4549    prologue, false otherwise.
4550    Note: For x86 with "hotfix" it is sorried. */
4551 static bool
4552 ix86_profile_before_prologue (void)
4553 {
4554 return flag_fentry != 0;
4555 }
4556
4557 /* Function that is callable from the debugger to print the current
4558 options. */
4559 void ATTRIBUTE_UNUSED
4560 ix86_debug_options (void)
4561 {
4562 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4563 target_flags, ix86_target_flags,
4564 ix86_arch_string,ix86_tune_string,
4565 ix86_fpmath, true);
4566
4567 if (opts)
4568 {
4569 fprintf (stderr, "%s\n\n", opts);
4570 free (opts);
4571 }
4572 else
4573 fputs ("<no options>\n\n", stderr);
4574
4575 return;
4576 }
4577
4578 /* Return true if T is one of the bytes we should avoid with
4579 -fmitigate-rop. */
4580
4581 static bool
4582 ix86_rop_should_change_byte_p (int t)
4583 {
4584 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4585 }
4586
4587 static const char *stringop_alg_names[] = {
4588 #define DEF_ENUM
4589 #define DEF_ALG(alg, name) #name,
4590 #include "stringop.def"
4591 #undef DEF_ENUM
4592 #undef DEF_ALG
4593 };
4594
4595 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4596    The string is of the following form (or a comma-separated list of such forms):
4597
4598 strategy_alg:max_size:[align|noalign]
4599
4600 where the full size range for the strategy is either [0, max_size] or
4601 [min_size, max_size], in which min_size is the max_size + 1 of the
4602 preceding range. The last size range must have max_size == -1.
4603
4604 Examples:
4605
4606 1.
4607 -mmemcpy-strategy=libcall:-1:noalign
4608
4609 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4610
4611
4612 2.
4613 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4614
4615 This is to tell the compiler to use the following strategy for memset
4616 1) when the expected size is between [1, 16], use rep_8byte strategy;
4617 2) when the size is between [17, 2048], use vector_loop;
4618 3) when the size is > 2048, use libcall. */
4619
4620 struct stringop_size_range
4621 {
4622 int max;
4623 stringop_alg alg;
4624 bool noalign;
4625 };
4626
4627 static void
4628 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4629 {
4630 const struct stringop_algs *default_algs;
4631 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4632 char *curr_range_str, *next_range_str;
4633 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4634 int i = 0, n = 0;
4635
4636 if (is_memset)
4637 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4638 else
4639 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4640
4641 curr_range_str = strategy_str;
4642
4643 do
4644 {
4645 int maxs;
4646 char alg_name[128];
4647 char align[16];
4648 next_range_str = strchr (curr_range_str, ',');
4649 if (next_range_str)
4650 *next_range_str++ = '\0';
4651
4652 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4653 alg_name, &maxs, align))
4654 {
4655 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4656 return;
4657 }
4658
4659 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4660 {
4661 error ("size ranges of option %qs should be increasing", opt);
4662 return;
4663 }
4664
4665 for (i = 0; i < last_alg; i++)
4666 if (!strcmp (alg_name, stringop_alg_names[i]))
4667 break;
4668
4669 if (i == last_alg)
4670 {
4671 error ("wrong strategy name %qs specified for option %qs",
4672 alg_name, opt);
4673
4674 auto_vec <const char *> candidates;
4675 for (i = 0; i < last_alg; i++)
4676 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4677 candidates.safe_push (stringop_alg_names[i]);
4678
4679 char *s;
4680 const char *hint
4681 = candidates_list_and_hint (alg_name, s, candidates);
4682 if (hint)
4683 inform (input_location,
4684 "valid arguments to %qs are: %s; did you mean %qs?",
4685 opt, s, hint);
4686 else
4687 inform (input_location, "valid arguments to %qs are: %s",
4688 opt, s);
4689 XDELETEVEC (s);
4690 return;
4691 }
4692
4693 if ((stringop_alg) i == rep_prefix_8_byte
4694 && !TARGET_64BIT)
4695 {
4696 /* rep; movq isn't available in 32-bit code. */
4697 error ("strategy name %qs specified for option %qs "
4698 "not supported for 32-bit code", alg_name, opt);
4699 return;
4700 }
4701
4702 input_ranges[n].max = maxs;
4703 input_ranges[n].alg = (stringop_alg) i;
4704 if (!strcmp (align, "align"))
4705 input_ranges[n].noalign = false;
4706 else if (!strcmp (align, "noalign"))
4707 input_ranges[n].noalign = true;
4708 else
4709 {
4710 error ("unknown alignment %qs specified for option %qs", align, opt);
4711 return;
4712 }
4713 n++;
4714 curr_range_str = next_range_str;
4715 }
4716 while (curr_range_str);
4717
4718 if (input_ranges[n - 1].max != -1)
4719 {
4720 error ("the max value for the last size range should be -1"
4721 " for option %qs", opt);
4722 return;
4723 }
4724
4725 if (n > MAX_STRINGOP_ALGS)
4726 {
4727 error ("too many size ranges specified in option %qs", opt);
4728 return;
4729 }
4730
4731 /* Now override the default algs array. */
4732 for (i = 0; i < n; i++)
4733 {
4734 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4735 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4736 = input_ranges[i].alg;
4737 *const_cast<int *>(&default_algs->size[i].noalign)
4738 = input_ranges[i].noalign;
4739 }
4740 }
4741
4742 \f
4743 /* Parse the -mtune-ctrl= option.  When DUMP is true,
4744 print the features that are explicitly set. */
4745
4746 static void
4747 parse_mtune_ctrl_str (bool dump)
4748 {
4749 if (!ix86_tune_ctrl_string)
4750 return;
4751
4752 char *next_feature_string = NULL;
4753 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4754 char *orig = curr_feature_string;
4755 int i;
4756 do
4757 {
4758 bool clear = false;
4759
4760 next_feature_string = strchr (curr_feature_string, ',');
4761 if (next_feature_string)
4762 *next_feature_string++ = '\0';
4763 if (*curr_feature_string == '^')
4764 {
4765 curr_feature_string++;
4766 clear = true;
4767 }
4768 for (i = 0; i < X86_TUNE_LAST; i++)
4769 {
4770 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4771 {
4772 ix86_tune_features[i] = !clear;
4773 if (dump)
4774 fprintf (stderr, "Explicitly %s feature %s\n",
4775 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4776 break;
4777 }
4778 }
4779 if (i == X86_TUNE_LAST)
4780 error ("Unknown parameter to option -mtune-ctrl: %s",
4781 clear ? curr_feature_string - 1 : curr_feature_string);
4782 curr_feature_string = next_feature_string;
4783 }
4784 while (curr_feature_string);
4785 free (orig);
4786 }
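
/* For example (feature names are hypothetical),
   -mtune-ctrl=foo,^bar would set the tuning feature "foo" and clear
   "bar"; any name not found in ix86_tune_feature_names is reported
   with an error.  */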
4787
4788 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4789 processor type. */
4790
4791 static void
4792 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4793 {
4794 unsigned int ix86_tune_mask = 1u << ix86_tune;
4795 int i;
4796
4797 for (i = 0; i < X86_TUNE_LAST; ++i)
4798 {
4799 if (ix86_tune_no_default)
4800 ix86_tune_features[i] = 0;
4801 else
4802 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4803 }
4804
4805 if (dump)
4806 {
4807 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4808 for (i = 0; i < X86_TUNE_LAST; i++)
4809 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4810 ix86_tune_features[i] ? "on" : "off");
4811 }
4812
4813 parse_mtune_ctrl_str (dump);
4814 }
4815
4816
4817 /* Default align_* from the processor table. */
4818
4819 static void
4820 ix86_default_align (struct gcc_options *opts)
4821 {
4822 if (opts->x_align_loops == 0)
4823 {
4824 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4825 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4826 }
4827 if (opts->x_align_jumps == 0)
4828 {
4829 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4830 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4831 }
4832 if (opts->x_align_functions == 0)
4833 {
4834 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4835 }
4836 }
4837
4838 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4839
4840 static void
4841 ix86_override_options_after_change (void)
4842 {
4843 ix86_default_align (&global_options);
4844 }
4845
4846 /* Override various settings based on options. If MAIN_ARGS_P, the
4847 options are from the command line, otherwise they are from
4848 attributes. Return true if there's an error related to march
4849 option. */
4850
4851 static bool
4852 ix86_option_override_internal (bool main_args_p,
4853 struct gcc_options *opts,
4854 struct gcc_options *opts_set)
4855 {
4856 int i;
4857 unsigned int ix86_arch_mask;
4858 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4859
4860 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4861 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4862 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4863 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4864 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4865 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4866 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4867 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4868 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4869 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4870 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4871 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4872 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4873 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4874 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4875 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4876 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4877 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4878 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4879 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4880 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4881 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4882 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4883 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4884 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4885 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4886 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4887 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4888 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4889 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4890 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4891 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4892 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4893 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4894 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4895 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4896 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4897 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4898 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4899 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4900 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4901 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4902 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4903 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4904 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4905 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4906 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4907 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4908 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4909 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4910 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4911 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4912 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4913 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4914 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4915 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4916 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4917 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4918 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4919 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4920 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
4921 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
4922
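/* Composite ISA sets for the -march= processor names below.  Each later
   family ORs its predecessor's bits with the extensions it adds, so e.g.
   PTA_HASWELL is PTA_IVYBRIDGE plus AVX2, BMI, BMI2, LZCNT, FMA, MOVBE
   and HLE.  */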
4923 #define PTA_CORE2 \
4924 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4925 | PTA_CX16 | PTA_FXSR)
4926 #define PTA_NEHALEM \
4927 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4928 #define PTA_WESTMERE \
4929 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4930 #define PTA_SANDYBRIDGE \
4931 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4932 #define PTA_IVYBRIDGE \
4933 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4934 #define PTA_HASWELL \
4935 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4936 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4937 #define PTA_BROADWELL \
4938 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4939 #define PTA_SKYLAKE \
4940 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4941 #define PTA_SKYLAKE_AVX512 \
4942 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4943 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4944 #define PTA_KNL \
4945 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4946 #define PTA_BONNELL \
4947 (PTA_CORE2 | PTA_MOVBE)
4948 #define PTA_SILVERMONT \
4949 (PTA_WESTMERE | PTA_MOVBE)
4950
4951 /* If this reaches 64, we need to widen the struct pta flags field below.  */
4952
4953 static struct pta
4954 {
4955 const char *const name; /* processor name or nickname. */
4956 const enum processor_type processor;
4957 const enum attr_cpu schedule;
4958 const unsigned HOST_WIDE_INT flags;
4959 }
4960 const processor_alias_table[] =
4961 {
4962 {"i386", PROCESSOR_I386, CPU_NONE, 0},
4963 {"i486", PROCESSOR_I486, CPU_NONE, 0},
4964 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4965 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4966 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
4967 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
4968 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
4969 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4970 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4971 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4972 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4973 PTA_MMX | PTA_SSE | PTA_FXSR},
4974 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4975 PTA_MMX | PTA_SSE | PTA_FXSR},
4976 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4977 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4978 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4979 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4980 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4981 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4982 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
4983 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4984 PTA_MMX | PTA_SSE | PTA_FXSR},
4985 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4986 PTA_MMX | PTA_SSE | PTA_FXSR},
4987 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4988 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4989 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
4990 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4991 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
4992 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4993 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
4994 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4995 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
4996 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4997 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
4998 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
4999 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5000 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5001 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5002 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5003 PTA_SANDYBRIDGE},
5004 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5005 PTA_SANDYBRIDGE},
5006 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5007 PTA_IVYBRIDGE},
5008 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5009 PTA_IVYBRIDGE},
5010 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5011 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5012 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5013 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5014 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5015 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5016 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5017 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5018 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5019 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5020 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5021 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5022 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5023 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5024 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5025 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5026 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5027 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5028 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5029 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5030 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5031 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5032 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5033 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5034 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5035 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5036 {"x86-64", PROCESSOR_K8, CPU_K8,
5037 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5038 {"eden-x2", PROCESSOR_K8, CPU_K8,
5039 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5040 {"nano", PROCESSOR_K8, CPU_K8,
5041 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5042 | PTA_SSSE3 | PTA_FXSR},
5043 {"nano-1000", PROCESSOR_K8, CPU_K8,
5044 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5045 | PTA_SSSE3 | PTA_FXSR},
5046 {"nano-2000", PROCESSOR_K8, CPU_K8,
5047 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5048 | PTA_SSSE3 | PTA_FXSR},
5049 {"nano-3000", PROCESSOR_K8, CPU_K8,
5050 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5051 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5052 {"nano-x2", PROCESSOR_K8, CPU_K8,
5053 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5054 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5055 {"eden-x4", PROCESSOR_K8, CPU_K8,
5056 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5057 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5058 {"nano-x4", PROCESSOR_K8, CPU_K8,
5059 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5060 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5061 {"k8", PROCESSOR_K8, CPU_K8,
5062 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5063 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5064 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5065 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5066 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5067 {"opteron", PROCESSOR_K8, CPU_K8,
5068 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5069 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5070 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5071 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5072 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5073 {"athlon64", PROCESSOR_K8, CPU_K8,
5074 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5075 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5076 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5077 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5078 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5079 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5080 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5081 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5082 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5083 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5084 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5085 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5086 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5087 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5088 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5089 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5090 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5091 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5092 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5093 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5094 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5095 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5096 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5097 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5098 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5099 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5100 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5101 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5102 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5103 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5104 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5105 | PTA_XSAVEOPT | PTA_FSGSBASE},
5106 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5107 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5108 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5109 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5110 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5111 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5112 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5113 | PTA_MOVBE | PTA_MWAITX},
5114 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5115 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5116 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5117 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5118 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5119 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5120 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5121 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5122 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5123 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5124 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5125 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5126 | PTA_FXSR | PTA_XSAVE},
5127 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5128 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5129 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5130 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5131 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5132 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5133
5134 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5135 PTA_64BIT
5136 | PTA_HLE /* flags are only used for -march switch. */ },
5137 };
5138
5139 /* -mrecip options. */
5140 static struct
5141 {
5142 const char *string; /* option name */
5143 unsigned int mask; /* mask bits to set */
5144 }
5145 const recip_options[] =
5146 {
5147 { "all", RECIP_MASK_ALL },
5148 { "none", RECIP_MASK_NONE },
5149 { "div", RECIP_MASK_DIV },
5150 { "sqrt", RECIP_MASK_SQRT },
5151 { "vec-div", RECIP_MASK_VEC_DIV },
5152 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5153 };
5154
5155 int const pta_size = ARRAY_SIZE (processor_alias_table);
5156
5157 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5158 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5159 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5160 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5161 #ifdef TARGET_BI_ARCH
5162 else
5163 {
5164 #if TARGET_BI_ARCH == 1
5165 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5166 is on and OPTION_MASK_ABI_X32 is off. We turn off
5167 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5168 -mx32. */
5169 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5170 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5171 #else
5172 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5173 on and OPTION_MASK_ABI_64 is off. We turn off
5174 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5175 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5176 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5177 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5178 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5179 #endif
5180 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5181 && TARGET_IAMCU_P (opts->x_target_flags))
5182 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5183 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5184 }
5185 #endif
5186
5187 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5188 {
5189 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5190 OPTION_MASK_ABI_64 for TARGET_X32. */
5191 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5192 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5193 }
5194 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5195 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5196 | OPTION_MASK_ABI_X32
5197 | OPTION_MASK_ABI_64);
5198 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5199 {
5200 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5201 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5202 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5203 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5204 }
5205
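/* Give the subtarget (OS-specific) headers a chance to adjust the options
   before the generic handling below; these macros are only defined by
   configurations that need them.  */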
5206 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5207 SUBTARGET_OVERRIDE_OPTIONS;
5208 #endif
5209
5210 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5211 SUBSUBTARGET_OVERRIDE_OPTIONS;
5212 #endif
5213
5214 /* -fPIC is the default for x86_64. */
5215 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5216 opts->x_flag_pic = 2;
5217
5218 /* Need to check -mtune=generic first. */
5219 if (opts->x_ix86_tune_string)
5220 {
5221 /* As special support for cross compilers we read -mtune=native
5222 as -mtune=generic. With native compilers we won't see
5223 -mtune=native, as it was already changed by the driver. */
5224 if (!strcmp (opts->x_ix86_tune_string, "native"))
5225 {
5226 opts->x_ix86_tune_string = "generic";
5227 }
5228 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5229 warning (OPT_Wdeprecated,
5230 main_args_p
5231 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5232 "or %<-mtune=generic%> instead as appropriate"
5233 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5234 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5235 "instead as appropriate");
5236 }
5237 else
5238 {
5239 if (opts->x_ix86_arch_string)
5240 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5241 if (!opts->x_ix86_tune_string)
5242 {
5243 opts->x_ix86_tune_string
5244 = processor_target_table[TARGET_CPU_DEFAULT].name;
5245 ix86_tune_defaulted = 1;
5246 }
5247
5248 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5249 or defaulted. We need to use a sensible tune option. */
5250 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5251 {
5252 opts->x_ix86_tune_string = "generic";
5253 }
5254 }
5255
5256 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5257 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5258 {
5259 /* rep; movq isn't available in 32-bit code. */
5260 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5261 opts->x_ix86_stringop_alg = no_stringop;
5262 }
5263
5264 if (!opts->x_ix86_arch_string)
5265 opts->x_ix86_arch_string
5266 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5267 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5268 else
5269 ix86_arch_specified = 1;
5270
5271 if (opts_set->x_ix86_pmode)
5272 {
5273 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5274 && opts->x_ix86_pmode == PMODE_SI)
5275 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5276 && opts->x_ix86_pmode == PMODE_DI))
5277 error ("address mode %qs not supported in the %s bit mode",
5278 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5279 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5280 }
5281 else
5282 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5283 ? PMODE_DI : PMODE_SI;
5284
5285 if (!opts_set->x_ix86_abi)
5286 opts->x_ix86_abi = DEFAULT_ABI;
5287
5288 /* For targets using the MS ABI, enable ms-extensions unless it was
5289 explicitly turned off. For non-MS ABIs we turn this option
5290 off. */
5291 if (!opts_set->x_flag_ms_extensions)
5292 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5293
5294 if (opts_set->x_ix86_cmodel)
5295 {
5296 switch (opts->x_ix86_cmodel)
5297 {
5298 case CM_SMALL:
5299 case CM_SMALL_PIC:
5300 if (opts->x_flag_pic)
5301 opts->x_ix86_cmodel = CM_SMALL_PIC;
5302 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5303 error ("code model %qs not supported in the %s bit mode",
5304 "small", "32");
5305 break;
5306
5307 case CM_MEDIUM:
5308 case CM_MEDIUM_PIC:
5309 if (opts->x_flag_pic)
5310 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5311 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5312 error ("code model %qs not supported in the %s bit mode",
5313 "medium", "32");
5314 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5315 error ("code model %qs not supported in x32 mode",
5316 "medium");
5317 break;
5318
5319 case CM_LARGE:
5320 case CM_LARGE_PIC:
5321 if (opts->x_flag_pic)
5322 opts->x_ix86_cmodel = CM_LARGE_PIC;
5323 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5324 error ("code model %qs not supported in the %s bit mode",
5325 "large", "32");
5326 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5327 error ("code model %qs not supported in x32 mode",
5328 "large");
5329 break;
5330
5331 case CM_32:
5332 if (opts->x_flag_pic)
5333 error ("code model %s does not support PIC mode", "32");
5334 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5335 error ("code model %qs not supported in the %s bit mode",
5336 "32", "64");
5337 break;
5338
5339 case CM_KERNEL:
5340 if (opts->x_flag_pic)
5341 {
5342 error ("code model %s does not support PIC mode", "kernel");
5343 opts->x_ix86_cmodel = CM_32;
5344 }
5345 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5346 error ("code model %qs not supported in the %s bit mode",
5347 "kernel", "32");
5348 break;
5349
5350 default:
5351 gcc_unreachable ();
5352 }
5353 }
5354 else
5355 {
5356 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5357 use of rip-relative addressing. This eliminates fixups that
5358 would otherwise be needed if this object is to be placed in a
5359 DLL, and is essentially just as efficient as direct addressing. */
5360 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5361 && (TARGET_RDOS || TARGET_PECOFF))
5362 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5363 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5364 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5365 else
5366 opts->x_ix86_cmodel = CM_32;
5367 }
5368 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5369 {
5370 error ("-masm=intel not supported in this configuration");
5371 opts->x_ix86_asm_dialect = ASM_ATT;
5372 }
5373 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5374 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5375 sorry ("%i-bit mode not compiled in",
5376 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5377
5378 for (i = 0; i < pta_size; i++)
5379 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5380 {
5381 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5382 {
5383 error (main_args_p
5384 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5385 : "%<generic%> CPU can be used only for "
5386 "%<target(\"tune=\")%> attribute");
5387 return false;
5388 }
5389 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5390 {
5391 error (main_args_p
5392 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5393 : "%<intel%> CPU can be used only for "
5394 "%<target(\"tune=\")%> attribute");
5395 return false;
5396 }
5397
5398 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5399 && !(processor_alias_table[i].flags & PTA_64BIT))
5400 {
5401 error ("CPU you selected does not support x86-64 "
5402 "instruction set");
5403 return false;
5404 }
5405
5406 ix86_schedule = processor_alias_table[i].schedule;
5407 ix86_arch = processor_alias_table[i].processor;
5408 /* Default cpu tuning to the architecture. */
5409 ix86_tune = ix86_arch;
5410
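/* Translate each PTA_* capability bit of the selected -march= entry into
   the corresponding OPTION_MASK_ISA_* flag, unless the user has already
   set that ISA flag explicitly.  */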
5411 if (processor_alias_table[i].flags & PTA_MMX
5412 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5413 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5414 if (processor_alias_table[i].flags & PTA_3DNOW
5415 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5416 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5417 if (processor_alias_table[i].flags & PTA_3DNOW_A
5418 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5419 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5420 if (processor_alias_table[i].flags & PTA_SSE
5421 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5422 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5423 if (processor_alias_table[i].flags & PTA_SSE2
5424 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5425 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5426 if (processor_alias_table[i].flags & PTA_SSE3
5427 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5428 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5429 if (processor_alias_table[i].flags & PTA_SSSE3
5430 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5431 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5432 if (processor_alias_table[i].flags & PTA_SSE4_1
5433 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5434 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5435 if (processor_alias_table[i].flags & PTA_SSE4_2
5436 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5437 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5438 if (processor_alias_table[i].flags & PTA_AVX
5439 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5440 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5441 if (processor_alias_table[i].flags & PTA_AVX2
5442 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5443 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5444 if (processor_alias_table[i].flags & PTA_FMA
5445 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5446 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5447 if (processor_alias_table[i].flags & PTA_SSE4A
5448 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5449 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5450 if (processor_alias_table[i].flags & PTA_FMA4
5451 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5452 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5453 if (processor_alias_table[i].flags & PTA_XOP
5454 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5455 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5456 if (processor_alias_table[i].flags & PTA_LWP
5457 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5458 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5459 if (processor_alias_table[i].flags & PTA_ABM
5460 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5461 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5462 if (processor_alias_table[i].flags & PTA_BMI
5463 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5464 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5465 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5466 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5467 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5468 if (processor_alias_table[i].flags & PTA_TBM
5469 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5470 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5471 if (processor_alias_table[i].flags & PTA_BMI2
5472 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5473 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5474 if (processor_alias_table[i].flags & PTA_CX16
5475 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5476 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5477 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5478 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5479 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5480 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5481 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5484 if (processor_alias_table[i].flags & PTA_MOVBE
5485 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5486 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5487 if (processor_alias_table[i].flags & PTA_AES
5488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5490 if (processor_alias_table[i].flags & PTA_SHA
5491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5493 if (processor_alias_table[i].flags & PTA_PCLMUL
5494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5496 if (processor_alias_table[i].flags & PTA_FSGSBASE
5497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5499 if (processor_alias_table[i].flags & PTA_RDRND
5500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5502 if (processor_alias_table[i].flags & PTA_F16C
5503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5505 if (processor_alias_table[i].flags & PTA_RTM
5506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5508 if (processor_alias_table[i].flags & PTA_HLE
5509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5511 if (processor_alias_table[i].flags & PTA_PRFCHW
5512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5514 if (processor_alias_table[i].flags & PTA_RDSEED
5515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5517 if (processor_alias_table[i].flags & PTA_ADX
5518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5520 if (processor_alias_table[i].flags & PTA_FXSR
5521 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5522 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5523 if (processor_alias_table[i].flags & PTA_XSAVE
5524 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5525 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5526 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5527 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5528 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5529 if (processor_alias_table[i].flags & PTA_AVX512F
5530 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5531 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5532 if (processor_alias_table[i].flags & PTA_AVX512ER
5533 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5534 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5535 if (processor_alias_table[i].flags & PTA_AVX512PF
5536 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5537 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5538 if (processor_alias_table[i].flags & PTA_AVX512CD
5539 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5540 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5541 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5542 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5543 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5544 if (processor_alias_table[i].flags & PTA_CLWB
5545 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5546 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5547 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5548 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5549 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5550 if (processor_alias_table[i].flags & PTA_CLZERO
5551 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5552 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5553 if (processor_alias_table[i].flags & PTA_XSAVEC
5554 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5555 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5556 if (processor_alias_table[i].flags & PTA_XSAVES
5557 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5558 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5559 if (processor_alias_table[i].flags & PTA_AVX512DQ
5560 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5561 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5562 if (processor_alias_table[i].flags & PTA_AVX512BW
5563 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5564 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5565 if (processor_alias_table[i].flags & PTA_AVX512VL
5566 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5567 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5568 if (processor_alias_table[i].flags & PTA_MPX
5569 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5570 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5571 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5572 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5573 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5574 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5575 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5576 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5577
5578 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5579 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5580 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5581 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5582 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5583 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5584
5585 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5586 x86_prefetch_sse = true;
5587 if (processor_alias_table[i].flags & PTA_MWAITX
5588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5590 if (processor_alias_table[i].flags & PTA_PKU
5591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5593
5594 /* Don't enable x87 instructions if only
5595 general registers are allowed. */
5596 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5597 && !(opts_set->x_target_flags & MASK_80387))
5598 {
5599 if (processor_alias_table[i].flags & PTA_NO_80387)
5600 opts->x_target_flags &= ~MASK_80387;
5601 else
5602 opts->x_target_flags |= MASK_80387;
5603 }
5604 break;
5605 }
5606
5607 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5608 error ("Intel MPX does not support x32");
5609
5613 if (i == pta_size)
5614 {
5615 error (main_args_p
5616 ? "bad value (%qs) for %<-march=%> switch"
5617 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5618 opts->x_ix86_arch_string);
5619
5620 auto_vec <const char *> candidates;
5621 for (i = 0; i < pta_size; i++)
5622 if (strcmp (processor_alias_table[i].name, "generic")
5623 && strcmp (processor_alias_table[i].name, "intel")
5624 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5625 || (processor_alias_table[i].flags & PTA_64BIT)))
5626 candidates.safe_push (processor_alias_table[i].name);
5627
5628 char *s;
5629 const char *hint
5630 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5631 if (hint)
5632 inform (input_location,
5633 main_args_p
5634 ? "valid arguments to %<-march=%> switch are: "
5635 "%s; did you mean %qs?"
5636 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5637 "%s; did you mean %qs?", s, hint);
5638 else
5639 inform (input_location,
5640 main_args_p
5641 ? "valid arguments to %<-march=%> switch are: %s"
5642 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5643 s);
5644 XDELETEVEC (s);
5645 }
5646
5647 ix86_arch_mask = 1u << ix86_arch;
5648 for (i = 0; i < X86_ARCH_LAST; ++i)
5649 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5650
5651 for (i = 0; i < pta_size; i++)
5652 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5653 {
5654 ix86_schedule = processor_alias_table[i].schedule;
5655 ix86_tune = processor_alias_table[i].processor;
5656 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5657 {
5658 if (!(processor_alias_table[i].flags & PTA_64BIT))
5659 {
5660 if (ix86_tune_defaulted)
5661 {
5662 opts->x_ix86_tune_string = "x86-64";
5663 for (i = 0; i < pta_size; i++)
5664 if (! strcmp (opts->x_ix86_tune_string,
5665 processor_alias_table[i].name))
5666 break;
5667 ix86_schedule = processor_alias_table[i].schedule;
5668 ix86_tune = processor_alias_table[i].processor;
5669 }
5670 else
5671 error ("CPU you selected does not support x86-64 "
5672 "instruction set");
5673 }
5674 }
5675 /* Intel CPUs have always interpreted SSE prefetch instructions as
5676 NOPs; so, we can enable SSE prefetch instructions even when
5677 -mtune (rather than -march) points us to a processor that has them.
5678 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5679 higher processors. */
5680 if (TARGET_CMOV
5681 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5682 x86_prefetch_sse = true;
5683 break;
5684 }
5685
5686 if (ix86_tune_specified && i == pta_size)
5687 {
5688 error (main_args_p
5689 ? "bad value (%qs) for %<-mtune=%> switch"
5690 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5691 opts->x_ix86_tune_string);
5692
5693 auto_vec <const char *> candidates;
5694 for (i = 0; i < pta_size; i++)
5695 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5696 || (processor_alias_table[i].flags & PTA_64BIT))
5697 candidates.safe_push (processor_alias_table[i].name);
5698
5699 char *s;
5700 const char *hint
5701 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5702 if (hint)
5703 inform (input_location,
5704 main_args_p
5705 ? "valid arguments to %<-mtune=%> switch are: "
5706 "%s; did you mean %qs?"
5707 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5708 "%s; did you mean %qs?", s, hint);
5709 else
5710 inform (input_location,
5711 main_args_p
5712 ? "valid arguments to %<-mtune=%> switch are: %s"
5713 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5714 s);
5715 XDELETEVEC (s);
5716 }
5717
5718 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5719
5720 #ifndef USE_IX86_FRAME_POINTER
5721 #define USE_IX86_FRAME_POINTER 0
5722 #endif
5723
5724 #ifndef USE_X86_64_FRAME_POINTER
5725 #define USE_X86_64_FRAME_POINTER 0
5726 #endif
5727
5728 /* Set the default values for switches whose default depends on TARGET_64BIT
5729 in case they weren't overwritten by command line options. */
5730 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5731 {
5732 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5733 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5734 if (opts->x_flag_asynchronous_unwind_tables
5735 && !opts_set->x_flag_unwind_tables
5736 && TARGET_64BIT_MS_ABI)
5737 opts->x_flag_unwind_tables = 1;
5738 if (opts->x_flag_asynchronous_unwind_tables == 2)
5739 opts->x_flag_unwind_tables
5740 = opts->x_flag_asynchronous_unwind_tables = 1;
5741 if (opts->x_flag_pcc_struct_return == 2)
5742 opts->x_flag_pcc_struct_return = 0;
5743 }
5744 else
5745 {
5746 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5747 opts->x_flag_omit_frame_pointer
5748 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5749 if (opts->x_flag_asynchronous_unwind_tables == 2)
5750 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5751 if (opts->x_flag_pcc_struct_return == 2)
5752 {
5753 /* Intel MCU psABI specifies that -freg-struct-return should
5754 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5755 we check -miamcu so that -freg-struct-return is always
5756 turned on if -miamcu is used. */
5757 if (TARGET_IAMCU_P (opts->x_target_flags))
5758 opts->x_flag_pcc_struct_return = 0;
5759 else
5760 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5761 }
5762 }
5763
5764 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5765 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5766 so that for cold code we use size_cost even in !optimize_size compilations. */
5767 if (opts->x_optimize_size)
5768 ix86_cost = &ix86_size_cost;
5769 else
5770 ix86_cost = ix86_tune_cost;
5771
5772 /* Arrange to set up i386_stack_locals for all functions. */
5773 init_machine_status = ix86_init_machine_status;
5774
5775 /* Validate -mregparm= value. */
5776 if (opts_set->x_ix86_regparm)
5777 {
5778 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5779 warning (0, "-mregparm is ignored in 64-bit mode");
5780 else if (TARGET_IAMCU_P (opts->x_target_flags))
5781 warning (0, "-mregparm is ignored for Intel MCU psABI");
5782 if (opts->x_ix86_regparm > REGPARM_MAX)
5783 {
5784 error ("-mregparm=%d is not between 0 and %d",
5785 opts->x_ix86_regparm, REGPARM_MAX);
5786 opts->x_ix86_regparm = 0;
5787 }
5788 }
5789 if (TARGET_IAMCU_P (opts->x_target_flags)
5790 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5791 opts->x_ix86_regparm = REGPARM_MAX;
5792
5793 /* Default align_* from the processor table. */
5794 ix86_default_align (opts);
5795
5796 /* Provide default for -mbranch-cost= value. */
5797 if (!opts_set->x_ix86_branch_cost)
5798 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5799
5800 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5801 {
5802 opts->x_target_flags
5803 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5804
5805 /* Enable by default the SSE and MMX builtins. Do allow the user to
5806 explicitly disable any of these. In particular, disabling SSE and
5807 MMX for kernel code is extremely useful. */
5808 if (!ix86_arch_specified)
5809 opts->x_ix86_isa_flags
5810 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5811 | TARGET_SUBTARGET64_ISA_DEFAULT)
5812 & ~opts->x_ix86_isa_flags_explicit);
5813
5814 if (TARGET_RTD_P (opts->x_target_flags))
5815 warning (0,
5816 main_args_p ? "%<-mrtd%> is ignored in 64bit mode"
5817 : "%<target(\"rtd\")%> is ignored in 64bit mode");
5818 }
5819 else
5820 {
5821 opts->x_target_flags
5822 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5823
5824 if (!ix86_arch_specified)
5825 opts->x_ix86_isa_flags
5826 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5827
5828 /* The i386 ABI does not specify a red zone. It still makes sense to use one
5829 when the programmer takes care to keep the stack from being destroyed. */
5830 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5831 opts->x_target_flags |= MASK_NO_RED_ZONE;
5832 }
5833
5834 /* Keep nonleaf frame pointers. */
5835 if (opts->x_flag_omit_frame_pointer)
5836 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5837 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5838 opts->x_flag_omit_frame_pointer = 1;
5839
5840 /* If we're doing fast math, we don't care about comparison order
5841 wrt NaNs. This lets us use a shorter comparison sequence. */
5842 if (opts->x_flag_finite_math_only)
5843 opts->x_target_flags &= ~MASK_IEEE_FP;
5844
5845 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5846 since the insns won't need emulation. */
5847 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5848 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5849
5850 /* Likewise, if the target doesn't have a 387, or we've specified
5851 software floating point, don't use 387 inline intrinsics. */
5852 if (!TARGET_80387_P (opts->x_target_flags))
5853 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5854
5855 /* Turn on MMX builtins for -msse. */
5856 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5857 opts->x_ix86_isa_flags
5858 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5859
5860 /* Enable SSE prefetch. */
5861 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5862 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5863 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5864 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5865 x86_prefetch_sse = true;
5866
5867 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5868 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5869 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5870 opts->x_ix86_isa_flags
5871 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5872
5873 /* Enable lzcnt instruction for -mabm. */
5874 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
5875 opts->x_ix86_isa_flags
5876 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
5877
5878 /* Validate -mpreferred-stack-boundary= value or default it to
5879 PREFERRED_STACK_BOUNDARY_DEFAULT. */
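/* The argument is the log2 of the boundary in bytes; for example,
   -mpreferred-stack-boundary=4 gives (1 << 4) * BITS_PER_UNIT = 128 bits,
   i.e. a 16-byte boundary.  */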
5880 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5881 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5882 {
5883 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5884 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5885 int max = (TARGET_SEH ? 4 : 12);
5886
5887 if (opts->x_ix86_preferred_stack_boundary_arg < min
5888 || opts->x_ix86_preferred_stack_boundary_arg > max)
5889 {
5890 if (min == max)
5891 error ("-mpreferred-stack-boundary is not supported "
5892 "for this target");
5893 else
5894 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5895 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5896 }
5897 else
5898 ix86_preferred_stack_boundary
5899 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
5900 }
5901
5902 /* Set the default value for -mstackrealign. */
5903 if (opts->x_ix86_force_align_arg_pointer == -1)
5904 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5905
5906 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5907
5908 /* Validate -mincoming-stack-boundary= value or default it to
5909 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5910 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5911 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5912 {
5913 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5914
5915 if (opts->x_ix86_incoming_stack_boundary_arg < min
5916 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5917 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5918 opts->x_ix86_incoming_stack_boundary_arg, min);
5919 else
5920 {
5921 ix86_user_incoming_stack_boundary
5922 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5923 ix86_incoming_stack_boundary
5924 = ix86_user_incoming_stack_boundary;
5925 }
5926 }
5927
5928 #ifndef NO_PROFILE_COUNTERS
5929 if (flag_nop_mcount)
5930 error ("-mnop-mcount is not compatible with this target");
5931 #endif
5932 if (flag_nop_mcount && flag_pic)
5933 error ("-mnop-mcount is not implemented for -fPIC");
5934
5935 /* Accept -msseregparm only if at least SSE support is enabled. */
5936 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5937 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5938 error (main_args_p
5939 ? "%<-msseregparm%> used without SSE enabled"
5940 : "%<target(\"sseregparm\")%> used without SSE enabled");
5941
5942 if (opts_set->x_ix86_fpmath)
5943 {
5944 if (opts->x_ix86_fpmath & FPMATH_SSE)
5945 {
5946 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5947 {
5948 if (TARGET_80387_P (opts->x_target_flags))
5949 {
5950 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5951 opts->x_ix86_fpmath = FPMATH_387;
5952 }
5953 }
5954 else if ((opts->x_ix86_fpmath & FPMATH_387)
5955 && !TARGET_80387_P (opts->x_target_flags))
5956 {
5957 warning (0, "387 instruction set disabled, using SSE arithmetics");
5958 opts->x_ix86_fpmath = FPMATH_SSE;
5959 }
5960 }
5961 }
5962 /* For all chips supporting SSE2, -mfpmath=sse performs better than
5963 -mfpmath=387. The latter is nevertheless the default on many targets,
5964 since the extra 80-bit precision of temporaries is considered part of the ABI.
5965 Overwrite the default at least for -ffast-math.
5966 TODO: -mfpmath=both seems to produce equally performing code with
5967 slightly smaller binaries. It is however not clear if register allocation
5968 is ready for this setting.
5969 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
5970 codegen. We may switch to 387 with -ffast-math for size-optimized
5971 functions. */
5972 else if (fast_math_flags_set_p (&global_options)
5973 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
5974 opts->x_ix86_fpmath = FPMATH_SSE;
5975 else
5976 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
5977
5978 /* Use external vectorized library in vectorizing intrinsics. */
5979 if (opts_set->x_ix86_veclibabi_type)
5980 switch (opts->x_ix86_veclibabi_type)
5981 {
5982 case ix86_veclibabi_type_svml:
5983 ix86_veclib_handler = ix86_veclibabi_svml;
5984 break;
5985
5986 case ix86_veclibabi_type_acml:
5987 ix86_veclib_handler = ix86_veclibabi_acml;
5988 break;
5989
5990 default:
5991 gcc_unreachable ();
5992 }
5993
5994 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
5995 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5996 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5997
5998 /* If stack probes are required, the space used for large function
5999 arguments on the stack must also be probed, so enable
6000 -maccumulate-outgoing-args so this happens in the prologue. */
6001 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6002 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6003 {
6004 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6005 warning (0,
6006 main_args_p
6007 ? "stack probing requires %<-maccumulate-outgoing-args%> "
6008 "for correctness"
6009 : "stack probing requires "
6010 "%<target(\"accumulate-outgoing-args\")%> for correctness");
6011 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6012 }
6013
6014 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6015 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6016 if (fixed_regs[BP_REG]
6017 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6018 {
6019 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6020 warning (0,
6021 main_args_p
6022 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
6023 : "fixed ebp register requires "
6024 "%<target(\"accumulate-outgoing-args\")%>");
6025 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6026 }
6027
6028 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6029 {
6030 char *p;
6031 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6032 p = strchr (internal_label_prefix, 'X');
6033 internal_label_prefix_len = p - internal_label_prefix;
6034 *p = '\0';
6035 }
6036
6037 /* When a scheduling description is not available, disable the scheduler passes
6038 so they won't slow down the compilation and make x87 code slower. */
6039 if (!TARGET_SCHEDULE)
6040 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6041
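/* Seed the prefetch and cache size --param defaults from the tuning cost
   table; maybe_set_param_value only overrides values the user has not set
   explicitly.  */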
6042 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6043 ix86_tune_cost->simultaneous_prefetches,
6044 opts->x_param_values,
6045 opts_set->x_param_values);
6046 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6047 ix86_tune_cost->prefetch_block,
6048 opts->x_param_values,
6049 opts_set->x_param_values);
6050 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6051 ix86_tune_cost->l1_cache_size,
6052 opts->x_param_values,
6053 opts_set->x_param_values);
6054 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6055 ix86_tune_cost->l2_cache_size,
6056 opts->x_param_values,
6057 opts_set->x_param_values);
6058
6059 /* Restrict number of if-converted SET insns to 1. */
6060 if (TARGET_ONE_IF_CONV_INSN)
6061 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
6062 1,
6063 opts->x_param_values,
6064 opts_set->x_param_values);
6065
6066 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6067 if (opts->x_flag_prefetch_loop_arrays < 0
6068 && HAVE_prefetch
6069 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6070 && !opts->x_optimize_size
6071 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6072 opts->x_flag_prefetch_loop_arrays = 1;
6073
6074 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6075 can be optimized to ap = __builtin_next_arg (0). */
6076 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6077 targetm.expand_builtin_va_start = NULL;
6078
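/* Cache Pmode-dependent insn generator functions up front so later code
   can emit the right DImode or SImode variants without repeating these
   checks.  */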
6079 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6080 {
6081 ix86_gen_leave = gen_leave_rex64;
6082 if (Pmode == DImode)
6083 {
6084 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6085 ix86_gen_tls_local_dynamic_base_64
6086 = gen_tls_local_dynamic_base_64_di;
6087 }
6088 else
6089 {
6090 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6091 ix86_gen_tls_local_dynamic_base_64
6092 = gen_tls_local_dynamic_base_64_si;
6093 }
6094 }
6095 else
6096 ix86_gen_leave = gen_leave;
6097
6098 if (Pmode == DImode)
6099 {
6100 ix86_gen_add3 = gen_adddi3;
6101 ix86_gen_sub3 = gen_subdi3;
6102 ix86_gen_sub3_carry = gen_subdi3_carry;
6103 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6104 ix86_gen_andsp = gen_anddi3;
6105 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6106 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6107 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6108 ix86_gen_monitor = gen_sse3_monitor_di;
6109 ix86_gen_monitorx = gen_monitorx_di;
6110 ix86_gen_clzero = gen_clzero_di;
6111 }
6112 else
6113 {
6114 ix86_gen_add3 = gen_addsi3;
6115 ix86_gen_sub3 = gen_subsi3;
6116 ix86_gen_sub3_carry = gen_subsi3_carry;
6117 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6118 ix86_gen_andsp = gen_andsi3;
6119 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6120 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6121 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6122 ix86_gen_monitor = gen_sse3_monitor_si;
6123 ix86_gen_monitorx = gen_monitorx_si;
6124 ix86_gen_clzero = gen_clzero_si;
6125 }
6126
6127 #ifdef USE_IX86_CLD
6128 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6129 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6130 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6131 #endif
6132
6133 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6134 {
6135 if (opts->x_flag_fentry > 0)
6136 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6137 "with -fpic");
6138 opts->x_flag_fentry = 0;
6139 }
6140 else if (TARGET_SEH)
6141 {
6142 if (opts->x_flag_fentry == 0)
6143 sorry ("-mno-fentry isn%'t compatible with SEH");
6144 opts->x_flag_fentry = 1;
6145 }
6146 else if (opts->x_flag_fentry < 0)
6147 {
6148 #if defined(PROFILE_BEFORE_PROLOGUE)
6149 opts->x_flag_fentry = 1;
6150 #else
6151 opts->x_flag_fentry = 0;
6152 #endif
6153 }
6154
6155 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6156 opts->x_target_flags |= MASK_VZEROUPPER;
6157 if (!(opts_set->x_target_flags & MASK_STV))
6158 opts->x_target_flags |= MASK_STV;
6159 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6160 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6161 stack realignment is an extra cost the pass doesn't take into
6162 account and the pass can't realign the stack. */
6163 if (ix86_preferred_stack_boundary < 128
6164 || ix86_incoming_stack_boundary < 128
6165 || opts->x_ix86_force_align_arg_pointer)
6166 opts->x_target_flags &= ~MASK_STV;
6167 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6168 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6169 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6170 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6171 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6172 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6173 /* Enable 128-bit AVX instruction generation
6174 for the auto-vectorizer. */
6175 if (TARGET_AVX128_OPTIMAL
6176 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6177 opts->x_target_flags |= MASK_PREFER_AVX128;
6178
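/* Parse -mrecip=<comma-separated list>.  Each item names one of the
   recip_options entries above, optionally prefixed with '!' to disable it;
   "default" stands for all of them.  For example, -mrecip=all,!sqrt enables
   every reciprocal approximation except scalar square root.  */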
6179 if (opts->x_ix86_recip_name)
6180 {
6181 char *p = ASTRDUP (opts->x_ix86_recip_name);
6182 char *q;
6183 unsigned int mask, i;
6184 bool invert;
6185
6186 while ((q = strtok (p, ",")) != NULL)
6187 {
6188 p = NULL;
6189 if (*q == '!')
6190 {
6191 invert = true;
6192 q++;
6193 }
6194 else
6195 invert = false;
6196
6197 if (!strcmp (q, "default"))
6198 mask = RECIP_MASK_ALL;
6199 else
6200 {
6201 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6202 if (!strcmp (q, recip_options[i].string))
6203 {
6204 mask = recip_options[i].mask;
6205 break;
6206 }
6207
6208 if (i == ARRAY_SIZE (recip_options))
6209 {
6210 error ("unknown option for -mrecip=%s", q);
6211 invert = false;
6212 mask = RECIP_MASK_NONE;
6213 }
6214 }
6215
6216 opts->x_recip_mask_explicit |= mask;
6217 if (invert)
6218 opts->x_recip_mask &= ~mask;
6219 else
6220 opts->x_recip_mask |= mask;
6221 }
6222 }
6223
6224 if (TARGET_RECIP_P (opts->x_target_flags))
6225 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6226 else if (opts_set->x_target_flags & MASK_RECIP)
6227 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6228
6229 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6230 for 64-bit Bionic. Also default long double to 64-bit for Intel
6231 MCU psABI. */
6232 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6233 && !(opts_set->x_target_flags
6234 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6235 opts->x_target_flags |= (TARGET_64BIT
6236 ? MASK_LONG_DOUBLE_128
6237 : MASK_LONG_DOUBLE_64);
6238
6239 /* Only one of them can be active. */
6240 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6241 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6242
6243 /* Save the initial options in case the user does function specific
6244 options. */
6245 if (main_args_p)
6246 target_option_default_node = target_option_current_node
6247 = build_target_option_node (opts);
6248
6249 /* Handle stack protector */
6250 if (!opts_set->x_ix86_stack_protector_guard)
6251 opts->x_ix86_stack_protector_guard
6252 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6253
6254 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6255 if (opts->x_ix86_tune_memcpy_strategy)
6256 {
6257 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6258 ix86_parse_stringop_strategy_string (str, false);
6259 free (str);
6260 }
6261
6262 if (opts->x_ix86_tune_memset_strategy)
6263 {
6264 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6265 ix86_parse_stringop_strategy_string (str, true);
6266 free (str);
6267 }
6268
6269 return true;
6270 }
6271
6272 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6273
6274 static void
6275 ix86_option_override (void)
6276 {
6277 ix86_option_override_internal (true, &global_options, &global_options_set);
6278 }
6279
6280 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6281 static char *
6282 ix86_offload_options (void)
6283 {
6284 if (TARGET_LP64)
6285 return xstrdup ("-foffload-abi=lp64");
6286 return xstrdup ("-foffload-abi=ilp32");
6287 }
6288
6289 /* Update register usage after having seen the compiler flags. */
6290
6291 static void
6292 ix86_conditional_register_usage (void)
6293 {
6294 int i, c_mask;
6295
6296 /* If there are no caller-saved registers, preserve all registers
6297 except fixed_regs and the registers used for the function return value,
6298 since aggregate_value_p checks call_used_regs[regno] on the return
6299 value. */
6300 if (cfun && cfun->machine->no_caller_saved_registers)
6301 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6302 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6303 call_used_regs[i] = 0;
6304
6305 /* For 32-bit targets, squash the REX registers. */
6306 if (! TARGET_64BIT)
6307 {
6308 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6309 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6310 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6311 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6312 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6313 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6314 }
6315
6316 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6317 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6318
6319 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6320
6321 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6322 {
6323 /* Set/reset conditionally defined registers from
6324 CALL_USED_REGISTERS initializer. */
6325 if (call_used_regs[i] > 1)
6326 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6327
6328 /* Calculate registers of CLOBBERED_REGS register set
6329 as call used registers from GENERAL_REGS register set. */
6330 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6331 && call_used_regs[i])
6332 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6333 }
6334
6335 /* If MMX is disabled, squash the registers. */
6336 if (! TARGET_MMX)
6337 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6338 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6339 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6340
6341 /* If SSE is disabled, squash the registers. */
6342 if (! TARGET_SSE)
6343 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6344 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6345 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6346
6347 /* If the FPU is disabled, squash the registers. */
6348 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6349 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6350 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6351 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6352
6353 /* If AVX512F is disabled, squash the registers. */
6354 if (! TARGET_AVX512F)
6355 {
6356 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6357 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6358
6359 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6360 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6361 }
6362
6363 /* If MPX is disabled, squash the registers. */
6364 if (! TARGET_MPX)
6365 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6366 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6367 }
6368
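/* For illustration, a user-level sketch (example names are invented) of the
   no_caller_saved_registers attribute whose handling above clears
   call_used_regs[] for every non-fixed register that does not carry a
   return value:

     extern void log_event (int);

     __attribute__((no_caller_saved_registers))
     void trace_hook (int id)
     {
       log_event (id);
     }

   For such a function the register allocator must save and restore any
   register it touches, since none are treated as call-clobbered.  */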
6369 \f
6370 /* Save the current options */
6371
6372 static void
6373 ix86_function_specific_save (struct cl_target_option *ptr,
6374 struct gcc_options *opts)
6375 {
6376 ptr->arch = ix86_arch;
6377 ptr->schedule = ix86_schedule;
6378 ptr->prefetch_sse = x86_prefetch_sse;
6379 ptr->tune = ix86_tune;
6380 ptr->branch_cost = ix86_branch_cost;
6381 ptr->tune_defaulted = ix86_tune_defaulted;
6382 ptr->arch_specified = ix86_arch_specified;
6383 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6384 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6385 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6386 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6387 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6388 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6389 ptr->x_ix86_abi = opts->x_ix86_abi;
6390 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6391 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6392 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6393 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6394 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6395 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6396 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6397 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6398 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6399 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6400 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6401 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6402 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6403 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6404 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6405 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6406 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6407 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6408 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6409 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6410
6411 /* The fields are char but the variables are not; make sure the
6412 values fit in the fields. */
6413 gcc_assert (ptr->arch == ix86_arch);
6414 gcc_assert (ptr->schedule == ix86_schedule);
6415 gcc_assert (ptr->tune == ix86_tune);
6416 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6417 }
6418
6419 /* Restore the current options */
6420
6421 static void
6422 ix86_function_specific_restore (struct gcc_options *opts,
6423 struct cl_target_option *ptr)
6424 {
6425 enum processor_type old_tune = ix86_tune;
6426 enum processor_type old_arch = ix86_arch;
6427 unsigned int ix86_arch_mask;
6428 int i;
6429
6430 /* We don't change -fPIC. */
6431 opts->x_flag_pic = flag_pic;
6432
6433 ix86_arch = (enum processor_type) ptr->arch;
6434 ix86_schedule = (enum attr_cpu) ptr->schedule;
6435 ix86_tune = (enum processor_type) ptr->tune;
6436 x86_prefetch_sse = ptr->prefetch_sse;
6437 opts->x_ix86_branch_cost = ptr->branch_cost;
6438 ix86_tune_defaulted = ptr->tune_defaulted;
6439 ix86_arch_specified = ptr->arch_specified;
6440 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6441 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6442 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6443 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6444 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6445 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6446 opts->x_ix86_abi = ptr->x_ix86_abi;
6447 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6448 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6449 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6450 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6451 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6452 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6453 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6454 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6455 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6456 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6457 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6458 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6459 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6460 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6461 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6462 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6463 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6464 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6465 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6466 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6467 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6468 /* TODO: ix86_cost should be chosen at instruction or function granularity
6469 so that for cold code we use size_cost even in !optimize_size compilation. */
6470 if (opts->x_optimize_size)
6471 ix86_cost = &ix86_size_cost;
6472 else
6473 ix86_cost = ix86_tune_cost;
6474
6475 /* Recreate the arch feature tests if the arch changed */
6476 if (old_arch != ix86_arch)
6477 {
6478 ix86_arch_mask = 1u << ix86_arch;
6479 for (i = 0; i < X86_ARCH_LAST; ++i)
6480 ix86_arch_features[i]
6481 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6482 }
6483
6484 /* Recreate the tune optimization tests */
6485 if (old_tune != ix86_tune)
6486 set_ix86_tune_features (ix86_tune, false);
6487 }
6488
6489 /* Adjust target options after streaming them in. This is mainly about
6490 reconciling them with global options. */
6491
6492 static void
6493 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6494 {
6495 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6496 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6497 for PIC, or error out. */
6498 if (flag_pic)
6499 switch (ptr->x_ix86_cmodel)
6500 {
6501 case CM_SMALL:
6502 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6503 break;
6504
6505 case CM_MEDIUM:
6506 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6507 break;
6508
6509 case CM_LARGE:
6510 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6511 break;
6512
6513 case CM_KERNEL:
6514 error ("code model %s does not support PIC mode", "kernel");
6515 break;
6516
6517 default:
6518 break;
6519 }
6520 else
6521 switch (ptr->x_ix86_cmodel)
6522 {
6523 case CM_SMALL_PIC:
6524 ptr->x_ix86_cmodel = CM_SMALL;
6525 break;
6526
6527 case CM_MEDIUM_PIC:
6528 ptr->x_ix86_cmodel = CM_MEDIUM;
6529 break;
6530
6531 case CM_LARGE_PIC:
6532 ptr->x_ix86_cmodel = CM_LARGE;
6533 break;
6534
6535 default:
6536 break;
6537 }
6538 }
6539
6540 /* Print the current options */
6541
6542 static void
6543 ix86_function_specific_print (FILE *file, int indent,
6544 struct cl_target_option *ptr)
6545 {
6546 char *target_string
6547 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6548 ptr->x_target_flags, ptr->x_ix86_target_flags,
6549 NULL, NULL, ptr->x_ix86_fpmath, false);
6550
6551 gcc_assert (ptr->arch < PROCESSOR_max);
6552 fprintf (file, "%*sarch = %d (%s)\n",
6553 indent, "",
6554 ptr->arch, processor_target_table[ptr->arch].name);
6555
6556 gcc_assert (ptr->tune < PROCESSOR_max);
6557 fprintf (file, "%*stune = %d (%s)\n",
6558 indent, "",
6559 ptr->tune, processor_target_table[ptr->tune].name);
6560
6561 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6562
6563 if (target_string)
6564 {
6565 fprintf (file, "%*s%s\n", indent, "", target_string);
6566 free (target_string);
6567 }
6568 }
6569
6570 \f
6571 /* Inner function to process the attribute((target(...))), take an argument and
6572 set the current options from the argument. If we have a list, recursively go
6573 over the list. */
6574
6575 static bool
6576 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6577 struct gcc_options *opts,
6578 struct gcc_options *opts_set,
6579 struct gcc_options *enum_opts_set)
6580 {
6581 char *next_optstr;
6582 bool ret = true;
6583
6584 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6585 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6586 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6587 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6588 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6589
6590 enum ix86_opt_type
6591 {
6592 ix86_opt_unknown,
6593 ix86_opt_yes,
6594 ix86_opt_no,
6595 ix86_opt_str,
6596 ix86_opt_enum,
6597 ix86_opt_isa
6598 };
6599
6600 static const struct
6601 {
6602 const char *string;
6603 size_t len;
6604 enum ix86_opt_type type;
6605 int opt;
6606 int mask;
6607 } attrs[] = {
6608 /* isa options */
6609 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6610 IX86_ATTR_ISA ("abm", OPT_mabm),
6611 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6612 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6613 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6614 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6615 IX86_ATTR_ISA ("aes", OPT_maes),
6616 IX86_ATTR_ISA ("sha", OPT_msha),
6617 IX86_ATTR_ISA ("avx", OPT_mavx),
6618 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6619 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6620 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6621 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6622 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6623 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6624 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6625 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6626 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
6627 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
6628 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6629 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6630 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6631 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6632 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6633 IX86_ATTR_ISA ("sse", OPT_msse),
6634 IX86_ATTR_ISA ("sse2", OPT_msse2),
6635 IX86_ATTR_ISA ("sse3", OPT_msse3),
6636 IX86_ATTR_ISA ("sse4", OPT_msse4),
6637 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6638 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6639 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6640 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6641 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6642 IX86_ATTR_ISA ("fma", OPT_mfma),
6643 IX86_ATTR_ISA ("xop", OPT_mxop),
6644 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6645 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6646 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6647 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6648 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6649 IX86_ATTR_ISA ("hle", OPT_mhle),
6650 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6651 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6652 IX86_ATTR_ISA ("adx", OPT_madx),
6653 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6654 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6655 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6656 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6657 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6658 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6659 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6660 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6661 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6662 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6663 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6664 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6665 IX86_ATTR_ISA ("pku", OPT_mpku),
6666
6667 /* enum options */
6668 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6669
6670 /* string options */
6671 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6672 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6673
6674 /* flag options */
6675 IX86_ATTR_YES ("cld",
6676 OPT_mcld,
6677 MASK_CLD),
6678
6679 IX86_ATTR_NO ("fancy-math-387",
6680 OPT_mfancy_math_387,
6681 MASK_NO_FANCY_MATH_387),
6682
6683 IX86_ATTR_YES ("ieee-fp",
6684 OPT_mieee_fp,
6685 MASK_IEEE_FP),
6686
6687 IX86_ATTR_YES ("inline-all-stringops",
6688 OPT_minline_all_stringops,
6689 MASK_INLINE_ALL_STRINGOPS),
6690
6691 IX86_ATTR_YES ("inline-stringops-dynamically",
6692 OPT_minline_stringops_dynamically,
6693 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6694
6695 IX86_ATTR_NO ("align-stringops",
6696 OPT_mno_align_stringops,
6697 MASK_NO_ALIGN_STRINGOPS),
6698
6699 IX86_ATTR_YES ("recip",
6700 OPT_mrecip,
6701 MASK_RECIP),
6702
6703 };
6704
6705 /* If this is a list, recurse to get the options. */
6706 if (TREE_CODE (args) == TREE_LIST)
6707 {
6708 bool ret = true;
6709
6710 for (; args; args = TREE_CHAIN (args))
6711 if (TREE_VALUE (args)
6712 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6713 p_strings, opts, opts_set,
6714 enum_opts_set))
6715 ret = false;
6716
6717 return ret;
6718 }
6719
6720 else if (TREE_CODE (args) != STRING_CST)
6721 {
6722 error ("attribute %<target%> argument not a string");
6723 return false;
6724 }
6725
6726 /* Handle multiple arguments separated by commas. */
6727 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6728
6729 while (next_optstr && *next_optstr != '\0')
6730 {
6731 char *p = next_optstr;
6732 char *orig_p = p;
6733 char *comma = strchr (next_optstr, ',');
6734 const char *opt_string;
6735 size_t len, opt_len;
6736 int opt;
6737 bool opt_set_p;
6738 char ch;
6739 unsigned i;
6740 enum ix86_opt_type type = ix86_opt_unknown;
6741 int mask = 0;
6742
6743 if (comma)
6744 {
6745 *comma = '\0';
6746 len = comma - next_optstr;
6747 next_optstr = comma + 1;
6748 }
6749 else
6750 {
6751 len = strlen (p);
6752 next_optstr = NULL;
6753 }
6754
6755 /* Recognize no-xxx. */
6756 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6757 {
6758 opt_set_p = false;
6759 p += 3;
6760 len -= 3;
6761 }
6762 else
6763 opt_set_p = true;
6764
6765 /* Find the option. */
6766 ch = *p;
6767 opt = N_OPTS;
6768 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6769 {
6770 type = attrs[i].type;
6771 opt_len = attrs[i].len;
6772 if (ch == attrs[i].string[0]
6773 && ((type != ix86_opt_str && type != ix86_opt_enum)
6774 ? len == opt_len
6775 : len > opt_len)
6776 && memcmp (p, attrs[i].string, opt_len) == 0)
6777 {
6778 opt = attrs[i].opt;
6779 mask = attrs[i].mask;
6780 opt_string = attrs[i].string;
6781 break;
6782 }
6783 }
6784
6785 /* Process the option. */
6786 if (opt == N_OPTS)
6787 {
6788 error ("attribute(target(\"%s\")) is unknown", orig_p);
6789 ret = false;
6790 }
6791
6792 else if (type == ix86_opt_isa)
6793 {
6794 struct cl_decoded_option decoded;
6795
6796 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6797 ix86_handle_option (opts, opts_set,
6798 &decoded, input_location);
6799 }
6800
6801 else if (type == ix86_opt_yes || type == ix86_opt_no)
6802 {
6803 if (type == ix86_opt_no)
6804 opt_set_p = !opt_set_p;
6805
6806 if (opt_set_p)
6807 opts->x_target_flags |= mask;
6808 else
6809 opts->x_target_flags &= ~mask;
6810 }
6811
6812 else if (type == ix86_opt_str)
6813 {
6814 if (p_strings[opt])
6815 {
6816 error ("option(\"%s\") was already specified", opt_string);
6817 ret = false;
6818 }
6819 else
6820 p_strings[opt] = xstrdup (p + opt_len);
6821 }
6822
6823 else if (type == ix86_opt_enum)
6824 {
6825 bool arg_ok;
6826 int value;
6827
6828 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6829 if (arg_ok)
6830 set_option (opts, enum_opts_set, opt, value,
6831 p + opt_len, DK_UNSPECIFIED, input_location,
6832 global_dc);
6833 else
6834 {
6835 error ("attribute(target(\"%s\")) is unknown", orig_p);
6836 ret = false;
6837 }
6838 }
6839
6840 else
6841 gcc_unreachable ();
6842 }
6843
6844 return ret;
6845 }
6846
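/* For illustration, a sketch of the syntax the parser above accepts: a
   comma-separated list mixing ISA flags (optionally prefixed with "no-"),
   string options (arch=/tune=) and enum options (fpmath=), e.g.

     __attribute__((target("no-avx,sse4.2,arch=core2,tune=generic,fpmath=sse")))
     int dot (const int *a, const int *b, int n);

   Each element is matched against the attrs[] table; an element that
   matches nothing is reported with the attribute(target("...")) error
   above.  The function name here is an invented example.  */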
6847 /* Release allocated strings. */
6848 static void
6849 release_options_strings (char **option_strings)
6850 {
6851 /* Free up memory allocated to hold the strings */
6852 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6853 free (option_strings[i]);
6854 }
6855
6856 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6857
6858 tree
6859 ix86_valid_target_attribute_tree (tree args,
6860 struct gcc_options *opts,
6861 struct gcc_options *opts_set)
6862 {
6863 const char *orig_arch_string = opts->x_ix86_arch_string;
6864 const char *orig_tune_string = opts->x_ix86_tune_string;
6865 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6866 int orig_tune_defaulted = ix86_tune_defaulted;
6867 int orig_arch_specified = ix86_arch_specified;
6868 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6869 tree t = NULL_TREE;
6870 struct cl_target_option *def
6871 = TREE_TARGET_OPTION (target_option_default_node);
6872 struct gcc_options enum_opts_set;
6873
6874 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6875
6876 /* Process each of the options on the chain. */
6877 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6878 opts_set, &enum_opts_set))
6879 return error_mark_node;
6880
6881 /* If the changed options are different from the default, rerun
6882 ix86_option_override_internal, and then save the options away.
6883 The string options are attribute options, and will be undone
6884 when we copy the save structure. */
6885 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6886 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
6887 || opts->x_target_flags != def->x_target_flags
6888 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6889 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6890 || enum_opts_set.x_ix86_fpmath)
6891 {
6892 /* If we are using the default tune= or arch=, undo the string assigned,
6893 and use the default. */
6894 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6895 {
6896 opts->x_ix86_arch_string
6897 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6898
6899 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6900 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6901 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6902 | OPTION_MASK_ABI_64
6903 | OPTION_MASK_ABI_X32
6904 | OPTION_MASK_CODE16);
6905 opts->x_ix86_isa_flags2 = 0;
6906 }
6907 else if (!orig_arch_specified)
6908 opts->x_ix86_arch_string = NULL;
6909
6910 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6911 opts->x_ix86_tune_string
6912 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6913 else if (orig_tune_defaulted)
6914 opts->x_ix86_tune_string = NULL;
6915
6916 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6917 if (enum_opts_set.x_ix86_fpmath)
6918 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6919 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6920 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6921 {
6922 if (TARGET_80387_P (opts->x_target_flags))
6923 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6924 | FPMATH_387);
6925 else
6926 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6927 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6928 }
6929
6930 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6931 bool r = ix86_option_override_internal (false, opts, opts_set);
6932 if (!r)
6933 {
6934 release_options_strings (option_strings);
6935 return error_mark_node;
6936 }
6937
6938 /* Add any builtin functions with the new isa if any. */
6939 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
6940
6941 /* Save the current options unless we are validating options for
6942 #pragma. */
6943 t = build_target_option_node (opts);
6944
6945 opts->x_ix86_arch_string = orig_arch_string;
6946 opts->x_ix86_tune_string = orig_tune_string;
6947 opts_set->x_ix86_fpmath = orig_fpmath_set;
6948
6949 release_options_strings (option_strings);
6950 }
6951
6952 return t;
6953 }
6954
6955 /* Hook to validate attribute((target("string"))). */
6956
6957 static bool
6958 ix86_valid_target_attribute_p (tree fndecl,
6959 tree ARG_UNUSED (name),
6960 tree args,
6961 int ARG_UNUSED (flags))
6962 {
6963 struct gcc_options func_options;
6964 tree new_target, new_optimize;
6965 bool ret = true;
6966
6967 /* attribute((target("default"))) does nothing, beyond
6968 affecting multi-versioning. */
6969 if (TREE_VALUE (args)
6970 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
6971 && TREE_CHAIN (args) == NULL_TREE
6972 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
6973 return true;
6974
6975 tree old_optimize = build_optimization_node (&global_options);
6976
6977 /* Get the optimization options of the current function. */
6978 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6979
6980 if (!func_optimize)
6981 func_optimize = old_optimize;
6982
6983 /* Init func_options. */
6984 memset (&func_options, 0, sizeof (func_options));
6985 init_options_struct (&func_options, NULL);
6986 lang_hooks.init_options_struct (&func_options);
6987
6988 cl_optimization_restore (&func_options,
6989 TREE_OPTIMIZATION (func_optimize));
6990
6991 /* Initialize func_options to the default before its target options can
6992 be set. */
6993 cl_target_option_restore (&func_options,
6994 TREE_TARGET_OPTION (target_option_default_node));
6995
6996 new_target = ix86_valid_target_attribute_tree (args, &func_options,
6997 &global_options_set);
6998
6999 new_optimize = build_optimization_node (&func_options);
7000
7001 if (new_target == error_mark_node)
7002 ret = false;
7003
7004 else if (fndecl && new_target)
7005 {
7006 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7007
7008 if (old_optimize != new_optimize)
7009 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7010 }
7011
7012 finalize_options_struct (&func_options);
7013
7014 return ret;
7015 }
7016
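/* For illustration (function multi-versioning is a C++ front-end feature
   layered on top of this hook), target("default") is accepted by the early
   return above without building a target-option node, so a default version
   can sit next to specialized ones:

     __attribute__((target("default"))) int popcnt_impl (unsigned x);
     __attribute__((target("popcnt")))  int popcnt_impl (unsigned x);

   Only the non-default versions go through ix86_valid_target_attribute_tree;
   the dispatcher then picks a version at run time.  The function name is an
   invented example.  */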
7017 \f
7018 /* Hook to determine if one function can safely inline another. */
7019
7020 static bool
7021 ix86_can_inline_p (tree caller, tree callee)
7022 {
7023 bool ret = false;
7024 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7025 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7026
7027 /* If callee has no option attributes, then it is ok to inline. */
7028 if (!callee_tree)
7029 ret = true;
7030
7031 /* If the caller has no option attributes but the callee does, then it is
7032 not ok to inline. */
7033 else if (!caller_tree)
7034 ret = false;
7035
7036 else
7037 {
7038 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7039 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7040
7041 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
7042 function can inline an SSE2 function, but an SSE2 function can't inline
7043 an SSE4 function. */
7044 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7045 != callee_opts->x_ix86_isa_flags)
7046 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7047 != callee_opts->x_ix86_isa_flags2))
7048 ret = false;
7049
7050 /* See if we have the same non-isa options. */
7051 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7052 ret = false;
7053
7054 /* See if arch, tune, etc. are the same. */
7055 else if (caller_opts->arch != callee_opts->arch)
7056 ret = false;
7057
7058 else if (caller_opts->tune != callee_opts->tune)
7059 ret = false;
7060
7061 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7062 ret = false;
7063
7064 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7065 ret = false;
7066
7067 else
7068 ret = true;
7069 }
7070
7071 return ret;
7072 }
7073
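/* For illustration, a sketch of the subset rule enforced above (names are
   invented examples): the callee's ISA flags must all be present in the
   caller's.

     __attribute__((target("avx2")))       static inline int f (int x) { return x; }
     __attribute__((target("avx2,fma")))   int g (int x) { return f (x); }  // may inline
     int h (int x) { return f (x); }                                        // keeps the call

   g may inline f, since its flags are a superset; h, built without -mavx2,
   fails the ISA check above and keeps the call.  */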
7074 \f
7075 /* Remember the last target of ix86_set_current_function. */
7076 static GTY(()) tree ix86_previous_fndecl;
7077
7078 /* Set targets globals to the default (or current #pragma GCC target
7079 if active). Invalidate ix86_previous_fndecl cache. */
7080
7081 void
7082 ix86_reset_previous_fndecl (void)
7083 {
7084 tree new_tree = target_option_current_node;
7085 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7086 if (TREE_TARGET_GLOBALS (new_tree))
7087 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7088 else if (new_tree == target_option_default_node)
7089 restore_target_globals (&default_target_globals);
7090 else
7091 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7092 ix86_previous_fndecl = NULL_TREE;
7093 }
7094
7095 /* Set the func_type field from the function FNDECL. */
7096
7097 static void
7098 ix86_set_func_type (tree fndecl)
7099 {
7100 if (cfun->machine->func_type == TYPE_UNKNOWN)
7101 {
7102 if (lookup_attribute ("interrupt",
7103 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7104 {
7105 int nargs = 0;
7106 for (tree arg = DECL_ARGUMENTS (fndecl);
7107 arg;
7108 arg = TREE_CHAIN (arg))
7109 nargs++;
7110 cfun->machine->no_caller_saved_registers = true;
7111 cfun->machine->func_type
7112 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7113
7114 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7115
7116 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7117 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7118 sorry ("Only DWARF debug format is supported for interrupt "
7119 "service routine.");
7120 }
7121 else
7122 {
7123 cfun->machine->func_type = TYPE_NORMAL;
7124 if (lookup_attribute ("no_caller_saved_registers",
7125 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7126 cfun->machine->no_caller_saved_registers = true;
7127 }
7128 }
7129 }
7130
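/* For illustration, the two shapes recognized above; the argument count
   (one vs. two) is what distinguishes TYPE_INTERRUPT from TYPE_EXCEPTION.
   The handler names are invented; the second parameter is the word-sized
   error code (uword_t in the documentation):

     struct interrupt_frame;

     __attribute__((interrupt))
     void isr (struct interrupt_frame *frame) {}

     __attribute__((interrupt))
     void fault_handler (struct interrupt_frame *frame, unsigned long error_code) {}

   Both also imply no_caller_saved_registers, as set just above.  */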
7131 /* Establish appropriate back-end context for processing the function
7132 FNDECL. The argument might be NULL to indicate processing at top
7133 level, outside of any function scope. */
7134 static void
7135 ix86_set_current_function (tree fndecl)
7136 {
7137 /* Only change the context if the function changes. This hook is called
7138 several times in the course of compiling a function, and we don't want to
7139 slow things down too much or call target_reinit when it isn't safe. */
7140 if (fndecl == ix86_previous_fndecl)
7141 {
7142 /* There may be 2 function bodies for the same function FNDECL,
7143 one is extern inline and one isn't. Call ix86_set_func_type
7144 to set the func_type field. */
7145 if (fndecl != NULL_TREE)
7146 ix86_set_func_type (fndecl);
7147 return;
7148 }
7149
7150 tree old_tree;
7151 if (ix86_previous_fndecl == NULL_TREE)
7152 old_tree = target_option_current_node;
7153 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7154 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7155 else
7156 old_tree = target_option_default_node;
7157
7158 if (fndecl == NULL_TREE)
7159 {
7160 if (old_tree != target_option_current_node)
7161 ix86_reset_previous_fndecl ();
7162 return;
7163 }
7164
7165 ix86_set_func_type (fndecl);
7166
7167 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7168 if (new_tree == NULL_TREE)
7169 new_tree = target_option_default_node;
7170
7171 if (old_tree != new_tree)
7172 {
7173 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7174 if (TREE_TARGET_GLOBALS (new_tree))
7175 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7176 else if (new_tree == target_option_default_node)
7177 restore_target_globals (&default_target_globals);
7178 else
7179 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7180 }
7181 ix86_previous_fndecl = fndecl;
7182
7183 static bool prev_no_caller_saved_registers;
7184
7185 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7186 Avoid expensive re-initialization of init_regs each time we switch
7187 function context. */
7188 if (TARGET_64BIT
7189 && (call_used_regs[SI_REG]
7190 == (cfun->machine->call_abi == MS_ABI)))
7191 reinit_regs ();
7192 /* Need to re-initialize init_regs if caller-saved registers are
7193 changed. */
7194 else if (prev_no_caller_saved_registers
7195 != cfun->machine->no_caller_saved_registers)
7196 reinit_regs ();
7197
7198 if (cfun->machine->func_type != TYPE_NORMAL
7199 || cfun->machine->no_caller_saved_registers)
7200 {
7201 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7202 may change processor state. */
7203 const char *isa;
7204 if (TARGET_MPX)
7205 isa = "MPX";
7206 else if (TARGET_SSE)
7207 isa = "SSE";
7208 else if (TARGET_MMX)
7209 isa = "MMX/3Dnow";
7210 else if (TARGET_80387)
7211 isa = "80387";
7212 else
7213 isa = NULL;
7214 if (isa != NULL)
7215 {
7216 if (cfun->machine->func_type != TYPE_NORMAL)
7217 sorry ("%s instructions aren't allowed in %s service routine",
7218 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7219 ? "exception" : "interrupt"));
7220 else
7221 sorry ("%s instructions aren't allowed in function with "
7222 "no_caller_saved_registers attribute", isa);
7223 /* Don't issue the same error twice. */
7224 cfun->machine->func_type = TYPE_NORMAL;
7225 cfun->machine->no_caller_saved_registers = false;
7226 }
7227 }
7228
7229 prev_no_caller_saved_registers
7230 = cfun->machine->no_caller_saved_registers;
7231 }
7232
7233 \f
7234 /* Return true if this goes in large data/bss. */
7235
7236 static bool
7237 ix86_in_large_data_p (tree exp)
7238 {
7239 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7240 return false;
7241
7242 if (exp == NULL_TREE)
7243 return false;
7244
7245 /* Functions are never large data. */
7246 if (TREE_CODE (exp) == FUNCTION_DECL)
7247 return false;
7248
7249 /* Automatic variables are never large data. */
7250 if (VAR_P (exp) && !is_global_var (exp))
7251 return false;
7252
7253 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7254 {
7255 const char *section = DECL_SECTION_NAME (exp);
7256 if (strcmp (section, ".ldata") == 0
7257 || strcmp (section, ".lbss") == 0)
7258 return true;
7259 return false;
7260 }
7261 else
7262 {
7263 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7264
7265 /* If this is an incomplete type with size 0, then we can't put it
7266 in data because it might be too big when completed. Also,
7267 int_size_in_bytes returns -1 if the size can vary or is larger than
7268 an integer, in which case it is also safer to assume that it goes in
7269 large data. */
7270 if (size <= 0 || size > ix86_section_threshold)
7271 return true;
7272 }
7273
7274 return false;
7275 }
7276
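/* For illustration, a sketch of what lands in large data under
   -mcmodel=medium (variable names are invented examples): anything whose
   size exceeds the -mlarge-data-threshold value (ix86_section_threshold),
   plus objects placed there explicitly.

     static char big_table[1 << 20];                          // over the threshold
     static int small_counter;                                // stays in ordinary .bss
     static int pinned __attribute__((section (".ldata")));   // explicit .ldata

   big_table and pinned satisfy ix86_in_large_data_p; small_counter does
   not.  */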
7277 /* i386-specific section flag to mark large sections. */
7278 #define SECTION_LARGE SECTION_MACH_DEP
7279
7280 /* Switch to the appropriate section for output of DECL.
7281 DECL is either a `VAR_DECL' node or a constant of some sort.
7282 RELOC indicates whether forming the initial value of DECL requires
7283 link-time relocations. */
7284
7285 ATTRIBUTE_UNUSED static section *
7286 x86_64_elf_select_section (tree decl, int reloc,
7287 unsigned HOST_WIDE_INT align)
7288 {
7289 if (ix86_in_large_data_p (decl))
7290 {
7291 const char *sname = NULL;
7292 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7293 switch (categorize_decl_for_section (decl, reloc))
7294 {
7295 case SECCAT_DATA:
7296 sname = ".ldata";
7297 break;
7298 case SECCAT_DATA_REL:
7299 sname = ".ldata.rel";
7300 break;
7301 case SECCAT_DATA_REL_LOCAL:
7302 sname = ".ldata.rel.local";
7303 break;
7304 case SECCAT_DATA_REL_RO:
7305 sname = ".ldata.rel.ro";
7306 break;
7307 case SECCAT_DATA_REL_RO_LOCAL:
7308 sname = ".ldata.rel.ro.local";
7309 break;
7310 case SECCAT_BSS:
7311 sname = ".lbss";
7312 flags |= SECTION_BSS;
7313 break;
7314 case SECCAT_RODATA:
7315 case SECCAT_RODATA_MERGE_STR:
7316 case SECCAT_RODATA_MERGE_STR_INIT:
7317 case SECCAT_RODATA_MERGE_CONST:
7318 sname = ".lrodata";
7319 flags &= ~SECTION_WRITE;
7320 break;
7321 case SECCAT_SRODATA:
7322 case SECCAT_SDATA:
7323 case SECCAT_SBSS:
7324 gcc_unreachable ();
7325 case SECCAT_TEXT:
7326 case SECCAT_TDATA:
7327 case SECCAT_TBSS:
7328 /* We don't split these for medium model. Place them into
7329 default sections and hope for the best. */
7330 break;
7331 }
7332 if (sname)
7333 {
7334 /* We might get called with string constants, but get_named_section
7335 doesn't like them as they are not DECLs. Also, we need to set
7336 flags in that case. */
7337 if (!DECL_P (decl))
7338 return get_section (sname, flags, NULL);
7339 return get_named_section (decl, sname, reloc);
7340 }
7341 }
7342 return default_elf_select_section (decl, reloc, align);
7343 }
7344
7345 /* Select a set of attributes for section NAME based on the properties
7346 of DECL and whether or not RELOC indicates that DECL's initializer
7347 might contain runtime relocations. */
7348
7349 static unsigned int ATTRIBUTE_UNUSED
7350 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7351 {
7352 unsigned int flags = default_section_type_flags (decl, name, reloc);
7353
7354 if (ix86_in_large_data_p (decl))
7355 flags |= SECTION_LARGE;
7356
7357 if (decl == NULL_TREE
7358 && (strcmp (name, ".ldata.rel.ro") == 0
7359 || strcmp (name, ".ldata.rel.ro.local") == 0))
7360 flags |= SECTION_RELRO;
7361
7362 if (strcmp (name, ".lbss") == 0
7363 || strncmp (name, ".lbss.", 6) == 0
7364 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7365 flags |= SECTION_BSS;
7366
7367 return flags;
7368 }
7369
7370 /* Build up a unique section name, expressed as a
7371 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7372 RELOC indicates whether the initial value of EXP requires
7373 link-time relocations. */
7374
7375 static void ATTRIBUTE_UNUSED
7376 x86_64_elf_unique_section (tree decl, int reloc)
7377 {
7378 if (ix86_in_large_data_p (decl))
7379 {
7380 const char *prefix = NULL;
7381 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7382 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7383
7384 switch (categorize_decl_for_section (decl, reloc))
7385 {
7386 case SECCAT_DATA:
7387 case SECCAT_DATA_REL:
7388 case SECCAT_DATA_REL_LOCAL:
7389 case SECCAT_DATA_REL_RO:
7390 case SECCAT_DATA_REL_RO_LOCAL:
7391 prefix = one_only ? ".ld" : ".ldata";
7392 break;
7393 case SECCAT_BSS:
7394 prefix = one_only ? ".lb" : ".lbss";
7395 break;
7396 case SECCAT_RODATA:
7397 case SECCAT_RODATA_MERGE_STR:
7398 case SECCAT_RODATA_MERGE_STR_INIT:
7399 case SECCAT_RODATA_MERGE_CONST:
7400 prefix = one_only ? ".lr" : ".lrodata";
7401 break;
7402 case SECCAT_SRODATA:
7403 case SECCAT_SDATA:
7404 case SECCAT_SBSS:
7405 gcc_unreachable ();
7406 case SECCAT_TEXT:
7407 case SECCAT_TDATA:
7408 case SECCAT_TBSS:
7409 /* We don't split these for medium model. Place them into
7410 default sections and hope for the best. */
7411 break;
7412 }
7413 if (prefix)
7414 {
7415 const char *name, *linkonce;
7416 char *string;
7417
7418 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7419 name = targetm.strip_name_encoding (name);
7420
7421 /* If we're using one_only, then there needs to be a .gnu.linkonce
7422 prefix to the section name. */
7423 linkonce = one_only ? ".gnu.linkonce" : "";
7424
7425 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7426
7427 set_decl_section_name (decl, string);
7428 return;
7429 }
7430 }
7431 default_unique_section (decl, reloc);
7432 }
7433
7434 #ifdef COMMON_ASM_OP
7435
7436 #ifndef LARGECOMM_SECTION_ASM_OP
7437 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7438 #endif
7439
7440 /* This says how to output assembler code to declare an
7441 uninitialized external linkage data object.
7442
7443 For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for
7444 large objects. */
7445 void
7446 x86_elf_aligned_decl_common (FILE *file, tree decl,
7447 const char *name, unsigned HOST_WIDE_INT size,
7448 int align)
7449 {
7450 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7451 && size > (unsigned int)ix86_section_threshold)
7452 {
7453 switch_to_section (get_named_section (decl, ".lbss", 0));
7454 fputs (LARGECOMM_SECTION_ASM_OP, file);
7455 }
7456 else
7457 fputs (COMMON_ASM_OP, file);
7458 assemble_name (file, name);
7459 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7460 size, align / BITS_PER_UNIT);
7461 }
7462 #endif
7463
7464 /* Utility function for targets to use in implementing
7465 ASM_OUTPUT_ALIGNED_BSS. */
7466
7467 void
7468 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7469 unsigned HOST_WIDE_INT size, int align)
7470 {
7471 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7472 && size > (unsigned int)ix86_section_threshold)
7473 switch_to_section (get_named_section (decl, ".lbss", 0));
7474 else
7475 switch_to_section (bss_section);
7476 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7477 #ifdef ASM_DECLARE_OBJECT_NAME
7478 last_assemble_variable_decl = decl;
7479 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7480 #else
7481 /* Standard thing is just output label for the object. */
7482 ASM_OUTPUT_LABEL (file, name);
7483 #endif /* ASM_DECLARE_OBJECT_NAME */
7484 ASM_OUTPUT_SKIP (file, size ? size : 1);
7485 }
7486 \f
7487 /* Decide whether we must probe the stack before any space allocation
7488 on this target. It's essentially TARGET_STACK_PROBE except when
7489 -fstack-check causes the stack to be already probed differently. */
7490
7491 bool
7492 ix86_target_stack_probe (void)
7493 {
7494 /* Do not probe the stack twice if static stack checking is enabled. */
7495 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7496 return false;
7497
7498 return TARGET_STACK_PROBE;
7499 }
7500 \f
7501 /* Decide whether we can make a sibling call to a function. DECL is the
7502 declaration of the function being targeted by the call and EXP is the
7503 CALL_EXPR representing the call. */
7504
7505 static bool
7506 ix86_function_ok_for_sibcall (tree decl, tree exp)
7507 {
7508 tree type, decl_or_type;
7509 rtx a, b;
7510 bool bind_global = decl && !targetm.binds_local_p (decl);
7511
7512 /* Sibling call isn't OK if there are no caller-saved registers
7513 since all registers must be preserved before return. */
7514 if (cfun->machine->no_caller_saved_registers)
7515 return false;
7516
7517 /* If we are generating position-independent code, we cannot sibcall
7518 optimize direct calls to global functions, as the PLT requires
7519 %ebx be live. (Darwin does not have a PLT.) */
7520 if (!TARGET_MACHO
7521 && !TARGET_64BIT
7522 && flag_pic
7523 && flag_plt
7524 && bind_global)
7525 return false;
7526
7527 /* If we need to align the outgoing stack, then sibcalling would
7528 unalign the stack, which may break the called function. */
7529 if (ix86_minimum_incoming_stack_boundary (true)
7530 < PREFERRED_STACK_BOUNDARY)
7531 return false;
7532
7533 if (decl)
7534 {
7535 decl_or_type = decl;
7536 type = TREE_TYPE (decl);
7537 }
7538 else
7539 {
7540 /* We're looking at the CALL_EXPR, we need the type of the function. */
7541 type = CALL_EXPR_FN (exp); /* pointer expression */
7542 type = TREE_TYPE (type); /* pointer type */
7543 type = TREE_TYPE (type); /* function type */
7544 decl_or_type = type;
7545 }
7546
7547 /* Check that the return value locations are the same. Like
7548 if we are returning floats on the 80387 register stack, we cannot
7549 make a sibcall from a function that doesn't return a float to a
7550 function that does or, conversely, from a function that does return
7551 a float to a function that doesn't; the necessary stack adjustment
7552 would not be executed. This is also the place we notice
7553 differences in the return value ABI. Note that it is ok for one
7554 of the functions to have void return type as long as the return
7555 value of the other is passed in a register. */
7556 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7557 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7558 cfun->decl, false);
7559 if (STACK_REG_P (a) || STACK_REG_P (b))
7560 {
7561 if (!rtx_equal_p (a, b))
7562 return false;
7563 }
7564 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7565 ;
7566 else if (!rtx_equal_p (a, b))
7567 return false;
7568
7569 if (TARGET_64BIT)
7570 {
7571 /* The SYSV ABI has more call-clobbered registers;
7572 disallow sibcalls from MS to SYSV. */
7573 if (cfun->machine->call_abi == MS_ABI
7574 && ix86_function_type_abi (type) == SYSV_ABI)
7575 return false;
7576 }
7577 else
7578 {
7579 /* If this call is indirect, we'll need to be able to use a
7580 call-clobbered register for the address of the target function.
7581 Make sure that all such registers are not used for passing
7582 parameters. Note that DLLIMPORT functions and calls to global
7583 functions via the GOT slot are indirect.
7584 if (!decl
7585 || (bind_global && flag_pic && !flag_plt)
7586 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7587 {
7588 /* Check if regparm >= 3 since arg_reg_available is set to
7589 false if regparm == 0. If regparm is 1 or 2, there is
7590 always a call-clobbered register available.
7591
7592 ??? The symbol indirect call doesn't need a call-clobbered
7593 register. But we don't know if this is a symbol indirect
7594 call or not here. */
7595 if (ix86_function_regparm (type, NULL) >= 3
7596 && !cfun->machine->arg_reg_available)
7597 return false;
7598 }
7599 }
7600
7601 /* Otherwise okay. That also includes certain types of indirect calls. */
7602 return true;
7603 }
7604
7605 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7606 and "sseregparm" calling convention attributes;
7607 arguments as in struct attribute_spec.handler. */
7608
7609 static tree
7610 ix86_handle_cconv_attribute (tree *node, tree name,
7611 tree args,
7612 int,
7613 bool *no_add_attrs)
7614 {
7615 if (TREE_CODE (*node) != FUNCTION_TYPE
7616 && TREE_CODE (*node) != METHOD_TYPE
7617 && TREE_CODE (*node) != FIELD_DECL
7618 && TREE_CODE (*node) != TYPE_DECL)
7619 {
7620 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7621 name);
7622 *no_add_attrs = true;
7623 return NULL_TREE;
7624 }
7625
7626 /* Can combine regparm with all attributes but fastcall, and thiscall. */
7627 if (is_attribute_p ("regparm", name))
7628 {
7629 tree cst;
7630
7631 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7632 {
7633 error ("fastcall and regparm attributes are not compatible");
7634 }
7635
7636 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7637 {
7638 error ("regparm and thiscall attributes are not compatible");
7639 }
7640
7641 cst = TREE_VALUE (args);
7642 if (TREE_CODE (cst) != INTEGER_CST)
7643 {
7644 warning (OPT_Wattributes,
7645 "%qE attribute requires an integer constant argument",
7646 name);
7647 *no_add_attrs = true;
7648 }
7649 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7650 {
7651 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7652 name, REGPARM_MAX);
7653 *no_add_attrs = true;
7654 }
7655
7656 return NULL_TREE;
7657 }
7658
7659 if (TARGET_64BIT)
7660 {
7661 /* Do not warn when emulating the MS ABI. */
7662 if ((TREE_CODE (*node) != FUNCTION_TYPE
7663 && TREE_CODE (*node) != METHOD_TYPE)
7664 || ix86_function_type_abi (*node) != MS_ABI)
7665 warning (OPT_Wattributes, "%qE attribute ignored",
7666 name);
7667 *no_add_attrs = true;
7668 return NULL_TREE;
7669 }
7670
7671 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7672 if (is_attribute_p ("fastcall", name))
7673 {
7674 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7675 {
7676 error ("fastcall and cdecl attributes are not compatible");
7677 }
7678 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7679 {
7680 error ("fastcall and stdcall attributes are not compatible");
7681 }
7682 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7683 {
7684 error ("fastcall and regparm attributes are not compatible");
7685 }
7686 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7687 {
7688 error ("fastcall and thiscall attributes are not compatible");
7689 }
7690 }
7691
7692 /* Can combine stdcall with fastcall (redundant), regparm and
7693 sseregparm. */
7694 else if (is_attribute_p ("stdcall", name))
7695 {
7696 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7697 {
7698 error ("stdcall and cdecl attributes are not compatible");
7699 }
7700 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7701 {
7702 error ("stdcall and fastcall attributes are not compatible");
7703 }
7704 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7705 {
7706 error ("stdcall and thiscall attributes are not compatible");
7707 }
7708 }
7709
7710 /* Can combine cdecl with regparm and sseregparm. */
7711 else if (is_attribute_p ("cdecl", name))
7712 {
7713 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7714 {
7715 error ("stdcall and cdecl attributes are not compatible");
7716 }
7717 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7718 {
7719 error ("fastcall and cdecl attributes are not compatible");
7720 }
7721 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7722 {
7723 error ("cdecl and thiscall attributes are not compatible");
7724 }
7725 }
7726 else if (is_attribute_p ("thiscall", name))
7727 {
7728 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7729 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7730 name);
7731 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7732 {
7733 error ("stdcall and thiscall attributes are not compatible");
7734 }
7735 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7736 {
7737 error ("fastcall and thiscall attributes are not compatible");
7738 }
7739 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7740 {
7741 error ("cdecl and thiscall attributes are not compatible");
7742 }
7743 }
7744
7745 /* Can combine sseregparm with all attributes. */
7746
7747 return NULL_TREE;
7748 }
7749
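/* For illustration, the 32-bit calling-convention attributes that the
   handler above validates, and one of the rejected combinations (function
   names are invented examples):

     __attribute__((regparm(3)))  int f3 (int a, int b, int c);   // a,b,c in EAX,EDX,ECX
     __attribute__((fastcall))    int fc (int a, int b);          // a,b in ECX,EDX
     __attribute__((stdcall))     int sc (int a, int b);          // callee pops the stack

     __attribute__((fastcall, regparm(2))) int bad (int a);       // error: not compatible

   On 64-bit targets the whole group is ignored (with a warning unless the
   MS ABI is being emulated), as coded above.  */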
7750 /* The transactional memory builtins are implicitly regparm or fastcall
7751 depending on the ABI. Override the generic do-nothing attribute that
7752 these builtins were declared with, and replace it with one of the two
7753 attributes that we expect elsewhere. */
7754
7755 static tree
7756 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7757 int flags, bool *no_add_attrs)
7758 {
7759 tree alt;
7760
7761 /* In no case do we want to add the placeholder attribute. */
7762 *no_add_attrs = true;
7763
7764 /* The 64-bit ABI is unchanged for transactional memory. */
7765 if (TARGET_64BIT)
7766 return NULL_TREE;
7767
7768 /* ??? Is there a better way to validate 32-bit windows? We have
7769 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7770 if (CHECK_STACK_LIMIT > 0)
7771 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7772 else
7773 {
7774 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7775 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7776 }
7777 decl_attributes (node, alt, flags);
7778
7779 return NULL_TREE;
7780 }
7781
7782 /* This function determines from TYPE the calling-convention. */
7783
7784 unsigned int
7785 ix86_get_callcvt (const_tree type)
7786 {
7787 unsigned int ret = 0;
7788 bool is_stdarg;
7789 tree attrs;
7790
7791 if (TARGET_64BIT)
7792 return IX86_CALLCVT_CDECL;
7793
7794 attrs = TYPE_ATTRIBUTES (type);
7795 if (attrs != NULL_TREE)
7796 {
7797 if (lookup_attribute ("cdecl", attrs))
7798 ret |= IX86_CALLCVT_CDECL;
7799 else if (lookup_attribute ("stdcall", attrs))
7800 ret |= IX86_CALLCVT_STDCALL;
7801 else if (lookup_attribute ("fastcall", attrs))
7802 ret |= IX86_CALLCVT_FASTCALL;
7803 else if (lookup_attribute ("thiscall", attrs))
7804 ret |= IX86_CALLCVT_THISCALL;
7805
7806 /* Regparm isn't allowed for thiscall and fastcall. */
7807 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7808 {
7809 if (lookup_attribute ("regparm", attrs))
7810 ret |= IX86_CALLCVT_REGPARM;
7811 if (lookup_attribute ("sseregparm", attrs))
7812 ret |= IX86_CALLCVT_SSEREGPARM;
7813 }
7814
7815 if (IX86_BASE_CALLCVT(ret) != 0)
7816 return ret;
7817 }
7818
7819 is_stdarg = stdarg_p (type);
7820 if (TARGET_RTD && !is_stdarg)
7821 return IX86_CALLCVT_STDCALL | ret;
7822
7823 if (ret != 0
7824 || is_stdarg
7825 || TREE_CODE (type) != METHOD_TYPE
7826 || ix86_function_type_abi (type) != MS_ABI)
7827 return IX86_CALLCVT_CDECL | ret;
7828
7829 return IX86_CALLCVT_THISCALL;
7830 }
7831
7832 /* Return 0 if the attributes for two types are incompatible, 1 if they
7833 are compatible, and 2 if they are nearly compatible (which causes a
7834 warning to be generated). */
7835
7836 static int
7837 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7838 {
7839 unsigned int ccvt1, ccvt2;
7840
7841 if (TREE_CODE (type1) != FUNCTION_TYPE
7842 && TREE_CODE (type1) != METHOD_TYPE)
7843 return 1;
7844
7845 ccvt1 = ix86_get_callcvt (type1);
7846 ccvt2 = ix86_get_callcvt (type2);
7847 if (ccvt1 != ccvt2)
7848 return 0;
7849 if (ix86_function_regparm (type1, NULL)
7850 != ix86_function_regparm (type2, NULL))
7851 return 0;
7852
7853 return 1;
7854 }
7855 \f
7856 /* Return the regparm value for a function with the indicated TYPE and DECL.
7857 DECL may be NULL when calling function indirectly
7858 or considering a libcall. */
7859
7860 static int
7861 ix86_function_regparm (const_tree type, const_tree decl)
7862 {
7863 tree attr;
7864 int regparm;
7865 unsigned int ccvt;
7866
7867 if (TARGET_64BIT)
7868 return (ix86_function_type_abi (type) == SYSV_ABI
7869 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7870 ccvt = ix86_get_callcvt (type);
7871 regparm = ix86_regparm;
7872
7873 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7874 {
7875 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7876 if (attr)
7877 {
7878 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7879 return regparm;
7880 }
7881 }
7882 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7883 return 2;
7884 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7885 return 1;
7886
7887 /* Use register calling convention for local functions when possible. */
7888 if (decl
7889 && TREE_CODE (decl) == FUNCTION_DECL)
7890 {
7891 cgraph_node *target = cgraph_node::get (decl);
7892 if (target)
7893 target = target->function_symbol ();
7894
7895 /* Caller and callee must agree on the calling convention, so
7896 checking just the current function's optimize setting here would mean
7897 that with __attribute__((optimize (...))) the caller could use the
7898 regparm convention and the callee not, or vice versa. Instead look at
7899 whether the callee is optimized or not. */
7900 if (target && opt_for_fn (target->decl, optimize)
7901 && !(profile_flag && !flag_fentry))
7902 {
7903 cgraph_local_info *i = &target->local;
7904 if (i && i->local && i->can_change_signature)
7905 {
7906 int local_regparm, globals = 0, regno;
7907
7908 /* Make sure no regparm register is taken by a
7909 fixed register variable. */
7910 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7911 local_regparm++)
7912 if (fixed_regs[local_regparm])
7913 break;
7914
7915 /* We don't want to use regparm(3) for nested functions as
7916 these use a static chain pointer in the third argument. */
7917 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7918 local_regparm = 2;
7919
7920 /* Save a register for the split stack. */
7921 if (local_regparm == 3 && flag_split_stack)
7922 local_regparm = 2;
7923
7924 /* Each fixed register usage increases register pressure,
7925 so fewer registers should be used for argument passing.
7926 This functionality can be overridden by an explicit
7927 regparm value. */
7928 for (regno = AX_REG; regno <= DI_REG; regno++)
7929 if (fixed_regs[regno])
7930 globals++;
7931
7932 local_regparm
7933 = globals < local_regparm ? local_regparm - globals : 0;
7934
7935 if (local_regparm > regparm)
7936 regparm = local_regparm;
7937 }
7938 }
7939 }
7940
7941 return regparm;
7942 }
7943
7944 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7945 DFmode (2) arguments in SSE registers for a function with the
7946 indicated TYPE and DECL. DECL may be NULL when calling function
7947 indirectly or considering a libcall. Return -1 if any FP parameter
7948 should be rejected by error. This is used in situations where we imply the
7949 SSE calling convention but the function is called from another function
7950 with SSE disabled. Otherwise return 0. */
7951
7952 static int
7953 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
7954 {
7955 gcc_assert (!TARGET_64BIT);
7956
7957 /* Use SSE registers to pass SFmode and DFmode arguments if requested
7958 by the sseregparm attribute. */
7959 if (TARGET_SSEREGPARM
7960 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
7961 {
7962 if (!TARGET_SSE)
7963 {
7964 if (warn)
7965 {
7966 if (decl)
7967 error ("calling %qD with attribute sseregparm without "
7968 "SSE/SSE2 enabled", decl);
7969 else
7970 error ("calling %qT with attribute sseregparm without "
7971 "SSE/SSE2 enabled", type);
7972 }
7973 return 0;
7974 }
7975
7976 return 2;
7977 }
7978
7979 if (!decl)
7980 return 0;
7981
7982 cgraph_node *target = cgraph_node::get (decl);
7983 if (target)
7984 target = target->function_symbol ();
7985
7986 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
7987 (and DFmode for SSE2) arguments in SSE registers. */
7988 if (target
7989 /* TARGET_SSE_MATH */
7990 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
7991 && opt_for_fn (target->decl, optimize)
7992 && !(profile_flag && !flag_fentry))
7993 {
7994 cgraph_local_info *i = &target->local;
7995 if (i && i->local && i->can_change_signature)
7996 {
7997 /* Refuse to produce wrong code when a local function with SSE enabled
7998 is called from an SSE-disabled function.
7999 FIXME: We need a way to detect these cases cross-ltrans partition
8000 and avoid using SSE calling conventions on local functions called
8001 from function with SSE disabled. For now at least delay the
8002 warning until we know we are going to produce wrong code.
8003 See PR66047 */
8004 if (!TARGET_SSE && warn)
8005 return -1;
8006 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8007 ->x_ix86_isa_flags) ? 2 : 1;
8008 }
8009 }
8010
8011 return 0;
8012 }
8013
8014 /* Return true if EAX is live at the start of the function. Used by
8015 ix86_expand_prologue to determine if we need special help before
8016 calling allocate_stack_worker. */
8017
8018 static bool
8019 ix86_eax_live_at_start_p (void)
8020 {
8021 /* Cheat. Don't bother working forward from ix86_function_regparm
8022 to the function type to whether an actual argument is located in
8023 eax. Instead just look at cfg info, which is still close enough
8024 to correct at this point. This gives false positives for broken
8025 functions that might use uninitialized data that happens to be
8026 allocated in eax, but who cares? */
8027 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8028 }
8029
8030 static bool
8031 ix86_keep_aggregate_return_pointer (tree fntype)
8032 {
8033 tree attr;
8034
8035 if (!TARGET_64BIT)
8036 {
8037 attr = lookup_attribute ("callee_pop_aggregate_return",
8038 TYPE_ATTRIBUTES (fntype));
8039 if (attr)
8040 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8041
8042 /* For 32-bit MS-ABI the default is to keep aggregate
8043 return pointer. */
8044 if (ix86_function_type_abi (fntype) == MS_ABI)
8045 return true;
8046 }
8047 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8048 }
8049
8050 /* Value is the number of bytes of arguments automatically
8051 popped when returning from a subroutine call.
8052 FUNDECL is the declaration node of the function (as a tree),
8053 FUNTYPE is the data type of the function (as a tree),
8054 or for a library call it is an identifier node for the subroutine name.
8055 SIZE is the number of bytes of arguments passed on the stack.
8056
8057 On the 80386, the RTD insn may be used to pop them if the number
8058 of args is fixed, but if the number is variable then the caller
8059 must pop them all. RTD can't be used for library calls now
8060 because the library is compiled with the Unix compiler.
8061 Use of RTD is a selectable option, since it is incompatible with
8062 standard Unix calling sequences. If the option is not selected,
8063 the caller must always pop the args.
8064
8065 The attribute stdcall is equivalent to RTD on a per module basis. */
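
/* A concrete (illustrative) case: for

     void __attribute__ ((stdcall)) f (int a, int b);

   compiled with -m32, the callee pops its 8 bytes of stack arguments
   itself (e.g. via "ret $8"), so this hook returns SIZE (8); for a
   cdecl or variadic function it returns 0 and the caller pops.  */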
8066
8067 static int
8068 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8069 {
8070 unsigned int ccvt;
8071
8072 /* None of the 64-bit ABIs pop arguments. */
8073 if (TARGET_64BIT)
8074 return 0;
8075
8076 ccvt = ix86_get_callcvt (funtype);
8077
8078 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8079 | IX86_CALLCVT_THISCALL)) != 0
8080 && ! stdarg_p (funtype))
8081 return size;
8082
8083 /* Lose any fake structure return argument if it is passed on the stack. */
8084 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8085 && !ix86_keep_aggregate_return_pointer (funtype))
8086 {
8087 int nregs = ix86_function_regparm (funtype, fundecl);
8088 if (nregs == 0)
8089 return GET_MODE_SIZE (Pmode);
8090 }
8091
8092 return 0;
8093 }
8094
8095 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8096
8097 static bool
8098 ix86_legitimate_combined_insn (rtx_insn *insn)
8099 {
8100 /* Check operand constraints in case hard registers were propagated
8101 into insn pattern. This check prevents combine pass from
8102 generating insn patterns with invalid hard register operands.
8103 These invalid insns can eventually confuse reload to error out
8104 with a spill failure. See also PRs 46829 and 46843. */
8105 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
8106 {
8107 int i;
8108
8109 extract_insn (insn);
8110 preprocess_constraints (insn);
8111
8112 int n_operands = recog_data.n_operands;
8113 int n_alternatives = recog_data.n_alternatives;
8114 for (i = 0; i < n_operands; i++)
8115 {
8116 rtx op = recog_data.operand[i];
8117 machine_mode mode = GET_MODE (op);
8118 const operand_alternative *op_alt;
8119 int offset = 0;
8120 bool win;
8121 int j;
8122
8123 /* A unary operator may be accepted by the predicate, but it
8124 is irrelevant for matching constraints. */
8125 if (UNARY_P (op))
8126 op = XEXP (op, 0);
8127
8128 if (SUBREG_P (op))
8129 {
8130 if (REG_P (SUBREG_REG (op))
8131 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8132 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8133 GET_MODE (SUBREG_REG (op)),
8134 SUBREG_BYTE (op),
8135 GET_MODE (op));
8136 op = SUBREG_REG (op);
8137 }
8138
8139 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8140 continue;
8141
8142 op_alt = recog_op_alt;
8143
8144 /* Operand has no constraints, anything is OK. */
8145 win = !n_alternatives;
8146
8147 alternative_mask preferred = get_preferred_alternatives (insn);
8148 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8149 {
8150 if (!TEST_BIT (preferred, j))
8151 continue;
8152 if (op_alt[i].anything_ok
8153 || (op_alt[i].matches != -1
8154 && operands_match_p
8155 (recog_data.operand[i],
8156 recog_data.operand[op_alt[i].matches]))
8157 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8158 {
8159 win = true;
8160 break;
8161 }
8162 }
8163
8164 if (!win)
8165 return false;
8166 }
8167 }
8168
8169 return true;
8170 }
8171 \f
8172 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8173
8174 static unsigned HOST_WIDE_INT
8175 ix86_asan_shadow_offset (void)
8176 {
8177 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8178 : HOST_WIDE_INT_C (0x7fff8000))
8179 : (HOST_WIDE_INT_1 << 29);
8180 }
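
/* For reference (a sketch, not part of the hook itself): ASan forms the
   shadow address of ADDR roughly as

     shadow = (addr >> ASAN_SHADOW_SHIFT) + ix86_asan_shadow_offset ()

   so on x86-64 Linux one shadow byte at (addr >> 3) + 0x7fff8000
   describes eight bytes of application memory.  */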
8181 \f
8182 /* Argument support functions. */
8183
8184 /* Return true when register may be used to pass function parameters. */
8185 bool
8186 ix86_function_arg_regno_p (int regno)
8187 {
8188 int i;
8189 enum calling_abi call_abi;
8190 const int *parm_regs;
8191
8192 if (TARGET_MPX && BND_REGNO_P (regno))
8193 return true;
8194
8195 if (!TARGET_64BIT)
8196 {
8197 if (TARGET_MACHO)
8198 return (regno < REGPARM_MAX
8199 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8200 else
8201 return (regno < REGPARM_MAX
8202 || (TARGET_MMX && MMX_REGNO_P (regno)
8203 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8204 || (TARGET_SSE && SSE_REGNO_P (regno)
8205 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8206 }
8207
8208 if (TARGET_SSE && SSE_REGNO_P (regno)
8209 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8210 return true;
8211
8212 /* TODO: The function should depend on the current function's ABI, but
8213 builtins.c would then need updating.  Therefore we use the
8214 default ABI.  */
8215 call_abi = ix86_cfun_abi ();
8216
8217 /* RAX is used as hidden argument to va_arg functions. */
8218 if (call_abi == SYSV_ABI && regno == AX_REG)
8219 return true;
8220
8221 if (call_abi == MS_ABI)
8222 parm_regs = x86_64_ms_abi_int_parameter_registers;
8223 else
8224 parm_regs = x86_64_int_parameter_registers;
8225
8226 for (i = 0; i < (call_abi == MS_ABI
8227 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8228 if (regno == parm_regs[i])
8229 return true;
8230 return false;
8231 }
8232
8233 /* Return true if we do not know how to pass TYPE solely in registers.  */
8234
8235 static bool
8236 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8237 {
8238 if (must_pass_in_stack_var_size_or_pad (mode, type))
8239 return true;
8240
8241 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8242 The layout_type routine is crafty and tries to trick us into passing
8243 currently unsupported vector types on the stack by using TImode. */
8244 return (!TARGET_64BIT && mode == TImode
8245 && type && TREE_CODE (type) != VECTOR_TYPE);
8246 }
8247
8248 /* Return the size, in bytes, of the area reserved for arguments passed
8249 in registers for the function represented by FNDECL, depending on the
8250 ABI format used.  */
8251 int
8252 ix86_reg_parm_stack_space (const_tree fndecl)
8253 {
8254 enum calling_abi call_abi = SYSV_ABI;
8255 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8256 call_abi = ix86_function_abi (fndecl);
8257 else
8258 call_abi = ix86_function_type_abi (fndecl);
8259 if (TARGET_64BIT && call_abi == MS_ABI)
8260 return 32;
8261 return 0;
8262 }
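
/* For illustration: under the 64-bit MS ABI even a call such as
   f (1, 2) reserves the full 32 bytes of "home space" for the four
   register parameters, which is the value returned above; the SysV
   ABI reserves no such area.  */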
8263
8264 /* We add this as a workaround so that the libc_has_function
8265 hook can be used in i386.md.  */
8266 bool
8267 ix86_libc_has_function (enum function_class fn_class)
8268 {
8269 return targetm.libc_has_function (fn_class);
8270 }
8271
8272 /* Return SYSV_ABI or MS_ABI depending on FNTYPE,
8273 specifying the call ABI used.  */
8274 enum calling_abi
8275 ix86_function_type_abi (const_tree fntype)
8276 {
8277 enum calling_abi abi = ix86_abi;
8278
8279 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8280 return abi;
8281
8282 if (abi == SYSV_ABI
8283 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8284 {
8285 if (TARGET_X32)
8286 error ("X32 does not support ms_abi attribute");
8287
8288 abi = MS_ABI;
8289 }
8290 else if (abi == MS_ABI
8291 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8292 abi = SYSV_ABI;
8293
8294 return abi;
8295 }
8296
8297 static enum calling_abi
8298 ix86_function_abi (const_tree fndecl)
8299 {
8300 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8301 }
8302
8303 /* Return SYSV_ABI or MS_ABI depending on cfun,
8304 specifying the call ABI used.  */
8305 enum calling_abi
8306 ix86_cfun_abi (void)
8307 {
8308 return cfun ? cfun->machine->call_abi : ix86_abi;
8309 }
8310
8311 static bool
8312 ix86_function_ms_hook_prologue (const_tree fn)
8313 {
8314 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8315 {
8316 if (decl_function_context (fn) != NULL_TREE)
8317 error_at (DECL_SOURCE_LOCATION (fn),
8318 "ms_hook_prologue is not compatible with nested function");
8319 else
8320 return true;
8321 }
8322 return false;
8323 }
8324
8325 /* Write the extra assembler code needed to declare a function properly. */
8326
8327 void
8328 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8329 tree decl)
8330 {
8331 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8332
8333 if (is_ms_hook)
8334 {
8335 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8336 unsigned int filler_cc = 0xcccccccc;
8337
8338 for (i = 0; i < filler_count; i += 4)
8339 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8340 }
8341
8342 #ifdef SUBTARGET_ASM_UNWIND_INIT
8343 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8344 #endif
8345
8346 ASM_OUTPUT_LABEL (asm_out_file, fname);
8347
8348 /* Output magic byte marker, if hot-patch attribute is set. */
8349 if (is_ms_hook)
8350 {
8351 if (TARGET_64BIT)
8352 {
8353 /* leaq [%rsp + 0], %rsp */
8354 asm_fprintf (asm_out_file, ASM_BYTE
8355 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8356 }
8357 else
8358 {
8359 /* movl.s %edi, %edi
8360 push %ebp
8361 movl.s %esp, %ebp */
8362 asm_fprintf (asm_out_file, ASM_BYTE
8363 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8364 }
8365 }
8366 }
8367
8368 /* regclass.c */
8369 extern void init_regs (void);
8370
8371 /* Implementation of the call ABI switching target hook.  The call
8372 register sets specific to FNDECL are selected.  See also
8373 ix86_conditional_register_usage for more details.  */
8374 void
8375 ix86_call_abi_override (const_tree fndecl)
8376 {
8377 cfun->machine->call_abi = ix86_function_abi (fndecl);
8378 }
8379
8380 /* Return true if a pseudo register should be created and used to hold
8381 the GOT address for PIC code.  */
8382 bool
8383 ix86_use_pseudo_pic_reg (void)
8384 {
8385 if ((TARGET_64BIT
8386 && (ix86_cmodel == CM_SMALL_PIC
8387 || TARGET_PECOFF))
8388 || !flag_pic)
8389 return false;
8390 return true;
8391 }
8392
8393 /* Initialize large model PIC register. */
8394
8395 static void
8396 ix86_init_large_pic_reg (unsigned int tmp_regno)
8397 {
8398 rtx_code_label *label;
8399 rtx tmp_reg;
8400
8401 gcc_assert (Pmode == DImode);
8402 label = gen_label_rtx ();
8403 emit_label (label);
8404 LABEL_PRESERVE_P (label) = 1;
8405 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8406 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8407 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8408 label));
8409 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8410 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8411 pic_offset_table_rtx, tmp_reg));
8412 }
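
/* The sequence emitted above corresponds roughly to (a sketch; the PIC
   register is normally still a pseudo at this point):

     .L1:
       leaq    .L1(%rip), %pic                       # set_rip_rex64
       movabsq $_GLOBAL_OFFSET_TABLE_-.L1, %r11      # set_got_offset_rex64
       addq    %r11, %pic

   leaving the GOT address in the PIC register.  */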
8413
8414 /* Create and initialize PIC register if required. */
8415 static void
8416 ix86_init_pic_reg (void)
8417 {
8418 edge entry_edge;
8419 rtx_insn *seq;
8420
8421 if (!ix86_use_pseudo_pic_reg ())
8422 return;
8423
8424 start_sequence ();
8425
8426 if (TARGET_64BIT)
8427 {
8428 if (ix86_cmodel == CM_LARGE_PIC)
8429 ix86_init_large_pic_reg (R11_REG);
8430 else
8431 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8432 }
8433 else
8434 {
8435 /* If there is a future mcount call in the function, it is more profitable
8436 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8437 rtx reg = crtl->profile
8438 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8439 : pic_offset_table_rtx;
8440 rtx_insn *insn = emit_insn (gen_set_got (reg));
8441 RTX_FRAME_RELATED_P (insn) = 1;
8442 if (crtl->profile)
8443 emit_move_insn (pic_offset_table_rtx, reg);
8444 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8445 }
8446
8447 seq = get_insns ();
8448 end_sequence ();
8449
8450 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8451 insert_insn_on_edge (seq, entry_edge);
8452 commit_one_edge_insertion (entry_edge);
8453 }
8454
8455 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8456 for a call to a function whose data type is FNTYPE.
8457 For a library call, FNTYPE is 0. */
8458
8459 void
8460 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8461 tree fntype, /* tree ptr for function decl */
8462 rtx libname, /* SYMBOL_REF of library name or 0 */
8463 tree fndecl,
8464 int caller)
8465 {
8466 struct cgraph_local_info *i = NULL;
8467 struct cgraph_node *target = NULL;
8468
8469 memset (cum, 0, sizeof (*cum));
8470
8471 if (fndecl)
8472 {
8473 target = cgraph_node::get (fndecl);
8474 if (target)
8475 {
8476 target = target->function_symbol ();
8477 i = cgraph_node::local_info (target->decl);
8478 cum->call_abi = ix86_function_abi (target->decl);
8479 }
8480 else
8481 cum->call_abi = ix86_function_abi (fndecl);
8482 }
8483 else
8484 cum->call_abi = ix86_function_type_abi (fntype);
8485
8486 cum->caller = caller;
8487
8488 /* Set up the number of registers to use for passing arguments. */
8489 cum->nregs = ix86_regparm;
8490 if (TARGET_64BIT)
8491 {
8492 cum->nregs = (cum->call_abi == SYSV_ABI
8493 ? X86_64_REGPARM_MAX
8494 : X86_64_MS_REGPARM_MAX);
8495 }
8496 if (TARGET_SSE)
8497 {
8498 cum->sse_nregs = SSE_REGPARM_MAX;
8499 if (TARGET_64BIT)
8500 {
8501 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8502 ? X86_64_SSE_REGPARM_MAX
8503 : X86_64_MS_SSE_REGPARM_MAX);
8504 }
8505 }
8506 if (TARGET_MMX)
8507 cum->mmx_nregs = MMX_REGPARM_MAX;
8508 cum->warn_avx512f = true;
8509 cum->warn_avx = true;
8510 cum->warn_sse = true;
8511 cum->warn_mmx = true;
8512
8513 /* Because types might mismatch between the caller and the callee, we need
8514 to use the actual type of the function for local calls.
8515 FIXME: cgraph_analyze can be told to actually record whether a function
8516 uses va_start, so for local functions maybe_vaarg could be made more
8517 aggressive, helping K&R code.
8518 FIXME: once the type system is fixed, we won't need this code anymore. */
8519 if (i && i->local && i->can_change_signature)
8520 fntype = TREE_TYPE (target->decl);
8521 cum->stdarg = stdarg_p (fntype);
8522 cum->maybe_vaarg = (fntype
8523 ? (!prototype_p (fntype) || stdarg_p (fntype))
8524 : !libname);
8525
8526 cum->bnd_regno = FIRST_BND_REG;
8527 cum->bnds_in_bt = 0;
8528 cum->force_bnd_pass = 0;
8529 cum->decl = fndecl;
8530
8531 if (!TARGET_64BIT)
8532 {
8533 /* If there are variable arguments, then we won't pass anything
8534 in registers in 32-bit mode. */
8535 if (stdarg_p (fntype))
8536 {
8537 cum->nregs = 0;
8538 /* Since in 32-bit mode variable arguments are always passed on
8539 the stack, there is a scratch register available for an indirect
8540 sibcall.  */
8541 cfun->machine->arg_reg_available = true;
8542 cum->sse_nregs = 0;
8543 cum->mmx_nregs = 0;
8544 cum->warn_avx512f = false;
8545 cum->warn_avx = false;
8546 cum->warn_sse = false;
8547 cum->warn_mmx = false;
8548 return;
8549 }
8550
8551 /* Use ecx and edx registers if function has fastcall attribute,
8552 else look for regparm information. */
8553 if (fntype)
8554 {
8555 unsigned int ccvt = ix86_get_callcvt (fntype);
8556 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8557 {
8558 cum->nregs = 1;
8559 cum->fastcall = 1; /* Same first register as in fastcall. */
8560 }
8561 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8562 {
8563 cum->nregs = 2;
8564 cum->fastcall = 1;
8565 }
8566 else
8567 cum->nregs = ix86_function_regparm (fntype, fndecl);
8568 }
8569
8570 /* Set up the number of SSE registers used for passing SFmode
8571 and DFmode arguments. Warn for mismatching ABI. */
8572 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8573 }
8574
8575 cfun->machine->arg_reg_available = (cum->nregs > 0);
8576 }
8577
8578 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8579 But in the case of vector types, it is some vector mode.
8580
8581 When we have only some of our vector isa extensions enabled, then there
8582 are some modes for which vector_mode_supported_p is false. For these
8583 modes, the generic vector support in gcc will choose some non-vector mode
8584 in order to implement the type. By computing the natural mode, we'll
8585 select the proper ABI location for the operand and not depend on whatever
8586 the middle-end decides to do with these vector types.
8587
8588 The middle-end can't deal with vector types > 16 bytes.  In this
8589 case, we return the original mode and warn about the ABI change if
8590 CUM isn't NULL.
8591 
8592 If IN_RETURN is true, warn about the ABI change if the vector mode
8593 isn't available for the function return value.  */
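
/* Example (illustrative): for

     typedef int v4si __attribute__ ((vector_size (16)));

   compiled with -mno-sse on x86-64, TYPE_MODE is not a vector mode, yet
   this function still returns V4SImode (after a possible -Wpsabi
   warning), so the ABI location of a v4si argument does not depend on
   which ISA extensions happen to be enabled.  */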
8594
8595 static machine_mode
8596 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8597 bool in_return)
8598 {
8599 machine_mode mode = TYPE_MODE (type);
8600
8601 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8602 {
8603 HOST_WIDE_INT size = int_size_in_bytes (type);
8604 if ((size == 8 || size == 16 || size == 32 || size == 64)
8605 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8606 && TYPE_VECTOR_SUBPARTS (type) > 1)
8607 {
8608 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8609
8610 /* There are no XFmode vector modes. */
8611 if (innermode == XFmode)
8612 return mode;
8613
8614 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8615 mode = MIN_MODE_VECTOR_FLOAT;
8616 else
8617 mode = MIN_MODE_VECTOR_INT;
8618
8619 /* Get the mode which has this inner mode and number of units. */
8620 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8621 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8622 && GET_MODE_INNER (mode) == innermode)
8623 {
8624 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8625 {
8626 static bool warnedavx512f;
8627 static bool warnedavx512f_ret;
8628
8629 if (cum && cum->warn_avx512f && !warnedavx512f)
8630 {
8631 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8632 "without AVX512F enabled changes the ABI"))
8633 warnedavx512f = true;
8634 }
8635 else if (in_return && !warnedavx512f_ret)
8636 {
8637 if (warning (OPT_Wpsabi, "AVX512F vector return "
8638 "without AVX512F enabled changes the ABI"))
8639 warnedavx512f_ret = true;
8640 }
8641
8642 return TYPE_MODE (type);
8643 }
8644 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8645 {
8646 static bool warnedavx;
8647 static bool warnedavx_ret;
8648
8649 if (cum && cum->warn_avx && !warnedavx)
8650 {
8651 if (warning (OPT_Wpsabi, "AVX vector argument "
8652 "without AVX enabled changes the ABI"))
8653 warnedavx = true;
8654 }
8655 else if (in_return && !warnedavx_ret)
8656 {
8657 if (warning (OPT_Wpsabi, "AVX vector return "
8658 "without AVX enabled changes the ABI"))
8659 warnedavx_ret = true;
8660 }
8661
8662 return TYPE_MODE (type);
8663 }
8664 else if (((size == 8 && TARGET_64BIT) || size == 16)
8665 && !TARGET_SSE
8666 && !TARGET_IAMCU)
8667 {
8668 static bool warnedsse;
8669 static bool warnedsse_ret;
8670
8671 if (cum && cum->warn_sse && !warnedsse)
8672 {
8673 if (warning (OPT_Wpsabi, "SSE vector argument "
8674 "without SSE enabled changes the ABI"))
8675 warnedsse = true;
8676 }
8677 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8678 {
8679 if (warning (OPT_Wpsabi, "SSE vector return "
8680 "without SSE enabled changes the ABI"))
8681 warnedsse_ret = true;
8682 }
8683 }
8684 else if ((size == 8 && !TARGET_64BIT)
8685 && (!cfun
8686 || cfun->machine->func_type == TYPE_NORMAL)
8687 && !TARGET_MMX
8688 && !TARGET_IAMCU)
8689 {
8690 static bool warnedmmx;
8691 static bool warnedmmx_ret;
8692
8693 if (cum && cum->warn_mmx && !warnedmmx)
8694 {
8695 if (warning (OPT_Wpsabi, "MMX vector argument "
8696 "without MMX enabled changes the ABI"))
8697 warnedmmx = true;
8698 }
8699 else if (in_return && !warnedmmx_ret)
8700 {
8701 if (warning (OPT_Wpsabi, "MMX vector return "
8702 "without MMX enabled changes the ABI"))
8703 warnedmmx_ret = true;
8704 }
8705 }
8706 return mode;
8707 }
8708
8709 gcc_unreachable ();
8710 }
8711 }
8712
8713 return mode;
8714 }
8715
8716 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8717 this may not agree with the mode that the type system has chosen for the
8718 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8719 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8720
8721 static rtx
8722 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8723 unsigned int regno)
8724 {
8725 rtx tmp;
8726
8727 if (orig_mode != BLKmode)
8728 tmp = gen_rtx_REG (orig_mode, regno);
8729 else
8730 {
8731 tmp = gen_rtx_REG (mode, regno);
8732 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8733 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8734 }
8735
8736 return tmp;
8737 }
8738
8739 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
8740 The goal of this code is to classify each 8-byte chunk of the incoming
8741 argument by register class and assign registers accordingly.  */
8742
8743 /* Return the union class of CLASS1 and CLASS2.
8744 See the x86-64 PS ABI for details. */
8745
8746 static enum x86_64_reg_class
8747 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8748 {
8749 /* Rule #1: If both classes are equal, this is the resulting class. */
8750 if (class1 == class2)
8751 return class1;
8752
8753 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8754 the other class. */
8755 if (class1 == X86_64_NO_CLASS)
8756 return class2;
8757 if (class2 == X86_64_NO_CLASS)
8758 return class1;
8759
8760 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8761 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8762 return X86_64_MEMORY_CLASS;
8763
8764 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8765 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8766 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8767 return X86_64_INTEGERSI_CLASS;
8768 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8769 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8770 return X86_64_INTEGER_CLASS;
8771
8772 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8773 MEMORY is used. */
8774 if (class1 == X86_64_X87_CLASS
8775 || class1 == X86_64_X87UP_CLASS
8776 || class1 == X86_64_COMPLEX_X87_CLASS
8777 || class2 == X86_64_X87_CLASS
8778 || class2 == X86_64_X87UP_CLASS
8779 || class2 == X86_64_COMPLEX_X87_CLASS)
8780 return X86_64_MEMORY_CLASS;
8781
8782 /* Rule #6: Otherwise class SSE is used. */
8783 return X86_64_SSE_CLASS;
8784 }
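
/* A worked example (illustrative): for struct { float x; int y; } the
   single eightbyte merges X86_64_SSESF_CLASS with an integer class and,
   by rule #4, becomes X86_64_INTEGER_CLASS, so the struct travels in one
   integer register; for struct { float x; float y; } the merge of SSESF
   with SSE falls through to rule #6 and the struct travels in one SSE
   register.  */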
8785
8786 /* Classify the argument of type TYPE and mode MODE.
8787 CLASSES will be filled by the register class used to pass each word
8788 of the operand. The number of words is returned. In case the parameter
8789 should be passed in memory, 0 is returned. As a special case for zero
8790 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8791
8792 BIT_OFFSET is used internally for handling records and specifies the
8793 offset in bits modulo 512 to avoid overflow cases.
8794
8795 See the x86-64 PS ABI for details.
8796 */
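
/* A small illustration (not exhaustive): for

     struct s { double d; long l; };

   classes[0] becomes X86_64_SSEDF_CLASS, classes[1] becomes
   X86_64_INTEGER_CLASS and 2 is returned, so the struct is passed half
   in an SSE register and half in an integer register.  A plain 32-byte
   struct of four doubles fails the words > 2 check below and 0 is
   returned (passed in memory), while a struct holding a single __m256d
   classifies as SSE followed by SSEUPs and can stay in one YMM
   register.  */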
8797
8798 static int
8799 classify_argument (machine_mode mode, const_tree type,
8800 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8801 {
8802 HOST_WIDE_INT bytes =
8803 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8804 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8805
8806 /* Variable sized entities are always passed/returned in memory. */
8807 if (bytes < 0)
8808 return 0;
8809
8810 if (mode != VOIDmode
8811 && targetm.calls.must_pass_in_stack (mode, type))
8812 return 0;
8813
8814 if (type && AGGREGATE_TYPE_P (type))
8815 {
8816 int i;
8817 tree field;
8818 enum x86_64_reg_class subclasses[MAX_CLASSES];
8819
8820 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8821 if (bytes > 64)
8822 return 0;
8823
8824 for (i = 0; i < words; i++)
8825 classes[i] = X86_64_NO_CLASS;
8826
8827 /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
8828 signal the memory class, so handle it as a special case.  */
8829 if (!words)
8830 {
8831 classes[0] = X86_64_NO_CLASS;
8832 return 1;
8833 }
8834
8835 /* Classify each field of record and merge classes. */
8836 switch (TREE_CODE (type))
8837 {
8838 case RECORD_TYPE:
8839 /* And now merge the fields of structure. */
8840 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8841 {
8842 if (TREE_CODE (field) == FIELD_DECL)
8843 {
8844 int num;
8845
8846 if (TREE_TYPE (field) == error_mark_node)
8847 continue;
8848
8849 /* Bitfields are always classified as integer. Handle them
8850 early, since later code would consider them to be
8851 misaligned integers. */
8852 if (DECL_BIT_FIELD (field))
8853 {
8854 for (i = (int_bit_position (field)
8855 + (bit_offset % 64)) / 8 / 8;
8856 i < ((int_bit_position (field) + (bit_offset % 64))
8857 + tree_to_shwi (DECL_SIZE (field))
8858 + 63) / 8 / 8; i++)
8859 classes[i] =
8860 merge_classes (X86_64_INTEGER_CLASS,
8861 classes[i]);
8862 }
8863 else
8864 {
8865 int pos;
8866
8867 type = TREE_TYPE (field);
8868
8869 /* Flexible array member is ignored. */
8870 if (TYPE_MODE (type) == BLKmode
8871 && TREE_CODE (type) == ARRAY_TYPE
8872 && TYPE_SIZE (type) == NULL_TREE
8873 && TYPE_DOMAIN (type) != NULL_TREE
8874 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8875 == NULL_TREE))
8876 {
8877 static bool warned;
8878
8879 if (!warned && warn_psabi)
8880 {
8881 warned = true;
8882 inform (input_location,
8883 "the ABI of passing struct with"
8884 " a flexible array member has"
8885 " changed in GCC 4.4");
8886 }
8887 continue;
8888 }
8889 num = classify_argument (TYPE_MODE (type), type,
8890 subclasses,
8891 (int_bit_position (field)
8892 + bit_offset) % 512);
8893 if (!num)
8894 return 0;
8895 pos = (int_bit_position (field)
8896 + (bit_offset % 64)) / 8 / 8;
8897 for (i = 0; i < num && (i + pos) < words; i++)
8898 classes[i + pos] =
8899 merge_classes (subclasses[i], classes[i + pos]);
8900 }
8901 }
8902 }
8903 break;
8904
8905 case ARRAY_TYPE:
8906 /* Arrays are handled as small records. */
8907 {
8908 int num;
8909 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8910 TREE_TYPE (type), subclasses, bit_offset);
8911 if (!num)
8912 return 0;
8913
8914 /* The partial classes are now full classes. */
8915 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8916 subclasses[0] = X86_64_SSE_CLASS;
8917 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8918 && !((bit_offset % 64) == 0 && bytes == 4))
8919 subclasses[0] = X86_64_INTEGER_CLASS;
8920
8921 for (i = 0; i < words; i++)
8922 classes[i] = subclasses[i % num];
8923
8924 break;
8925 }
8926 case UNION_TYPE:
8927 case QUAL_UNION_TYPE:
8928 /* Unions are similar to RECORD_TYPE but the offset is always
8929 0.  */
8930 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8931 {
8932 if (TREE_CODE (field) == FIELD_DECL)
8933 {
8934 int num;
8935
8936 if (TREE_TYPE (field) == error_mark_node)
8937 continue;
8938
8939 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8940 TREE_TYPE (field), subclasses,
8941 bit_offset);
8942 if (!num)
8943 return 0;
8944 for (i = 0; i < num && i < words; i++)
8945 classes[i] = merge_classes (subclasses[i], classes[i]);
8946 }
8947 }
8948 break;
8949
8950 default:
8951 gcc_unreachable ();
8952 }
8953
8954 if (words > 2)
8955 {
8956 /* When size > 16 bytes, if the first one isn't
8957 X86_64_SSE_CLASS or any of the others isn't
8958 X86_64_SSEUP_CLASS, everything should be passed in
8959 memory. */
8960 if (classes[0] != X86_64_SSE_CLASS)
8961 return 0;
8962
8963 for (i = 1; i < words; i++)
8964 if (classes[i] != X86_64_SSEUP_CLASS)
8965 return 0;
8966 }
8967
8968 /* Final merger cleanup. */
8969 for (i = 0; i < words; i++)
8970 {
8971 /* If one class is MEMORY, everything should be passed in
8972 memory. */
8973 if (classes[i] == X86_64_MEMORY_CLASS)
8974 return 0;
8975
8976 /* The X86_64_SSEUP_CLASS should always be preceded by
8977 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
8978 if (classes[i] == X86_64_SSEUP_CLASS
8979 && classes[i - 1] != X86_64_SSE_CLASS
8980 && classes[i - 1] != X86_64_SSEUP_CLASS)
8981 {
8982 /* The first one should never be X86_64_SSEUP_CLASS. */
8983 gcc_assert (i != 0);
8984 classes[i] = X86_64_SSE_CLASS;
8985 }
8986
8987 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
8988 everything should be passed in memory. */
8989 if (classes[i] == X86_64_X87UP_CLASS
8990 && (classes[i - 1] != X86_64_X87_CLASS))
8991 {
8992 static bool warned;
8993
8994 /* The first one should never be X86_64_X87UP_CLASS. */
8995 gcc_assert (i != 0);
8996 if (!warned && warn_psabi)
8997 {
8998 warned = true;
8999 inform (input_location,
9000 "the ABI of passing union with long double"
9001 " has changed in GCC 4.4");
9002 }
9003 return 0;
9004 }
9005 }
9006 return words;
9007 }
9008
9009 /* Compute the alignment needed.  We align all types to natural boundaries
9010 with the exception of XFmode, which is aligned to 64 bits.  */
9011 if (mode != VOIDmode && mode != BLKmode)
9012 {
9013 int mode_alignment = GET_MODE_BITSIZE (mode);
9014
9015 if (mode == XFmode)
9016 mode_alignment = 128;
9017 else if (mode == XCmode)
9018 mode_alignment = 256;
9019 if (COMPLEX_MODE_P (mode))
9020 mode_alignment /= 2;
9021 /* Misaligned fields are always returned in memory. */
9022 if (bit_offset % mode_alignment)
9023 return 0;
9024 }
9025
9026 /* For V1xx modes, just use the base mode.  */
9027 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9028 && GET_MODE_UNIT_SIZE (mode) == bytes)
9029 mode = GET_MODE_INNER (mode);
9030
9031 /* Classification of atomic types. */
9032 switch (mode)
9033 {
9034 case SDmode:
9035 case DDmode:
9036 classes[0] = X86_64_SSE_CLASS;
9037 return 1;
9038 case TDmode:
9039 classes[0] = X86_64_SSE_CLASS;
9040 classes[1] = X86_64_SSEUP_CLASS;
9041 return 2;
9042 case DImode:
9043 case SImode:
9044 case HImode:
9045 case QImode:
9046 case CSImode:
9047 case CHImode:
9048 case CQImode:
9049 {
9050 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9051
9052 /* Analyze last 128 bits only. */
9053 size = (size - 1) & 0x7f;
9054
9055 if (size < 32)
9056 {
9057 classes[0] = X86_64_INTEGERSI_CLASS;
9058 return 1;
9059 }
9060 else if (size < 64)
9061 {
9062 classes[0] = X86_64_INTEGER_CLASS;
9063 return 1;
9064 }
9065 else if (size < 64+32)
9066 {
9067 classes[0] = X86_64_INTEGER_CLASS;
9068 classes[1] = X86_64_INTEGERSI_CLASS;
9069 return 2;
9070 }
9071 else if (size < 64+64)
9072 {
9073 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9074 return 2;
9075 }
9076 else
9077 gcc_unreachable ();
9078 }
9079 case CDImode:
9080 case TImode:
9081 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9082 return 2;
9083 case COImode:
9084 case OImode:
9085 /* OImode shouldn't be used directly. */
9086 gcc_unreachable ();
9087 case CTImode:
9088 return 0;
9089 case SFmode:
9090 if (!(bit_offset % 64))
9091 classes[0] = X86_64_SSESF_CLASS;
9092 else
9093 classes[0] = X86_64_SSE_CLASS;
9094 return 1;
9095 case DFmode:
9096 classes[0] = X86_64_SSEDF_CLASS;
9097 return 1;
9098 case XFmode:
9099 classes[0] = X86_64_X87_CLASS;
9100 classes[1] = X86_64_X87UP_CLASS;
9101 return 2;
9102 case TFmode:
9103 classes[0] = X86_64_SSE_CLASS;
9104 classes[1] = X86_64_SSEUP_CLASS;
9105 return 2;
9106 case SCmode:
9107 classes[0] = X86_64_SSE_CLASS;
9108 if (!(bit_offset % 64))
9109 return 1;
9110 else
9111 {
9112 static bool warned;
9113
9114 if (!warned && warn_psabi)
9115 {
9116 warned = true;
9117 inform (input_location,
9118 "the ABI of passing structure with complex float"
9119 " member has changed in GCC 4.4");
9120 }
9121 classes[1] = X86_64_SSESF_CLASS;
9122 return 2;
9123 }
9124 case DCmode:
9125 classes[0] = X86_64_SSEDF_CLASS;
9126 classes[1] = X86_64_SSEDF_CLASS;
9127 return 2;
9128 case XCmode:
9129 classes[0] = X86_64_COMPLEX_X87_CLASS;
9130 return 1;
9131 case TCmode:
9132 /* This mode is larger than 16 bytes.  */
9133 return 0;
9134 case V8SFmode:
9135 case V8SImode:
9136 case V32QImode:
9137 case V16HImode:
9138 case V4DFmode:
9139 case V4DImode:
9140 classes[0] = X86_64_SSE_CLASS;
9141 classes[1] = X86_64_SSEUP_CLASS;
9142 classes[2] = X86_64_SSEUP_CLASS;
9143 classes[3] = X86_64_SSEUP_CLASS;
9144 return 4;
9145 case V8DFmode:
9146 case V16SFmode:
9147 case V8DImode:
9148 case V16SImode:
9149 case V32HImode:
9150 case V64QImode:
9151 classes[0] = X86_64_SSE_CLASS;
9152 classes[1] = X86_64_SSEUP_CLASS;
9153 classes[2] = X86_64_SSEUP_CLASS;
9154 classes[3] = X86_64_SSEUP_CLASS;
9155 classes[4] = X86_64_SSEUP_CLASS;
9156 classes[5] = X86_64_SSEUP_CLASS;
9157 classes[6] = X86_64_SSEUP_CLASS;
9158 classes[7] = X86_64_SSEUP_CLASS;
9159 return 8;
9160 case V4SFmode:
9161 case V4SImode:
9162 case V16QImode:
9163 case V8HImode:
9164 case V2DFmode:
9165 case V2DImode:
9166 classes[0] = X86_64_SSE_CLASS;
9167 classes[1] = X86_64_SSEUP_CLASS;
9168 return 2;
9169 case V1TImode:
9170 case V1DImode:
9171 case V2SFmode:
9172 case V2SImode:
9173 case V4HImode:
9174 case V8QImode:
9175 classes[0] = X86_64_SSE_CLASS;
9176 return 1;
9177 case BLKmode:
9178 case VOIDmode:
9179 return 0;
9180 default:
9181 gcc_assert (VECTOR_MODE_P (mode));
9182
9183 if (bytes > 16)
9184 return 0;
9185
9186 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9187
9188 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9189 classes[0] = X86_64_INTEGERSI_CLASS;
9190 else
9191 classes[0] = X86_64_INTEGER_CLASS;
9192 classes[1] = X86_64_INTEGER_CLASS;
9193 return 1 + (bytes > 8);
9194 }
9195 }
9196
9197 /* Examine the argument and set the number of registers required in each
9198 class.  Return true iff the parameter should be passed in memory.  */
9199
9200 static bool
9201 examine_argument (machine_mode mode, const_tree type, int in_return,
9202 int *int_nregs, int *sse_nregs)
9203 {
9204 enum x86_64_reg_class regclass[MAX_CLASSES];
9205 int n = classify_argument (mode, type, regclass, 0);
9206
9207 *int_nregs = 0;
9208 *sse_nregs = 0;
9209
9210 if (!n)
9211 return true;
9212 for (n--; n >= 0; n--)
9213 switch (regclass[n])
9214 {
9215 case X86_64_INTEGER_CLASS:
9216 case X86_64_INTEGERSI_CLASS:
9217 (*int_nregs)++;
9218 break;
9219 case X86_64_SSE_CLASS:
9220 case X86_64_SSESF_CLASS:
9221 case X86_64_SSEDF_CLASS:
9222 (*sse_nregs)++;
9223 break;
9224 case X86_64_NO_CLASS:
9225 case X86_64_SSEUP_CLASS:
9226 break;
9227 case X86_64_X87_CLASS:
9228 case X86_64_X87UP_CLASS:
9229 case X86_64_COMPLEX_X87_CLASS:
9230 if (!in_return)
9231 return true;
9232 break;
9233 case X86_64_MEMORY_CLASS:
9234 gcc_unreachable ();
9235 }
9236
9237 return false;
9238 }
9239
9240 /* Construct container for the argument used by GCC interface. See
9241 FUNCTION_ARG for the detailed description. */
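
/* For instance (an illustration; register choice assumed for the first
   argument of a SysV x86-64 call): struct s { double d; long l; } yields

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in %xmm0 and the second in %rdi.  */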
9242
9243 static rtx
9244 construct_container (machine_mode mode, machine_mode orig_mode,
9245 const_tree type, int in_return, int nintregs, int nsseregs,
9246 const int *intreg, int sse_regno)
9247 {
9248 /* The following variables hold the static issued_error state. */
9249 static bool issued_sse_arg_error;
9250 static bool issued_sse_ret_error;
9251 static bool issued_x87_ret_error;
9252
9253 machine_mode tmpmode;
9254 int bytes =
9255 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9256 enum x86_64_reg_class regclass[MAX_CLASSES];
9257 int n;
9258 int i;
9259 int nexps = 0;
9260 int needed_sseregs, needed_intregs;
9261 rtx exp[MAX_CLASSES];
9262 rtx ret;
9263
9264 n = classify_argument (mode, type, regclass, 0);
9265 if (!n)
9266 return NULL;
9267 if (examine_argument (mode, type, in_return, &needed_intregs,
9268 &needed_sseregs))
9269 return NULL;
9270 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9271 return NULL;
9272
9273 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9274 some less clueful developer tries to use floating-point anyway. */
9275 if (needed_sseregs && !TARGET_SSE)
9276 {
9277 if (in_return)
9278 {
9279 if (!issued_sse_ret_error)
9280 {
9281 error ("SSE register return with SSE disabled");
9282 issued_sse_ret_error = true;
9283 }
9284 }
9285 else if (!issued_sse_arg_error)
9286 {
9287 error ("SSE register argument with SSE disabled");
9288 issued_sse_arg_error = true;
9289 }
9290 return NULL;
9291 }
9292
9293 /* Likewise, error if the ABI requires us to return values in the
9294 x87 registers and the user specified -mno-80387. */
9295 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9296 for (i = 0; i < n; i++)
9297 if (regclass[i] == X86_64_X87_CLASS
9298 || regclass[i] == X86_64_X87UP_CLASS
9299 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9300 {
9301 if (!issued_x87_ret_error)
9302 {
9303 error ("x87 register return with x87 disabled");
9304 issued_x87_ret_error = true;
9305 }
9306 return NULL;
9307 }
9308
9309 /* First construct the simple cases.  Avoid SCmode, since we want to use
9310 a single register to pass this type.  */
9311 if (n == 1 && mode != SCmode)
9312 switch (regclass[0])
9313 {
9314 case X86_64_INTEGER_CLASS:
9315 case X86_64_INTEGERSI_CLASS:
9316 return gen_rtx_REG (mode, intreg[0]);
9317 case X86_64_SSE_CLASS:
9318 case X86_64_SSESF_CLASS:
9319 case X86_64_SSEDF_CLASS:
9320 if (mode != BLKmode)
9321 return gen_reg_or_parallel (mode, orig_mode,
9322 SSE_REGNO (sse_regno));
9323 break;
9324 case X86_64_X87_CLASS:
9325 case X86_64_COMPLEX_X87_CLASS:
9326 return gen_rtx_REG (mode, FIRST_STACK_REG);
9327 case X86_64_NO_CLASS:
9328 /* Zero sized array, struct or class. */
9329 return NULL;
9330 default:
9331 gcc_unreachable ();
9332 }
9333 if (n == 2
9334 && regclass[0] == X86_64_SSE_CLASS
9335 && regclass[1] == X86_64_SSEUP_CLASS
9336 && mode != BLKmode)
9337 return gen_reg_or_parallel (mode, orig_mode,
9338 SSE_REGNO (sse_regno));
9339 if (n == 4
9340 && regclass[0] == X86_64_SSE_CLASS
9341 && regclass[1] == X86_64_SSEUP_CLASS
9342 && regclass[2] == X86_64_SSEUP_CLASS
9343 && regclass[3] == X86_64_SSEUP_CLASS
9344 && mode != BLKmode)
9345 return gen_reg_or_parallel (mode, orig_mode,
9346 SSE_REGNO (sse_regno));
9347 if (n == 8
9348 && regclass[0] == X86_64_SSE_CLASS
9349 && regclass[1] == X86_64_SSEUP_CLASS
9350 && regclass[2] == X86_64_SSEUP_CLASS
9351 && regclass[3] == X86_64_SSEUP_CLASS
9352 && regclass[4] == X86_64_SSEUP_CLASS
9353 && regclass[5] == X86_64_SSEUP_CLASS
9354 && regclass[6] == X86_64_SSEUP_CLASS
9355 && regclass[7] == X86_64_SSEUP_CLASS
9356 && mode != BLKmode)
9357 return gen_reg_or_parallel (mode, orig_mode,
9358 SSE_REGNO (sse_regno));
9359 if (n == 2
9360 && regclass[0] == X86_64_X87_CLASS
9361 && regclass[1] == X86_64_X87UP_CLASS)
9362 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9363
9364 if (n == 2
9365 && regclass[0] == X86_64_INTEGER_CLASS
9366 && regclass[1] == X86_64_INTEGER_CLASS
9367 && (mode == CDImode || mode == TImode)
9368 && intreg[0] + 1 == intreg[1])
9369 return gen_rtx_REG (mode, intreg[0]);
9370
9371 /* Otherwise figure out the entries of the PARALLEL. */
9372 for (i = 0; i < n; i++)
9373 {
9374 int pos;
9375
9376 switch (regclass[i])
9377 {
9378 case X86_64_NO_CLASS:
9379 break;
9380 case X86_64_INTEGER_CLASS:
9381 case X86_64_INTEGERSI_CLASS:
9382 /* Merge TImodes on aligned occasions here too. */
9383 if (i * 8 + 8 > bytes)
9384 tmpmode
9385 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9386 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9387 tmpmode = SImode;
9388 else
9389 tmpmode = DImode;
9390 /* We've requested 24 bytes for which
9391 we have no mode.  Use DImode.  */
9392 if (tmpmode == BLKmode)
9393 tmpmode = DImode;
9394 exp [nexps++]
9395 = gen_rtx_EXPR_LIST (VOIDmode,
9396 gen_rtx_REG (tmpmode, *intreg),
9397 GEN_INT (i*8));
9398 intreg++;
9399 break;
9400 case X86_64_SSESF_CLASS:
9401 exp [nexps++]
9402 = gen_rtx_EXPR_LIST (VOIDmode,
9403 gen_rtx_REG (SFmode,
9404 SSE_REGNO (sse_regno)),
9405 GEN_INT (i*8));
9406 sse_regno++;
9407 break;
9408 case X86_64_SSEDF_CLASS:
9409 exp [nexps++]
9410 = gen_rtx_EXPR_LIST (VOIDmode,
9411 gen_rtx_REG (DFmode,
9412 SSE_REGNO (sse_regno)),
9413 GEN_INT (i*8));
9414 sse_regno++;
9415 break;
9416 case X86_64_SSE_CLASS:
9417 pos = i;
9418 switch (n)
9419 {
9420 case 1:
9421 tmpmode = DImode;
9422 break;
9423 case 2:
9424 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9425 {
9426 tmpmode = TImode;
9427 i++;
9428 }
9429 else
9430 tmpmode = DImode;
9431 break;
9432 case 4:
9433 gcc_assert (i == 0
9434 && regclass[1] == X86_64_SSEUP_CLASS
9435 && regclass[2] == X86_64_SSEUP_CLASS
9436 && regclass[3] == X86_64_SSEUP_CLASS);
9437 tmpmode = OImode;
9438 i += 3;
9439 break;
9440 case 8:
9441 gcc_assert (i == 0
9442 && regclass[1] == X86_64_SSEUP_CLASS
9443 && regclass[2] == X86_64_SSEUP_CLASS
9444 && regclass[3] == X86_64_SSEUP_CLASS
9445 && regclass[4] == X86_64_SSEUP_CLASS
9446 && regclass[5] == X86_64_SSEUP_CLASS
9447 && regclass[6] == X86_64_SSEUP_CLASS
9448 && regclass[7] == X86_64_SSEUP_CLASS);
9449 tmpmode = XImode;
9450 i += 7;
9451 break;
9452 default:
9453 gcc_unreachable ();
9454 }
9455 exp [nexps++]
9456 = gen_rtx_EXPR_LIST (VOIDmode,
9457 gen_rtx_REG (tmpmode,
9458 SSE_REGNO (sse_regno)),
9459 GEN_INT (pos*8));
9460 sse_regno++;
9461 break;
9462 default:
9463 gcc_unreachable ();
9464 }
9465 }
9466
9467 /* Empty aligned struct, union or class. */
9468 if (nexps == 0)
9469 return NULL;
9470
9471 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9472 for (i = 0; i < nexps; i++)
9473 XVECEXP (ret, 0, i) = exp [i];
9474 return ret;
9475 }
9476
9477 /* Update the data in CUM to advance over an argument of mode MODE
9478 and data type TYPE. (TYPE is null for libcalls where that information
9479 may not be available.)
9480
9481 Return the number of integer registers advanced over.  */
9482
9483 static int
9484 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9485 const_tree type, HOST_WIDE_INT bytes,
9486 HOST_WIDE_INT words)
9487 {
9488 int res = 0;
9489 bool error_p = false;
9490
9491 if (TARGET_IAMCU)
9492 {
9493 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9494 bytes in registers. */
9495 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9496 goto pass_in_reg;
9497 return res;
9498 }
9499
9500 switch (mode)
9501 {
9502 default:
9503 break;
9504
9505 case BLKmode:
9506 if (bytes < 0)
9507 break;
9508 /* FALLTHRU */
9509
9510 case DImode:
9511 case SImode:
9512 case HImode:
9513 case QImode:
9514 pass_in_reg:
9515 cum->words += words;
9516 cum->nregs -= words;
9517 cum->regno += words;
9518 if (cum->nregs >= 0)
9519 res = words;
9520 if (cum->nregs <= 0)
9521 {
9522 cum->nregs = 0;
9523 cfun->machine->arg_reg_available = false;
9524 cum->regno = 0;
9525 }
9526 break;
9527
9528 case OImode:
9529 /* OImode shouldn't be used directly. */
9530 gcc_unreachable ();
9531
9532 case DFmode:
9533 if (cum->float_in_sse == -1)
9534 error_p = 1;
9535 if (cum->float_in_sse < 2)
9536 break;
9537 /* FALLTHRU */
9538 case SFmode:
9539 if (cum->float_in_sse == -1)
9540 error_p = 1;
9541 if (cum->float_in_sse < 1)
9542 break;
9543 /* FALLTHRU */
9544
9545 case V8SFmode:
9546 case V8SImode:
9547 case V64QImode:
9548 case V32HImode:
9549 case V16SImode:
9550 case V8DImode:
9551 case V16SFmode:
9552 case V8DFmode:
9553 case V32QImode:
9554 case V16HImode:
9555 case V4DFmode:
9556 case V4DImode:
9557 case TImode:
9558 case V16QImode:
9559 case V8HImode:
9560 case V4SImode:
9561 case V2DImode:
9562 case V4SFmode:
9563 case V2DFmode:
9564 if (!type || !AGGREGATE_TYPE_P (type))
9565 {
9566 cum->sse_words += words;
9567 cum->sse_nregs -= 1;
9568 cum->sse_regno += 1;
9569 if (cum->sse_nregs <= 0)
9570 {
9571 cum->sse_nregs = 0;
9572 cum->sse_regno = 0;
9573 }
9574 }
9575 break;
9576
9577 case V8QImode:
9578 case V4HImode:
9579 case V2SImode:
9580 case V2SFmode:
9581 case V1TImode:
9582 case V1DImode:
9583 if (!type || !AGGREGATE_TYPE_P (type))
9584 {
9585 cum->mmx_words += words;
9586 cum->mmx_nregs -= 1;
9587 cum->mmx_regno += 1;
9588 if (cum->mmx_nregs <= 0)
9589 {
9590 cum->mmx_nregs = 0;
9591 cum->mmx_regno = 0;
9592 }
9593 }
9594 break;
9595 }
9596 if (error_p)
9597 {
9598 cum->float_in_sse = 0;
9599 error ("calling %qD with SSE calling convention without "
9600 "SSE/SSE2 enabled", cum->decl);
9601 sorry ("this is a GCC bug that can be worked around by adding "
9602 "attribute used to function called");
9603 }
9604
9605 return res;
9606 }
9607
9608 static int
9609 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9610 const_tree type, HOST_WIDE_INT words, bool named)
9611 {
9612 int int_nregs, sse_nregs;
9613
9614 /* Unnamed 512- and 256-bit vector mode parameters are passed on the stack. */
9615 if (!named && (VALID_AVX512F_REG_MODE (mode)
9616 || VALID_AVX256_REG_MODE (mode)))
9617 return 0;
9618
9619 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9620 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9621 {
9622 cum->nregs -= int_nregs;
9623 cum->sse_nregs -= sse_nregs;
9624 cum->regno += int_nregs;
9625 cum->sse_regno += sse_nregs;
9626 return int_nregs;
9627 }
9628 else
9629 {
9630 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9631 cum->words = ROUND_UP (cum->words, align);
9632 cum->words += words;
9633 return 0;
9634 }
9635 }
9636
9637 static int
9638 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9639 HOST_WIDE_INT words)
9640 {
9641 /* Otherwise, this should be passed indirectly.  */
9642 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9643
9644 cum->words += words;
9645 if (cum->nregs > 0)
9646 {
9647 cum->nregs -= 1;
9648 cum->regno += 1;
9649 return 1;
9650 }
9651 return 0;
9652 }
9653
9654 /* Update the data in CUM to advance over an argument of mode MODE and
9655 data type TYPE. (TYPE is null for libcalls where that information
9656 may not be available.) */
9657
9658 static void
9659 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9660 const_tree type, bool named)
9661 {
9662 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9663 HOST_WIDE_INT bytes, words;
9664 int nregs;
9665
9666 /* The argument of interrupt handler is a special case and is
9667 handled in ix86_function_arg. */
9668 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9669 return;
9670
9671 if (mode == BLKmode)
9672 bytes = int_size_in_bytes (type);
9673 else
9674 bytes = GET_MODE_SIZE (mode);
9675 words = CEIL (bytes, UNITS_PER_WORD);
9676
9677 if (type)
9678 mode = type_natural_mode (type, NULL, false);
9679
9680 if ((type && POINTER_BOUNDS_TYPE_P (type))
9681 || POINTER_BOUNDS_MODE_P (mode))
9682 {
9683 /* If we pass bounds in BT then just update the remaining bounds count. */
9684 if (cum->bnds_in_bt)
9685 {
9686 cum->bnds_in_bt--;
9687 return;
9688 }
9689
9690 /* Update the remaining number of bounds to force. */
9691 if (cum->force_bnd_pass)
9692 cum->force_bnd_pass--;
9693
9694 cum->bnd_regno++;
9695
9696 return;
9697 }
9698
9699 /* The first arg not going to Bounds Tables resets this counter. */
9700 cum->bnds_in_bt = 0;
9701 /* For unnamed args we always pass bounds to avoid a bounds mess when
9702 the passed and received types do not match.  If bounds do not follow
9703 an unnamed arg, still pretend the required number of bounds were passed. */
9704 if (cum->force_bnd_pass)
9705 {
9706 cum->bnd_regno += cum->force_bnd_pass;
9707 cum->force_bnd_pass = 0;
9708 }
9709
9710 if (TARGET_64BIT)
9711 {
9712 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9713
9714 if (call_abi == MS_ABI)
9715 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9716 else
9717 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9718 }
9719 else
9720 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9721
9722 /* For stdarg we expect bounds to be passed for each value passed
9723 in register. */
9724 if (cum->stdarg)
9725 cum->force_bnd_pass = nregs;
9726 /* For pointers passed in memory we expect bounds passed in Bounds
9727 Table. */
9728 if (!nregs)
9729 cum->bnds_in_bt = chkp_type_bounds_count (type);
9730 }
9731
9732 /* Define where to put the arguments to a function.
9733 Value is zero to push the argument on the stack,
9734 or a hard register in which to store the argument.
9735
9736 MODE is the argument's machine mode.
9737 TYPE is the data type of the argument (as a tree).
9738 This is null for libcalls where that information may
9739 not be available.
9740 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9741 the preceding args and about the function being called.
9742 NAMED is nonzero if this argument is a named parameter
9743 (otherwise it is an extra parameter matching an ellipsis). */
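
/* As a concrete (illustrative) case: with -m32, for

     void __attribute__ ((fastcall)) f (int a, int b, int c);

   function_arg_32 places a in %ecx and b in %edx while c goes on the
   stack; a regparm(3) function would instead use %eax, %edx and %ecx
   for its first three integer arguments.  */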
9744
9745 static rtx
9746 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9747 machine_mode orig_mode, const_tree type,
9748 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9749 {
9750 bool error_p = false;
9751 /* Avoid the AL settings for the Unix64 ABI. */
9752 if (mode == VOIDmode)
9753 return constm1_rtx;
9754
9755 if (TARGET_IAMCU)
9756 {
9757 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9758 bytes in registers. */
9759 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9760 goto pass_in_reg;
9761 return NULL_RTX;
9762 }
9763
9764 switch (mode)
9765 {
9766 default:
9767 break;
9768
9769 case BLKmode:
9770 if (bytes < 0)
9771 break;
9772 /* FALLTHRU */
9773 case DImode:
9774 case SImode:
9775 case HImode:
9776 case QImode:
9777 pass_in_reg:
9778 if (words <= cum->nregs)
9779 {
9780 int regno = cum->regno;
9781
9782 /* Fastcall allocates the first two DWORD (SImode) or
9783 smaller arguments to ECX and EDX if it isn't an
9784 aggregate type.  */
9785 if (cum->fastcall)
9786 {
9787 if (mode == BLKmode
9788 || mode == DImode
9789 || (type && AGGREGATE_TYPE_P (type)))
9790 break;
9791
9792 /* ECX not EAX is the first allocated register. */
9793 if (regno == AX_REG)
9794 regno = CX_REG;
9795 }
9796 return gen_rtx_REG (mode, regno);
9797 }
9798 break;
9799
9800 case DFmode:
9801 if (cum->float_in_sse == -1)
9802 error_p = 1;
9803 if (cum->float_in_sse < 2)
9804 break;
9805 /* FALLTHRU */
9806 case SFmode:
9807 if (cum->float_in_sse == -1)
9808 error_p = 1;
9809 if (cum->float_in_sse < 1)
9810 break;
9811 /* FALLTHRU */
9812 case TImode:
9813 /* In 32bit, we pass TImode in xmm registers. */
9814 case V16QImode:
9815 case V8HImode:
9816 case V4SImode:
9817 case V2DImode:
9818 case V4SFmode:
9819 case V2DFmode:
9820 if (!type || !AGGREGATE_TYPE_P (type))
9821 {
9822 if (cum->sse_nregs)
9823 return gen_reg_or_parallel (mode, orig_mode,
9824 cum->sse_regno + FIRST_SSE_REG);
9825 }
9826 break;
9827
9828 case OImode:
9829 case XImode:
9830 /* OImode and XImode shouldn't be used directly. */
9831 gcc_unreachable ();
9832
9833 case V64QImode:
9834 case V32HImode:
9835 case V16SImode:
9836 case V8DImode:
9837 case V16SFmode:
9838 case V8DFmode:
9839 case V8SFmode:
9840 case V8SImode:
9841 case V32QImode:
9842 case V16HImode:
9843 case V4DFmode:
9844 case V4DImode:
9845 if (!type || !AGGREGATE_TYPE_P (type))
9846 {
9847 if (cum->sse_nregs)
9848 return gen_reg_or_parallel (mode, orig_mode,
9849 cum->sse_regno + FIRST_SSE_REG);
9850 }
9851 break;
9852
9853 case V8QImode:
9854 case V4HImode:
9855 case V2SImode:
9856 case V2SFmode:
9857 case V1TImode:
9858 case V1DImode:
9859 if (!type || !AGGREGATE_TYPE_P (type))
9860 {
9861 if (cum->mmx_nregs)
9862 return gen_reg_or_parallel (mode, orig_mode,
9863 cum->mmx_regno + FIRST_MMX_REG);
9864 }
9865 break;
9866 }
9867 if (error_p)
9868 {
9869 cum->float_in_sse = 0;
9870 error ("calling %qD with SSE calling convention without "
9871 "SSE/SSE2 enabled", cum->decl);
9872 sorry ("this is a GCC bug that can be worked around by adding "
9873 "attribute used to function called");
9874 }
9875
9876 return NULL_RTX;
9877 }
9878
9879 static rtx
9880 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9881 machine_mode orig_mode, const_tree type, bool named)
9882 {
9883 /* Handle a hidden AL argument containing number of registers
9884 for varargs x86-64 functions. */
9885 if (mode == VOIDmode)
9886 return GEN_INT (cum->maybe_vaarg
9887 ? (cum->sse_nregs < 0
9888 ? X86_64_SSE_REGPARM_MAX
9889 : cum->sse_regno)
9890 : -1);
9891
9892 switch (mode)
9893 {
9894 default:
9895 break;
9896
9897 case V8SFmode:
9898 case V8SImode:
9899 case V32QImode:
9900 case V16HImode:
9901 case V4DFmode:
9902 case V4DImode:
9903 case V16SFmode:
9904 case V16SImode:
9905 case V64QImode:
9906 case V32HImode:
9907 case V8DFmode:
9908 case V8DImode:
9909 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
9910 if (!named)
9911 return NULL;
9912 break;
9913 }
9914
9915 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9916 cum->sse_nregs,
9917 &x86_64_int_parameter_registers [cum->regno],
9918 cum->sse_regno);
9919 }
9920
9921 static rtx
9922 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9923 machine_mode orig_mode, bool named,
9924 HOST_WIDE_INT bytes)
9925 {
9926 unsigned int regno;
9927
9928 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
9929 We use the value -2 to specify that the current function call uses the MS ABI. */
9930 if (mode == VOIDmode)
9931 return GEN_INT (-2);
9932
9933 /* If we've run out of registers, it goes on the stack. */
9934 if (cum->nregs == 0)
9935 return NULL_RTX;
9936
9937 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9938
9939 /* Only floating point modes are passed in anything but integer regs. */
9940 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9941 {
9942 if (named)
9943 regno = cum->regno + FIRST_SSE_REG;
9944 else
9945 {
9946 rtx t1, t2;
9947
9948 /* Unnamed floating parameters are passed in both the
9949 SSE and integer registers. */
9950 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
9951 t2 = gen_rtx_REG (mode, regno);
9952 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
9953 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
9954 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
9955 }
9956 }
9957 /* Handle aggregate types passed in registers.  */
9958 if (orig_mode == BLKmode)
9959 {
9960 if (bytes > 0 && bytes <= 8)
9961 mode = (bytes > 4 ? DImode : SImode);
9962 if (mode == BLKmode)
9963 mode = DImode;
9964 }
9965
9966 return gen_reg_or_parallel (mode, orig_mode, regno);
9967 }
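
/* Illustrative note: the MS ABI assigns parameter registers positionally,
   so for

     void f (int a, double b, int c, double d);

   a, b, c and d arrive in %ecx, %xmm1, %r8d and %xmm3 respectively,
   which is what the cum->regno based indexing above implements.  */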
9968
9969 /* Return where to put the arguments to a function.
9970 Return zero to push the argument on the stack, or a hard register in which to store the argument.
9971
9972 MODE is the argument's machine mode. TYPE is the data type of the
9973 argument. It is null for libcalls where that information may not be
9974 available. CUM gives information about the preceding args and about
9975 the function being called. NAMED is nonzero if this argument is a
9976 named parameter (otherwise it is an extra parameter matching an
9977 ellipsis). */
9978
9979 static rtx
9980 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
9981 const_tree type, bool named)
9982 {
9983 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9984 machine_mode mode = omode;
9985 HOST_WIDE_INT bytes, words;
9986 rtx arg;
9987
9988 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9989 {
9990 gcc_assert (type != NULL_TREE);
9991 if (POINTER_TYPE_P (type))
9992 {
9993 /* This is the pointer argument. */
9994 gcc_assert (TYPE_MODE (type) == Pmode);
9995 if (cfun->machine->func_type == TYPE_INTERRUPT)
9996 /* -WORD(AP) in the current frame in interrupt handler. */
9997 arg = plus_constant (Pmode, arg_pointer_rtx,
9998 -UNITS_PER_WORD);
9999 else
10000 /* (AP) in the current frame in exception handler. */
10001 arg = arg_pointer_rtx;
10002 }
10003 else
10004 {
10005 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10006 && TREE_CODE (type) == INTEGER_TYPE
10007 && TYPE_MODE (type) == word_mode);
10008 /* The integer argument is the error code at -WORD(AP) in
10009 the current frame in exception handler. */
10010 arg = gen_rtx_MEM (word_mode,
10011 plus_constant (Pmode,
10012 arg_pointer_rtx,
10013 -UNITS_PER_WORD));
10014 }
10015 return arg;
10016 }
10017
10018 /* All pointer bounds arguments are handled separately here. */
10019 if ((type && POINTER_BOUNDS_TYPE_P (type))
10020 || POINTER_BOUNDS_MODE_P (mode))
10021 {
10022 /* Return NULL if bounds are forced to go in Bounds Table. */
10023 if (cum->bnds_in_bt)
10024 arg = NULL;
10025 /* Return the next available bound reg if any. */
10026 else if (cum->bnd_regno <= LAST_BND_REG)
10027 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10028 /* Return the next special slot number otherwise. */
10029 else
10030 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10031
10032 return arg;
10033 }
10034
10035 if (mode == BLKmode)
10036 bytes = int_size_in_bytes (type);
10037 else
10038 bytes = GET_MODE_SIZE (mode);
10039 words = CEIL (bytes, UNITS_PER_WORD);
10040
10041 /* To simplify the code below, represent vector types with a vector mode
10042 even if MMX/SSE are not active. */
10043 if (type && TREE_CODE (type) == VECTOR_TYPE)
10044 mode = type_natural_mode (type, cum, false);
10045
10046 if (TARGET_64BIT)
10047 {
10048 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10049
10050 if (call_abi == MS_ABI)
10051 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10052 else
10053 arg = function_arg_64 (cum, mode, omode, type, named);
10054 }
10055 else
10056 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10057
10058 return arg;
10059 }
10060
10061 /* A C expression that indicates when an argument must be passed by
10062 reference. If nonzero for an argument, a copy of that argument is
10063 made in memory and a pointer to the argument is passed instead of
10064 the argument itself. The pointer is passed in whatever way is
10065 appropriate for passing a pointer to that type. */
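   /* For instance, under the MS x64 ABI a 24-byte struct or an __m128
      argument is passed by reference (a copy lives in the caller's frame
      and its address is passed), whereas an 8-byte struct travels by
      value in a register.  Under the 64-bit SysV ABI aggregates are never
      passed by reference; large ones are simply passed in memory.  */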
10066
10067 static bool
10068 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10069 const_tree type, bool)
10070 {
10071 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10072
10073 /* Bounds are never passed by reference. */
10074 if ((type && POINTER_BOUNDS_TYPE_P (type))
10075 || POINTER_BOUNDS_MODE_P (mode))
10076 return false;
10077
10078 if (TARGET_64BIT)
10079 {
10080 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10081
10082 /* See Windows x64 Software Convention. */
10083 if (call_abi == MS_ABI)
10084 {
10085 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10086
10087 if (type)
10088 {
10089 /* Arrays are passed by reference. */
10090 if (TREE_CODE (type) == ARRAY_TYPE)
10091 return true;
10092
10093 if (RECORD_OR_UNION_TYPE_P (type))
10094 {
10095 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10096 are passed by reference. */
10097 msize = int_size_in_bytes (type);
10098 }
10099 }
10100
10101 /* __m128 is passed by reference. */
10102 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10103 }
10104 else if (type && int_size_in_bytes (type) == -1)
10105 return true;
10106 }
10107
10108 return false;
10109 }
10110
10111 /* Return true when TYPE should be 128bit aligned for 32bit argument
10112 passing ABI. XXX: This function is obsolete and is only used for
10113 checking psABI compatibility with previous versions of GCC. */
10114
10115 static bool
10116 ix86_compat_aligned_value_p (const_tree type)
10117 {
10118 machine_mode mode = TYPE_MODE (type);
10119 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10120 || mode == TDmode
10121 || mode == TFmode
10122 || mode == TCmode)
10123 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10124 return true;
10125 if (TYPE_ALIGN (type) < 128)
10126 return false;
10127
10128 if (AGGREGATE_TYPE_P (type))
10129 {
10130 /* Walk the aggregates recursively. */
10131 switch (TREE_CODE (type))
10132 {
10133 case RECORD_TYPE:
10134 case UNION_TYPE:
10135 case QUAL_UNION_TYPE:
10136 {
10137 tree field;
10138
10139 /* Walk all the structure fields. */
10140 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10141 {
10142 if (TREE_CODE (field) == FIELD_DECL
10143 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10144 return true;
10145 }
10146 break;
10147 }
10148
10149 case ARRAY_TYPE:
10150 /* Just for use if some languages pass arrays by value. */
10151 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10152 return true;
10153 break;
10154
10155 default:
10156 gcc_unreachable ();
10157 }
10158 }
10159 return false;
10160 }
10161
10162 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10163 XXX: This function is obsolete and is only used for checking psABI
10164 compatibility with previous versions of GCC. */
10165
10166 static unsigned int
10167 ix86_compat_function_arg_boundary (machine_mode mode,
10168 const_tree type, unsigned int align)
10169 {
10170 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10171 natural boundaries. */
10172 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10173 {
10174 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10175 make an exception for SSE modes since these require 128bit
10176 alignment.
10177
10178 The handling here differs from field_alignment. ICC aligns MMX
10179 arguments to 4 byte boundaries, while structure fields are aligned
10180 to 8 byte boundaries. */
10181 if (!type)
10182 {
10183 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10184 align = PARM_BOUNDARY;
10185 }
10186 else
10187 {
10188 if (!ix86_compat_aligned_value_p (type))
10189 align = PARM_BOUNDARY;
10190 }
10191 }
10192 if (align > BIGGEST_ALIGNMENT)
10193 align = BIGGEST_ALIGNMENT;
10194 return align;
10195 }
10196
10197 /* Return true when TYPE should be 128bit aligned for 32bit argument
10198 passing ABI. */
10199
10200 static bool
10201 ix86_contains_aligned_value_p (const_tree type)
10202 {
10203 machine_mode mode = TYPE_MODE (type);
10204
10205 if (mode == XFmode || mode == XCmode)
10206 return false;
10207
10208 if (TYPE_ALIGN (type) < 128)
10209 return false;
10210
10211 if (AGGREGATE_TYPE_P (type))
10212 {
10213 /* Walk the aggregates recursively. */
10214 switch (TREE_CODE (type))
10215 {
10216 case RECORD_TYPE:
10217 case UNION_TYPE:
10218 case QUAL_UNION_TYPE:
10219 {
10220 tree field;
10221
10222 /* Walk all the structure fields. */
10223 for (field = TYPE_FIELDS (type);
10224 field;
10225 field = DECL_CHAIN (field))
10226 {
10227 if (TREE_CODE (field) == FIELD_DECL
10228 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10229 return true;
10230 }
10231 break;
10232 }
10233
10234 case ARRAY_TYPE:
10235 /* Just for use if some languages pass arrays by value. */
10236 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10237 return true;
10238 break;
10239
10240 default:
10241 gcc_unreachable ();
10242 }
10243 }
10244 else
10245 return TYPE_ALIGN (type) >= 128;
10246
10247 return false;
10248 }
10249
10250 /* Gives the alignment boundary, in bits, of an argument with the
10251 specified mode and type. */
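   /* Illustrative values (assuming default options): a 32-bit int
      argument gets PARM_BOUNDARY (32 bits), while an __m128 or other
      16-byte-aligned vector argument gets 128 bits; the -Wpsabi note
      below fires when this answer differs from what pre-4.6 GCC would
      have used.  */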
10252
10253 static unsigned int
10254 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10255 {
10256 unsigned int align;
10257 if (type)
10258 {
10259 /* Since the main variant type is used for the call, convert TYPE to
10260 its main variant. */
10261 type = TYPE_MAIN_VARIANT (type);
10262 align = TYPE_ALIGN (type);
10263 }
10264 else
10265 align = GET_MODE_ALIGNMENT (mode);
10266 if (align < PARM_BOUNDARY)
10267 align = PARM_BOUNDARY;
10268 else
10269 {
10270 static bool warned;
10271 unsigned int saved_align = align;
10272
10273 if (!TARGET_64BIT)
10274 {
10275 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10276 if (!type)
10277 {
10278 if (mode == XFmode || mode == XCmode)
10279 align = PARM_BOUNDARY;
10280 }
10281 else if (!ix86_contains_aligned_value_p (type))
10282 align = PARM_BOUNDARY;
10283
10284 if (align < 128)
10285 align = PARM_BOUNDARY;
10286 }
10287
10288 if (warn_psabi
10289 && !warned
10290 && align != ix86_compat_function_arg_boundary (mode, type,
10291 saved_align))
10292 {
10293 warned = true;
10294 inform (input_location,
10295 "The ABI for passing parameters with %d-byte"
10296 " alignment has changed in GCC 4.6",
10297 align / BITS_PER_UNIT);
10298 }
10299 }
10300
10301 return align;
10302 }
10303
10304 /* Return true if N is a possible register number of function value. */
10305
10306 static bool
10307 ix86_function_value_regno_p (const unsigned int regno)
10308 {
10309 switch (regno)
10310 {
10311 case AX_REG:
10312 return true;
10313 case DX_REG:
10314 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10315 case DI_REG:
10316 case SI_REG:
10317 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10318
10319 case BND0_REG:
10320 case BND1_REG:
10321 return chkp_function_instrumented_p (current_function_decl);
10322
10323 /* Complex values are returned in the %st(0)/%st(1) pair. */
10324 case ST0_REG:
10325 case ST1_REG:
10326 /* TODO: The function should depend on current function ABI but
10327 builtins.c would need updating then. Therefore we use the
10328 default ABI. */
10329 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10330 return false;
10331 return TARGET_FLOAT_RETURNS_IN_80387;
10332
10333 /* Complex values are returned in the %xmm0/%xmm1 pair. */
10334 case XMM0_REG:
10335 case XMM1_REG:
10336 return TARGET_SSE;
10337
10338 case MM0_REG:
10339 if (TARGET_MACHO || TARGET_64BIT)
10340 return false;
10341 return TARGET_MMX;
10342 }
10343
10344 return false;
10345 }
10346
10347 /* Define how to find the value returned by a function.
10348 VALTYPE is the data type of the value (as a tree).
10349 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10350 otherwise, FUNC is 0. */
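10351
   Typical 32-bit examples: an int is returned in %eax, a double in
   %st(0) (unless -mno-fp-ret-in-387, or SSE math/sseregparm routes it
   to %xmm0), and an __m128 in %xmm0.  */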
10351
10352 static rtx
10353 function_value_32 (machine_mode orig_mode, machine_mode mode,
10354 const_tree fntype, const_tree fn)
10355 {
10356 unsigned int regno;
10357
10358 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10359 we normally prevent this case when mmx is not available. However
10360 some ABIs may require the result to be returned like DImode. */
10361 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10362 regno = FIRST_MMX_REG;
10363
10364 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10365 we prevent this case when sse is not available. However some ABIs
10366 may require the result to be returned like integer TImode. */
10367 else if (mode == TImode
10368 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10369 regno = FIRST_SSE_REG;
10370
10371 /* 32-byte vector modes in %ymm0. */
10372 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10373 regno = FIRST_SSE_REG;
10374
10375 /* 64-byte vector modes in %zmm0. */
10376 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10377 regno = FIRST_SSE_REG;
10378
10379 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10380 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10381 regno = FIRST_FLOAT_REG;
10382 else
10383 /* Most things go in %eax. */
10384 regno = AX_REG;
10385
10386 /* Override FP return register with %xmm0 for local functions when
10387 SSE math is enabled or for functions with sseregparm attribute. */
10388 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10389 {
10390 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10391 if (sse_level == -1)
10392 {
10393 error ("calling %qD with SSE calling convention without "
10394 "SSE/SSE2 enabled", fn);
10395 sorry ("this is a GCC bug that can be worked around by adding "
10396 "attribute used to function called");
10397 }
10398 else if ((sse_level >= 1 && mode == SFmode)
10399 || (sse_level == 2 && mode == DFmode))
10400 regno = FIRST_SSE_REG;
10401 }
10402
10403 /* OImode shouldn't be used directly. */
10404 gcc_assert (mode != OImode);
10405
10406 return gen_rtx_REG (orig_mode, regno);
10407 }
10408
10409 static rtx
10410 function_value_64 (machine_mode orig_mode, machine_mode mode,
10411 const_tree valtype)
10412 {
10413 rtx ret;
10414
10415 /* Handle libcalls, which don't provide a type node. */
10416 if (valtype == NULL)
10417 {
10418 unsigned int regno;
10419
10420 switch (mode)
10421 {
10422 case SFmode:
10423 case SCmode:
10424 case DFmode:
10425 case DCmode:
10426 case TFmode:
10427 case SDmode:
10428 case DDmode:
10429 case TDmode:
10430 regno = FIRST_SSE_REG;
10431 break;
10432 case XFmode:
10433 case XCmode:
10434 regno = FIRST_FLOAT_REG;
10435 break;
10436 case TCmode:
10437 return NULL;
10438 default:
10439 regno = AX_REG;
10440 }
10441
10442 return gen_rtx_REG (mode, regno);
10443 }
10444 else if (POINTER_TYPE_P (valtype))
10445 {
10446 /* Pointers are always returned in word_mode. */
10447 mode = word_mode;
10448 }
10449
10450 ret = construct_container (mode, orig_mode, valtype, 1,
10451 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10452 x86_64_int_return_registers, 0);
10453
10454 /* For zero-sized structures, construct_container returns NULL, but we
10455 need to keep the rest of the compiler happy by returning a meaningful value. */
10456 if (!ret)
10457 ret = gen_rtx_REG (orig_mode, AX_REG);
10458
10459 return ret;
10460 }
10461
10462 static rtx
10463 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10464 const_tree valtype)
10465 {
10466 unsigned int regno = AX_REG;
10467
10468 if (TARGET_SSE)
10469 {
10470 switch (GET_MODE_SIZE (mode))
10471 {
10472 case 16:
10473 if (valtype != NULL_TREE
10474 && !VECTOR_INTEGER_TYPE_P (valtype)
10475 && !VECTOR_INTEGER_TYPE_P (valtype)
10476 && !INTEGRAL_TYPE_P (valtype)
10477 && !VECTOR_FLOAT_TYPE_P (valtype))
10478 break;
10479 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10480 && !COMPLEX_MODE_P (mode))
10481 regno = FIRST_SSE_REG;
10482 break;
10483 case 8:
10484 case 4:
10485 if (mode == SFmode || mode == DFmode)
10486 regno = FIRST_SSE_REG;
10487 break;
10488 default:
10489 break;
10490 }
10491 }
10492 return gen_rtx_REG (orig_mode, regno);
10493 }
10494
10495 static rtx
10496 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10497 machine_mode orig_mode, machine_mode mode)
10498 {
10499 const_tree fn, fntype;
10500
10501 fn = NULL_TREE;
10502 if (fntype_or_decl && DECL_P (fntype_or_decl))
10503 fn = fntype_or_decl;
10504 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10505
10506 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10507 || POINTER_BOUNDS_MODE_P (mode))
10508 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10509 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10510 return function_value_ms_64 (orig_mode, mode, valtype);
10511 else if (TARGET_64BIT)
10512 return function_value_64 (orig_mode, mode, valtype);
10513 else
10514 return function_value_32 (orig_mode, mode, fntype, fn);
10515 }
10516
10517 static rtx
10518 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10519 {
10520 machine_mode mode, orig_mode;
10521
10522 orig_mode = TYPE_MODE (valtype);
10523 mode = type_natural_mode (valtype, NULL, true);
10524 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10525 }
10526
10527 /* Return an RTX representing a place where a function returns
10528 or receives pointer bounds or NULL if no bounds are returned.
10529
10530 VALTYPE is a data type of a value returned by the function.
10531
10532 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10533 or FUNCTION_TYPE of the function.
10534
10535 If OUTGOING is false, return a place in which the caller will
10536 see the return value. Otherwise, return a place where a
10537 function returns a value. */
10538
10539 static rtx
10540 ix86_function_value_bounds (const_tree valtype,
10541 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10542 bool outgoing ATTRIBUTE_UNUSED)
10543 {
10544 rtx res = NULL_RTX;
10545
10546 if (BOUNDED_TYPE_P (valtype))
10547 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10548 else if (chkp_type_has_pointer (valtype))
10549 {
10550 bitmap slots;
10551 rtx bounds[2];
10552 bitmap_iterator bi;
10553 unsigned i, bnd_no = 0;
10554
10555 bitmap_obstack_initialize (NULL);
10556 slots = BITMAP_ALLOC (NULL);
10557 chkp_find_bound_slots (valtype, slots);
10558
10559 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10560 {
10561 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10562 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10563 gcc_assert (bnd_no < 2);
10564 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10565 }
10566
10567 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10568
10569 BITMAP_FREE (slots);
10570 bitmap_obstack_release (NULL);
10571 }
10572 else
10573 res = NULL_RTX;
10574
10575 return res;
10576 }
10577
10578 /* Pointer function arguments and return values are promoted to
10579 word_mode for normal functions. */
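   /* Concretely, this matters for -mx32: there ptr_mode is SImode but
      word_mode is DImode, so pointer arguments and return values of
      normal functions are zero-extended to 64 bits
      (POINTERS_EXTEND_UNSIGNED).  */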
10580
10581 static machine_mode
10582 ix86_promote_function_mode (const_tree type, machine_mode mode,
10583 int *punsignedp, const_tree fntype,
10584 int for_return)
10585 {
10586 if (cfun->machine->func_type == TYPE_NORMAL
10587 && type != NULL_TREE
10588 && POINTER_TYPE_P (type))
10589 {
10590 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10591 return word_mode;
10592 }
10593 return default_promote_function_mode (type, mode, punsignedp, fntype,
10594 for_return);
10595 }
10596
10597 /* Return true if a structure, union or array with MODE containing FIELD
10598 should be accessed using BLKmode. */
10599
10600 static bool
10601 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10602 {
10603 /* Union with XFmode must be in BLKmode. */
10604 return (mode == XFmode
10605 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10606 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10607 }
10608
10609 rtx
10610 ix86_libcall_value (machine_mode mode)
10611 {
10612 return ix86_function_value_1 (NULL, NULL, mode, mode);
10613 }
10614
10615 /* Return true iff type is returned in memory. */
10616
10617 static bool
10618 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10619 {
10620 #ifdef SUBTARGET_RETURN_IN_MEMORY
10621 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10622 #else
10623 const machine_mode mode = type_natural_mode (type, NULL, true);
10624 HOST_WIDE_INT size;
10625
10626 if (POINTER_BOUNDS_TYPE_P (type))
10627 return false;
10628
10629 if (TARGET_64BIT)
10630 {
10631 if (ix86_function_type_abi (fntype) == MS_ABI)
10632 {
10633 size = int_size_in_bytes (type);
10634
10635 /* __m128 is returned in xmm0. */
10636 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10637 || INTEGRAL_TYPE_P (type)
10638 || VECTOR_FLOAT_TYPE_P (type))
10639 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10640 && !COMPLEX_MODE_P (mode)
10641 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10642 return false;
10643
10644 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
10645 return size != 1 && size != 2 && size != 4 && size != 8;
10646 }
10647 else
10648 {
10649 int needed_intregs, needed_sseregs;
10650
10651 return examine_argument (mode, type, 1,
10652 &needed_intregs, &needed_sseregs);
10653 }
10654 }
10655 else
10656 {
10657 size = int_size_in_bytes (type);
10658
10659 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10660 bytes in registers. */
10661 if (TARGET_IAMCU)
10662 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10663
10664 if (mode == BLKmode)
10665 return true;
10666
10667 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10668 return false;
10669
10670 if (VECTOR_MODE_P (mode) || mode == TImode)
10671 {
10672 /* User-created vectors small enough to fit in EAX. */
10673 if (size < 8)
10674 return false;
10675
10676 /* Unless ABI prescribes otherwise,
10677 MMX/3dNow values are returned in MM0 if available. */
10678
10679 if (size == 8)
10680 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10681
10682 /* SSE values are returned in XMM0 if available. */
10683 if (size == 16)
10684 return !TARGET_SSE;
10685
10686 /* AVX values are returned in YMM0 if available. */
10687 if (size == 32)
10688 return !TARGET_AVX;
10689
10690 /* AVX512F values are returned in ZMM0 if available. */
10691 if (size == 64)
10692 return !TARGET_AVX512F;
10693 }
10694
10695 if (mode == XFmode)
10696 return false;
10697
10698 if (size > 12)
10699 return true;
10700
10701 /* OImode shouldn't be used directly. */
10702 gcc_assert (mode != OImode);
10703
10704 return false;
10705 }
10706 #endif
10707 }
10708
10709 \f
10710 /* Create the va_list data type. */
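   /* For reference, the 64-bit SysV va_list built below is equivalent to

	  typedef struct __va_list_tag {
	    unsigned int gp_offset;
	    unsigned int fp_offset;
	    void *overflow_arg_area;
	    void *reg_save_area;
	  } __builtin_va_list[1];  */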
10711
10712 static tree
10713 ix86_build_builtin_va_list_64 (void)
10714 {
10715 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10716
10717 record = lang_hooks.types.make_type (RECORD_TYPE);
10718 type_decl = build_decl (BUILTINS_LOCATION,
10719 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10720
10721 f_gpr = build_decl (BUILTINS_LOCATION,
10722 FIELD_DECL, get_identifier ("gp_offset"),
10723 unsigned_type_node);
10724 f_fpr = build_decl (BUILTINS_LOCATION,
10725 FIELD_DECL, get_identifier ("fp_offset"),
10726 unsigned_type_node);
10727 f_ovf = build_decl (BUILTINS_LOCATION,
10728 FIELD_DECL, get_identifier ("overflow_arg_area"),
10729 ptr_type_node);
10730 f_sav = build_decl (BUILTINS_LOCATION,
10731 FIELD_DECL, get_identifier ("reg_save_area"),
10732 ptr_type_node);
10733
10734 va_list_gpr_counter_field = f_gpr;
10735 va_list_fpr_counter_field = f_fpr;
10736
10737 DECL_FIELD_CONTEXT (f_gpr) = record;
10738 DECL_FIELD_CONTEXT (f_fpr) = record;
10739 DECL_FIELD_CONTEXT (f_ovf) = record;
10740 DECL_FIELD_CONTEXT (f_sav) = record;
10741
10742 TYPE_STUB_DECL (record) = type_decl;
10743 TYPE_NAME (record) = type_decl;
10744 TYPE_FIELDS (record) = f_gpr;
10745 DECL_CHAIN (f_gpr) = f_fpr;
10746 DECL_CHAIN (f_fpr) = f_ovf;
10747 DECL_CHAIN (f_ovf) = f_sav;
10748
10749 layout_type (record);
10750
10751 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10752 NULL_TREE, TYPE_ATTRIBUTES (record));
10753
10754 /* The correct type is an array type of one element. */
10755 return build_array_type (record, build_index_type (size_zero_node));
10756 }
10757
10758 /* Set up the builtin va_list data type and, for 64-bit, the additional
10759 calling-convention-specific va_list data types. */
10760
10761 static tree
10762 ix86_build_builtin_va_list (void)
10763 {
10764 if (TARGET_64BIT)
10765 {
10766 /* Initialize ABI specific va_list builtin types.
10767
10768 In lto1, we can encounter two va_list types:
10769 - one as a result of the type-merge across TUs, and
10770 - the one constructed here.
10771 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10772 a type identity check in canonical_va_list_type based on
10773 TYPE_MAIN_VARIANT (which we used to have) will not work.
10774 Instead, we tag each va_list_type_node with its unique attribute, and
10775 look for the attribute in the type identity check in
10776 canonical_va_list_type.
10777
10778 Tagging sysv_va_list_type_node directly with the attribute is
10779 problematic since it's an array of one record, which will degrade into a
10780 pointer to record when used as a parameter (see build_va_arg comments for
10781 an example), dropping the attribute in the process. So we tag the
10782 record instead. */
10783
10784 /* For SYSV_ABI we use an array of one record. */
10785 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10786
10787 /* For MS_ABI we use plain pointer to argument area. */
10788 tree char_ptr_type = build_pointer_type (char_type_node);
10789 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10790 TYPE_ATTRIBUTES (char_ptr_type));
10791 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10792
10793 return ((ix86_abi == MS_ABI)
10794 ? ms_va_list_type_node
10795 : sysv_va_list_type_node);
10796 }
10797 else
10798 {
10799 /* For i386 we use plain pointer to argument area. */
10800 return build_pointer_type (char_type_node);
10801 }
10802 }
10803
10804 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
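   /* Roughly: for the 64-bit SysV ABI the prologue of a varargs function
      dumps the integer argument registers not consumed by named
      parameters (%rdi, %rsi, %rdx, %rcx, %r8, %r9) into the register
      save area, and, if %al is nonzero, the SSE argument registers
      %xmm0-%xmm7 as well.  va_arg later reads from that area.  */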
10805
10806 static void
10807 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10808 {
10809 rtx save_area, mem;
10810 alias_set_type set;
10811 int i, max;
10812
10813 /* GPR size of varargs save area. */
10814 if (cfun->va_list_gpr_size)
10815 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10816 else
10817 ix86_varargs_gpr_size = 0;
10818
10819 /* FPR size of varargs save area. We don't need it if we don't pass
10820 anything in SSE registers. */
10821 if (TARGET_SSE && cfun->va_list_fpr_size)
10822 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10823 else
10824 ix86_varargs_fpr_size = 0;
10825
10826 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10827 return;
10828
10829 save_area = frame_pointer_rtx;
10830 set = get_varargs_alias_set ();
10831
10832 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10833 if (max > X86_64_REGPARM_MAX)
10834 max = X86_64_REGPARM_MAX;
10835
10836 for (i = cum->regno; i < max; i++)
10837 {
10838 mem = gen_rtx_MEM (word_mode,
10839 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10840 MEM_NOTRAP_P (mem) = 1;
10841 set_mem_alias_set (mem, set);
10842 emit_move_insn (mem,
10843 gen_rtx_REG (word_mode,
10844 x86_64_int_parameter_registers[i]));
10845 }
10846
10847 if (ix86_varargs_fpr_size)
10848 {
10849 machine_mode smode;
10850 rtx_code_label *label;
10851 rtx test;
10852
10853 /* Now emit code to save SSE registers. The AX parameter contains the
10854 number of SSE parameter registers used to call this function, though all we
10855 actually check here is the zero/non-zero status. */
10856
10857 label = gen_label_rtx ();
10858 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10859 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10860 label));
10861
10862 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10863 we used movdqa (i.e. TImode) instead? Perhaps even better would
10864 be if we could determine the real mode of the data, via a hook
10865 into pass_stdarg. Ignore all that for now. */
10866 smode = V4SFmode;
10867 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10868 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10869
10870 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10871 if (max > X86_64_SSE_REGPARM_MAX)
10872 max = X86_64_SSE_REGPARM_MAX;
10873
10874 for (i = cum->sse_regno; i < max; ++i)
10875 {
10876 mem = plus_constant (Pmode, save_area,
10877 i * 16 + ix86_varargs_gpr_size);
10878 mem = gen_rtx_MEM (smode, mem);
10879 MEM_NOTRAP_P (mem) = 1;
10880 set_mem_alias_set (mem, set);
10881 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10882
10883 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10884 }
10885
10886 emit_label (label);
10887 }
10888 }
10889
10890 static void
10891 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10892 {
10893 alias_set_type set = get_varargs_alias_set ();
10894 int i;
10895
10896 /* Reset to zero, as there might have been a sysv va_arg used
10897 before. */
10898 ix86_varargs_gpr_size = 0;
10899 ix86_varargs_fpr_size = 0;
10900
10901 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10902 {
10903 rtx reg, mem;
10904
10905 mem = gen_rtx_MEM (Pmode,
10906 plus_constant (Pmode, virtual_incoming_args_rtx,
10907 i * UNITS_PER_WORD));
10908 MEM_NOTRAP_P (mem) = 1;
10909 set_mem_alias_set (mem, set);
10910
10911 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10912 emit_move_insn (mem, reg);
10913 }
10914 }
10915
10916 static void
10917 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10918 tree type, int *, int no_rtl)
10919 {
10920 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10921 CUMULATIVE_ARGS next_cum;
10922 tree fntype;
10923
10924 /* This argument doesn't appear to be used anymore, which is good,
10925 because the old code here didn't suppress rtl generation. */
10926 gcc_assert (!no_rtl);
10927
10928 if (!TARGET_64BIT)
10929 return;
10930
10931 fntype = TREE_TYPE (current_function_decl);
10932
10933 /* For varargs, we do not want to skip the dummy va_dcl argument.
10934 For stdargs, we do want to skip the last named argument. */
10935 next_cum = *cum;
10936 if (stdarg_p (fntype))
10937 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10938 true);
10939
10940 if (cum->call_abi == MS_ABI)
10941 setup_incoming_varargs_ms_64 (&next_cum);
10942 else
10943 setup_incoming_varargs_64 (&next_cum);
10944 }
10945
10946 static void
10947 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10948 enum machine_mode mode,
10949 tree type,
10950 int *pretend_size ATTRIBUTE_UNUSED,
10951 int no_rtl)
10952 {
10953 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10954 CUMULATIVE_ARGS next_cum;
10955 tree fntype;
10956 rtx save_area;
10957 int bnd_reg, i, max;
10958
10959 gcc_assert (!no_rtl);
10960
10961 /* Do nothing if we use plain pointer to argument area. */
10962 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
10963 return;
10964
10965 fntype = TREE_TYPE (current_function_decl);
10966
10967 /* For varargs, we do not want to skip the dummy va_dcl argument.
10968 For stdargs, we do want to skip the last named argument. */
10969 next_cum = *cum;
10970 if (stdarg_p (fntype))
10971 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10972 true);
10973 save_area = frame_pointer_rtx;
10974
10975 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10976 if (max > X86_64_REGPARM_MAX)
10977 max = X86_64_REGPARM_MAX;
10978
10979 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
10980 if (chkp_function_instrumented_p (current_function_decl))
10981 for (i = cum->regno; i < max; i++)
10982 {
10983 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
10984 rtx ptr = gen_rtx_REG (Pmode,
10985 x86_64_int_parameter_registers[i]);
10986 rtx bounds;
10987
10988 if (bnd_reg <= LAST_BND_REG)
10989 bounds = gen_rtx_REG (BNDmode, bnd_reg);
10990 else
10991 {
10992 rtx ldx_addr =
10993 plus_constant (Pmode, arg_pointer_rtx,
10994 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
10995 bounds = gen_reg_rtx (BNDmode);
10996 emit_insn (BNDmode == BND64mode
10997 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
10998 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
10999 }
11000
11001 emit_insn (BNDmode == BND64mode
11002 ? gen_bnd64_stx (addr, ptr, bounds)
11003 : gen_bnd32_stx (addr, ptr, bounds));
11004
11005 bnd_reg++;
11006 }
11007 }
11008
11009
11010 /* Check whether TYPE is the char * kind of va_list. */
11011
11012 static bool
11013 is_va_list_char_pointer (tree type)
11014 {
11015 tree canonic;
11016
11017 /* For 32-bit it is always true. */
11018 if (!TARGET_64BIT)
11019 return true;
11020 canonic = ix86_canonical_va_list_type (type);
11021 return (canonic == ms_va_list_type_node
11022 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11023 }
11024
11025 /* Implement va_start. */
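   /* Sketch of the 64-bit SysV expansion below: with N_GPR integer and
      N_FPR SSE registers consumed by named arguments, va_start sets
      gp_offset = N_GPR * 8, fp_offset = 6 * 8 + N_FPR * 16,
      overflow_arg_area = address of the first stack argument, and
      reg_save_area = start of the block saved by the prologue.  */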
11026
11027 static void
11028 ix86_va_start (tree valist, rtx nextarg)
11029 {
11030 HOST_WIDE_INT words, n_gpr, n_fpr;
11031 tree f_gpr, f_fpr, f_ovf, f_sav;
11032 tree gpr, fpr, ovf, sav, t;
11033 tree type;
11034 rtx ovf_rtx;
11035
11036 if (flag_split_stack
11037 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11038 {
11039 unsigned int scratch_regno;
11040
11041 /* When we are splitting the stack, we can't refer to the stack
11042 arguments using internal_arg_pointer, because they may be on
11043 the old stack. The split stack prologue will arrange to
11044 leave a pointer to the old stack arguments in a scratch
11045 register, which we here copy to a pseudo-register. The split
11046 stack prologue can't set the pseudo-register directly because
11047 it (the prologue) runs before any registers have been saved. */
11048
11049 scratch_regno = split_stack_prologue_scratch_regno ();
11050 if (scratch_regno != INVALID_REGNUM)
11051 {
11052 rtx reg;
11053 rtx_insn *seq;
11054
11055 reg = gen_reg_rtx (Pmode);
11056 cfun->machine->split_stack_varargs_pointer = reg;
11057
11058 start_sequence ();
11059 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11060 seq = get_insns ();
11061 end_sequence ();
11062
11063 push_topmost_sequence ();
11064 emit_insn_after (seq, entry_of_function ());
11065 pop_topmost_sequence ();
11066 }
11067 }
11068
11069 /* Only 64bit target needs something special. */
11070 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11071 {
11072 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11073 std_expand_builtin_va_start (valist, nextarg);
11074 else
11075 {
11076 rtx va_r, next;
11077
11078 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11079 next = expand_binop (ptr_mode, add_optab,
11080 cfun->machine->split_stack_varargs_pointer,
11081 crtl->args.arg_offset_rtx,
11082 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11083 convert_move (va_r, next, 0);
11084
11085 /* Store zero bounds for va_list. */
11086 if (chkp_function_instrumented_p (current_function_decl))
11087 chkp_expand_bounds_reset_for_mem (valist,
11088 make_tree (TREE_TYPE (valist),
11089 next));
11090
11091 }
11092 return;
11093 }
11094
11095 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11096 f_fpr = DECL_CHAIN (f_gpr);
11097 f_ovf = DECL_CHAIN (f_fpr);
11098 f_sav = DECL_CHAIN (f_ovf);
11099
11100 valist = build_simple_mem_ref (valist);
11101 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11102 /* The following should be folded into the MEM_REF offset. */
11103 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11104 f_gpr, NULL_TREE);
11105 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11106 f_fpr, NULL_TREE);
11107 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11108 f_ovf, NULL_TREE);
11109 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11110 f_sav, NULL_TREE);
11111
11112 /* Count number of gp and fp argument registers used. */
11113 words = crtl->args.info.words;
11114 n_gpr = crtl->args.info.regno;
11115 n_fpr = crtl->args.info.sse_regno;
11116
11117 if (cfun->va_list_gpr_size)
11118 {
11119 type = TREE_TYPE (gpr);
11120 t = build2 (MODIFY_EXPR, type,
11121 gpr, build_int_cst (type, n_gpr * 8));
11122 TREE_SIDE_EFFECTS (t) = 1;
11123 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11124 }
11125
11126 if (TARGET_SSE && cfun->va_list_fpr_size)
11127 {
11128 type = TREE_TYPE (fpr);
11129 t = build2 (MODIFY_EXPR, type, fpr,
11130 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11131 TREE_SIDE_EFFECTS (t) = 1;
11132 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11133 }
11134
11135 /* Find the overflow area. */
11136 type = TREE_TYPE (ovf);
11137 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11138 ovf_rtx = crtl->args.internal_arg_pointer;
11139 else
11140 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11141 t = make_tree (type, ovf_rtx);
11142 if (words != 0)
11143 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11144
11145 /* Store zero bounds for overflow area pointer. */
11146 if (chkp_function_instrumented_p (current_function_decl))
11147 chkp_expand_bounds_reset_for_mem (ovf, t);
11148
11149 t = build2 (MODIFY_EXPR, type, ovf, t);
11150 TREE_SIDE_EFFECTS (t) = 1;
11151 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11152
11153 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11154 {
11155 /* Find the register save area.
11156 The function prologue saves it right above the stack frame. */
11157 type = TREE_TYPE (sav);
11158 t = make_tree (type, frame_pointer_rtx);
11159 if (!ix86_varargs_gpr_size)
11160 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11161
11162 /* Store zero bounds for save area pointer. */
11163 if (chkp_function_instrumented_p (current_function_decl))
11164 chkp_expand_bounds_reset_for_mem (sav, t);
11165
11166 t = build2 (MODIFY_EXPR, type, sav, t);
11167 TREE_SIDE_EFFECTS (t) = 1;
11168 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11169 }
11170 }
11171
11172 /* Implement va_arg. */
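   /* Sketch of the register case handled below: if gp_offset/fp_offset
      show that enough registers remain for the argument, fetch it from
      reg_save_area (copying through a temporary when a struct spans
      both register classes or is over-aligned) and bump the offsets;
      otherwise fall through to the overflow area, align it as required,
      read the value and advance overflow_arg_area.  */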
11173
11174 static tree
11175 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11176 gimple_seq *post_p)
11177 {
11178 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11179 tree f_gpr, f_fpr, f_ovf, f_sav;
11180 tree gpr, fpr, ovf, sav, t;
11181 int size, rsize;
11182 tree lab_false, lab_over = NULL_TREE;
11183 tree addr, t2;
11184 rtx container;
11185 int indirect_p = 0;
11186 tree ptrtype;
11187 machine_mode nat_mode;
11188 unsigned int arg_boundary;
11189
11190 /* Only 64bit target needs something special. */
11191 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11192 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11193
11194 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11195 f_fpr = DECL_CHAIN (f_gpr);
11196 f_ovf = DECL_CHAIN (f_fpr);
11197 f_sav = DECL_CHAIN (f_ovf);
11198
11199 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11200 valist, f_gpr, NULL_TREE);
11201
11202 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11203 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11204 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11205
11206 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11207 if (indirect_p)
11208 type = build_pointer_type (type);
11209 size = int_size_in_bytes (type);
11210 rsize = CEIL (size, UNITS_PER_WORD);
11211
11212 nat_mode = type_natural_mode (type, NULL, false);
11213 switch (nat_mode)
11214 {
11215 case V8SFmode:
11216 case V8SImode:
11217 case V32QImode:
11218 case V16HImode:
11219 case V4DFmode:
11220 case V4DImode:
11221 case V16SFmode:
11222 case V16SImode:
11223 case V64QImode:
11224 case V32HImode:
11225 case V8DFmode:
11226 case V8DImode:
11227 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
11228 if (!TARGET_64BIT_MS_ABI)
11229 {
11230 container = NULL;
11231 break;
11232 }
11233 /* FALLTHRU */
11234
11235 default:
11236 container = construct_container (nat_mode, TYPE_MODE (type),
11237 type, 0, X86_64_REGPARM_MAX,
11238 X86_64_SSE_REGPARM_MAX, intreg,
11239 0);
11240 break;
11241 }
11242
11243 /* Pull the value out of the saved registers. */
11244
11245 addr = create_tmp_var (ptr_type_node, "addr");
11246
11247 if (container)
11248 {
11249 int needed_intregs, needed_sseregs;
11250 bool need_temp;
11251 tree int_addr, sse_addr;
11252
11253 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11254 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11255
11256 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11257
11258 need_temp = (!REG_P (container)
11259 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11260 || TYPE_ALIGN (type) > 128));
11261
11262 /* If we are passing a structure, verify that it occupies a consecutive
11263 block in the register save area. If not, we need to do moves. */
11264 if (!need_temp && !REG_P (container))
11265 {
11266 /* Verify that all registers are strictly consecutive. */
11267 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11268 {
11269 int i;
11270
11271 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11272 {
11273 rtx slot = XVECEXP (container, 0, i);
11274 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11275 || INTVAL (XEXP (slot, 1)) != i * 16)
11276 need_temp = true;
11277 }
11278 }
11279 else
11280 {
11281 int i;
11282
11283 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11284 {
11285 rtx slot = XVECEXP (container, 0, i);
11286 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11287 || INTVAL (XEXP (slot, 1)) != i * 8)
11288 need_temp = true;
11289 }
11290 }
11291 }
11292 if (!need_temp)
11293 {
11294 int_addr = addr;
11295 sse_addr = addr;
11296 }
11297 else
11298 {
11299 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11300 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11301 }
11302
11303 /* First ensure that we fit completely in registers. */
11304 if (needed_intregs)
11305 {
11306 t = build_int_cst (TREE_TYPE (gpr),
11307 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11308 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11309 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11310 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11311 gimplify_and_add (t, pre_p);
11312 }
11313 if (needed_sseregs)
11314 {
11315 t = build_int_cst (TREE_TYPE (fpr),
11316 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11317 + X86_64_REGPARM_MAX * 8);
11318 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11319 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11320 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11321 gimplify_and_add (t, pre_p);
11322 }
11323
11324 /* Compute index to start of area used for integer regs. */
11325 if (needed_intregs)
11326 {
11327 /* int_addr = gpr + sav; */
11328 t = fold_build_pointer_plus (sav, gpr);
11329 gimplify_assign (int_addr, t, pre_p);
11330 }
11331 if (needed_sseregs)
11332 {
11333 /* sse_addr = fpr + sav; */
11334 t = fold_build_pointer_plus (sav, fpr);
11335 gimplify_assign (sse_addr, t, pre_p);
11336 }
11337 if (need_temp)
11338 {
11339 int i, prev_size = 0;
11340 tree temp = create_tmp_var (type, "va_arg_tmp");
11341
11342 /* addr = &temp; */
11343 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11344 gimplify_assign (addr, t, pre_p);
11345
11346 for (i = 0; i < XVECLEN (container, 0); i++)
11347 {
11348 rtx slot = XVECEXP (container, 0, i);
11349 rtx reg = XEXP (slot, 0);
11350 machine_mode mode = GET_MODE (reg);
11351 tree piece_type;
11352 tree addr_type;
11353 tree daddr_type;
11354 tree src_addr, src;
11355 int src_offset;
11356 tree dest_addr, dest;
11357 int cur_size = GET_MODE_SIZE (mode);
11358
11359 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11360 prev_size = INTVAL (XEXP (slot, 1));
11361 if (prev_size + cur_size > size)
11362 {
11363 cur_size = size - prev_size;
11364 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11365 if (mode == BLKmode)
11366 mode = QImode;
11367 }
11368 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11369 if (mode == GET_MODE (reg))
11370 addr_type = build_pointer_type (piece_type);
11371 else
11372 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11373 true);
11374 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11375 true);
11376
11377 if (SSE_REGNO_P (REGNO (reg)))
11378 {
11379 src_addr = sse_addr;
11380 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11381 }
11382 else
11383 {
11384 src_addr = int_addr;
11385 src_offset = REGNO (reg) * 8;
11386 }
11387 src_addr = fold_convert (addr_type, src_addr);
11388 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11389
11390 dest_addr = fold_convert (daddr_type, addr);
11391 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11392 if (cur_size == GET_MODE_SIZE (mode))
11393 {
11394 src = build_va_arg_indirect_ref (src_addr);
11395 dest = build_va_arg_indirect_ref (dest_addr);
11396
11397 gimplify_assign (dest, src, pre_p);
11398 }
11399 else
11400 {
11401 tree copy
11402 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11403 3, dest_addr, src_addr,
11404 size_int (cur_size));
11405 gimplify_and_add (copy, pre_p);
11406 }
11407 prev_size += cur_size;
11408 }
11409 }
11410
11411 if (needed_intregs)
11412 {
11413 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11414 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11415 gimplify_assign (gpr, t, pre_p);
11416 }
11417
11418 if (needed_sseregs)
11419 {
11420 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11421 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11422 gimplify_assign (unshare_expr (fpr), t, pre_p);
11423 }
11424
11425 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11426
11427 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11428 }
11429
11430 /* ... otherwise out of the overflow area. */
11431
11432 /* When the caller aligns a parameter on the stack, an alignment beyond
11433 MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
11434 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
11435 caller. */
11436 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11437 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11438 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11439
11440 /* Care for on-stack alignment if needed. */
11441 if (arg_boundary <= 64 || size == 0)
11442 t = ovf;
11443 else
11444 {
11445 HOST_WIDE_INT align = arg_boundary / 8;
11446 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11447 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11448 build_int_cst (TREE_TYPE (t), -align));
11449 }
11450
11451 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11452 gimplify_assign (addr, t, pre_p);
11453
11454 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11455 gimplify_assign (unshare_expr (ovf), t, pre_p);
11456
11457 if (container)
11458 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11459
11460 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11461 addr = fold_convert (ptrtype, addr);
11462
11463 if (indirect_p)
11464 addr = build_va_arg_indirect_ref (addr);
11465 return build_va_arg_indirect_ref (addr);
11466 }
11467 \f
11468 /* Return true if OPNUM's MEM should be matched
11469 in movabs* patterns. */
11470
11471 bool
11472 ix86_check_movabs (rtx insn, int opnum)
11473 {
11474 rtx set, mem;
11475
11476 set = PATTERN (insn);
11477 if (GET_CODE (set) == PARALLEL)
11478 set = XVECEXP (set, 0, 0);
11479 gcc_assert (GET_CODE (set) == SET);
11480 mem = XEXP (set, opnum);
11481 while (SUBREG_P (mem))
11482 mem = SUBREG_REG (mem);
11483 gcc_assert (MEM_P (mem));
11484 return volatile_ok || !MEM_VOLATILE_P (mem);
11485 }
11486
11487 /* Return false if INSN contains a MEM with a non-default address space. */
11488 bool
11489 ix86_check_no_addr_space (rtx insn)
11490 {
11491 subrtx_var_iterator::array_type array;
11492 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11493 {
11494 rtx x = *iter;
11495 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11496 return false;
11497 }
11498 return true;
11499 }
11500 \f
11501 /* Initialize the table of extra 80387 mathematical constants. */
11502
11503 static void
11504 init_ext_80387_constants (void)
11505 {
11506 static const char * cst[5] =
11507 {
11508 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11509 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11510 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11511 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11512 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11513 };
11514 int i;
11515
11516 for (i = 0; i < 5; i++)
11517 {
11518 real_from_string (&ext_80387_constants_table[i], cst[i]);
11519 /* Ensure each constant is rounded to XFmode precision. */
11520 real_convert (&ext_80387_constants_table[i],
11521 XFmode, &ext_80387_constants_table[i]);
11522 }
11523
11524 ext_80387_constants_init = 1;
11525 }
11526
11527 /* Return non-zero if the constant is something that
11528 can be loaded with a special instruction. */
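   Return values used below: -1 not an x87 constant, 0 no special
   load, 1 fldz (0.0), 2 fld1 (1.0), 3..7 the extended constants
   (fldlg2, fldln2, fldl2e, fldl2t, fldpi), 8 -0.0 and 9 -1.0
   (the last two loaded as a load-plus-fchs sequence).  */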
11529
11530 int
11531 standard_80387_constant_p (rtx x)
11532 {
11533 machine_mode mode = GET_MODE (x);
11534
11535 const REAL_VALUE_TYPE *r;
11536
11537 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11538 return -1;
11539
11540 if (x == CONST0_RTX (mode))
11541 return 1;
11542 if (x == CONST1_RTX (mode))
11543 return 2;
11544
11545 r = CONST_DOUBLE_REAL_VALUE (x);
11546
11547 /* For XFmode constants, try to find a special 80387 instruction when
11548 optimizing for size or on those CPUs that benefit from them. */
11549 if (mode == XFmode
11550 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11551 {
11552 int i;
11553
11554 if (! ext_80387_constants_init)
11555 init_ext_80387_constants ();
11556
11557 for (i = 0; i < 5; i++)
11558 if (real_identical (r, &ext_80387_constants_table[i]))
11559 return i + 3;
11560 }
11561
11562 /* A load of the constant -0.0 or -1.0 will be split into an
11563 fldz;fchs or fld1;fchs sequence. */
11564 if (real_isnegzero (r))
11565 return 8;
11566 if (real_identical (r, &dconstm1))
11567 return 9;
11568
11569 return 0;
11570 }
11571
11572 /* Return the opcode of the special instruction to be used to load
11573 the constant X. */
11574
11575 const char *
11576 standard_80387_constant_opcode (rtx x)
11577 {
11578 switch (standard_80387_constant_p (x))
11579 {
11580 case 1:
11581 return "fldz";
11582 case 2:
11583 return "fld1";
11584 case 3:
11585 return "fldlg2";
11586 case 4:
11587 return "fldln2";
11588 case 5:
11589 return "fldl2e";
11590 case 6:
11591 return "fldl2t";
11592 case 7:
11593 return "fldpi";
11594 case 8:
11595 case 9:
11596 return "#";
11597 default:
11598 gcc_unreachable ();
11599 }
11600 }
11601
11602 /* Return the CONST_DOUBLE representing the 80387 constant that is
11603 loaded by the specified special instruction. The argument IDX
11604 matches the return value from standard_80387_constant_p. */
11605
11606 rtx
11607 standard_80387_constant_rtx (int idx)
11608 {
11609 int i;
11610
11611 if (! ext_80387_constants_init)
11612 init_ext_80387_constants ();
11613
11614 switch (idx)
11615 {
11616 case 3:
11617 case 4:
11618 case 5:
11619 case 6:
11620 case 7:
11621 i = idx - 3;
11622 break;
11623
11624 default:
11625 gcc_unreachable ();
11626 }
11627
11628 return const_double_from_real_value (ext_80387_constants_table[i],
11629 XFmode);
11630 }
11631
11632 /* Return 1 if X is all zero bits and 2 if X is all one bits in a
11633 supported SSE/AVX vector mode; return 0 otherwise. */
11634
11635 int
11636 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11637 {
11638 machine_mode mode;
11639
11640 if (!TARGET_SSE)
11641 return 0;
11642
11643 mode = GET_MODE (x);
11644
11645 if (x == const0_rtx || const0_operand (x, mode))
11646 return 1;
11647
11648 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11649 {
11650 /* VOIDmode integer constant, get mode from the predicate. */
11651 if (mode == VOIDmode)
11652 mode = pred_mode;
11653
11654 switch (GET_MODE_SIZE (mode))
11655 {
11656 case 64:
11657 if (TARGET_AVX512F)
11658 return 2;
11659 break;
11660 case 32:
11661 if (TARGET_AVX2)
11662 return 2;
11663 break;
11664 case 16:
11665 if (TARGET_SSE2)
11666 return 2;
11667 break;
11668 case 0:
11669 /* VOIDmode */
11670 gcc_unreachable ();
11671 default:
11672 break;
11673 }
11674 }
11675
11676 return 0;
11677 }
11678
11679 /* Return the opcode of the special instruction to be used to load
11680 the constant X. */
11681
11682 const char *
11683 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11684 {
11685 machine_mode mode;
11686
11687 gcc_assert (TARGET_SSE);
11688
11689 mode = GET_MODE (x);
11690
11691 if (x == const0_rtx || const0_operand (x, mode))
11692 {
11693 switch (get_attr_mode (insn))
11694 {
11695 case MODE_XI:
11696 return "vpxord\t%g0, %g0, %g0";
11697 case MODE_OI:
11698 return (TARGET_AVX512VL
11699 ? "vpxord\t%x0, %x0, %x0"
11700 : "vpxor\t%x0, %x0, %x0");
11701 case MODE_TI:
11702 return (TARGET_AVX512VL
11703 ? "vpxord\t%t0, %t0, %t0"
11704 : "%vpxor\t%0, %d0");
11705
11706 case MODE_V8DF:
11707 return (TARGET_AVX512DQ
11708 ? "vxorpd\t%g0, %g0, %g0"
11709 : "vpxorq\t%g0, %g0, %g0");
11710 case MODE_V4DF:
11711 return "vxorpd\t%x0, %x0, %x0";
11712 case MODE_V2DF:
11713 return "%vxorpd\t%0, %d0";
11714
11715 case MODE_V16SF:
11716 return (TARGET_AVX512DQ
11717 ? "vxorps\t%g0, %g0, %g0"
11718 : "vpxord\t%g0, %g0, %g0");
11719 case MODE_V8SF:
11720 return "vxorps\t%x0, %x0, %x0";
11721 case MODE_V4SF:
11722 return "%vxorps\t%0, %d0";
11723
11724 default:
11725 gcc_unreachable ();
11726 }
11727 }
11728 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11729 {
11730 enum attr_mode insn_mode = get_attr_mode (insn);
11731
11732 switch (insn_mode)
11733 {
11734 case MODE_XI:
11735 case MODE_V8DF:
11736 case MODE_V16SF:
11737 gcc_assert (TARGET_AVX512F);
11738 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11739
11740 case MODE_OI:
11741 case MODE_V4DF:
11742 case MODE_V8SF:
11743 gcc_assert (TARGET_AVX2);
11744 /* FALLTHRU */
11745 case MODE_TI:
11746 case MODE_V2DF:
11747 case MODE_V4SF:
11748 gcc_assert (TARGET_SSE2);
11749 return (TARGET_AVX
11750 ? "vpcmpeqd\t%0, %0, %0"
11751 : "pcmpeqd\t%0, %0");
11752
11753 default:
11754 gcc_unreachable ();
11755 }
11756 }
11757
11758 gcc_unreachable ();
11759 }
11760
11761 /* Returns true if INSN can be transformed from a memory load
11762 to a supported FP constant load. */
11763
11764 bool
11765 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11766 {
11767 rtx src = find_constant_src (insn);
11768
11769 gcc_assert (REG_P (dst));
11770
11771 if (src == NULL
11772 || (SSE_REGNO_P (REGNO (dst))
11773 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11774 || (STACK_REGNO_P (REGNO (dst))
11775 && standard_80387_constant_p (src) < 1))
11776 return false;
11777
11778 return true;
11779 }
11780
11781 /* Returns true if OP contains a symbol reference. */
11782
11783 bool
11784 symbolic_reference_mentioned_p (rtx op)
11785 {
11786 const char *fmt;
11787 int i;
11788
11789 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11790 return true;
11791
11792 fmt = GET_RTX_FORMAT (GET_CODE (op));
11793 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11794 {
11795 if (fmt[i] == 'E')
11796 {
11797 int j;
11798
11799 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11800 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11801 return true;
11802 }
11803
11804 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11805 return true;
11806 }
11807
11808 return false;
11809 }
11810
11811 /* Return true if it is appropriate to emit `ret' instructions in the
11812 body of a function. Do this only if the epilogue is simple, needing a
11813 couple of insns. Prior to reloading, we can't tell how many registers
11814 must be saved, so return false then. Return false if there is no frame
11815 marker to de-allocate. */
11816
11817 bool
11818 ix86_can_use_return_insn_p (void)
11819 {
11820 struct ix86_frame frame;
11821
11822 /* Don't use `ret' instruction in interrupt handler. */
11823 if (! reload_completed
11824 || frame_pointer_needed
11825 || cfun->machine->func_type != TYPE_NORMAL)
11826 return 0;
11827
11828 /* Don't allow more than 32k pop, since that's all we can do
11829 with one instruction. */
11830 if (crtl->args.pops_args && crtl->args.size >= 32768)
11831 return 0;
11832
11833 ix86_compute_frame_layout (&frame);
11834 return (frame.stack_pointer_offset == UNITS_PER_WORD
11835 && (frame.nregs + frame.nsseregs) == 0);
11836 }
11837 \f
11838 /* Value should be nonzero if functions must have frame pointers.
11839 Zero means the frame pointer need not be set up (and parms may
11840 be accessed via the stack pointer) in functions that seem suitable. */
11841
11842 static bool
11843 ix86_frame_pointer_required (void)
11844 {
11845 /* If we accessed previous frames, then the generated code expects
11846 to be able to access the saved ebp value in our frame. */
11847 if (cfun->machine->accesses_prev_frame)
11848 return true;
11849
11850 /* Several x86 OSes need a frame pointer for other reasons,
11851 usually pertaining to setjmp. */
11852 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11853 return true;
11854
11855 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
11856 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11857 return true;
11858
11859 /* Win64 SEH: very large frames need a frame pointer, as the maximum
11860 stack allocation is 4GB. */
11861 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11862 return true;
11863
11864 /* SSE saves require a frame pointer when the stack is misaligned. */
11865 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11866 return true;
11867
11868 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11869 turns off the frame pointer by default. Turn it back on now if
11870 we've not got a leaf function. */
11871 if (TARGET_OMIT_LEAF_FRAME_POINTER
11872 && (!crtl->is_leaf
11873 || ix86_current_function_calls_tls_descriptor))
11874 return true;
11875
11876 if (crtl->profile && !flag_fentry)
11877 return true;
11878
11879 return false;
11880 }
11881
11882 /* Record that the current function accesses previous call frames. */
11883
11884 void
11885 ix86_setup_frame_addresses (void)
11886 {
11887 cfun->machine->accesses_prev_frame = 1;
11888 }
11889 \f
11890 #ifndef USE_HIDDEN_LINKONCE
11891 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11892 # define USE_HIDDEN_LINKONCE 1
11893 # else
11894 # define USE_HIDDEN_LINKONCE 0
11895 # endif
11896 #endif
11897
11898 static int pic_labels_used;
11899
11900 /* Fills in the label name that should be used for a pc thunk for
11901 the given register. */
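   /* E.g. for the %ebx thunk this yields "__x86.get_pc_thunk.bx" when
      hidden linkonce sections are usable, and a local "LPR"-prefixed
      label otherwise.  */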
11902
11903 static void
11904 get_pc_thunk_name (char name[32], unsigned int regno)
11905 {
11906 gcc_assert (!TARGET_64BIT);
11907
11908 if (USE_HIDDEN_LINKONCE)
11909 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11910 else
11911 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11912 }
11913
11914
11915 /* This function generates, for -fpic, the pc thunks that load a
11916 register with the return address of the caller and then return. */
11917
11918 static void
11919 ix86_code_end (void)
11920 {
11921 rtx xops[2];
11922 int regno;
11923
11924 for (regno = AX_REG; regno <= SP_REG; regno++)
11925 {
11926 char name[32];
11927 tree decl;
11928
11929 if (!(pic_labels_used & (1 << regno)))
11930 continue;
11931
11932 get_pc_thunk_name (name, regno);
11933
11934 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11935 get_identifier (name),
11936 build_function_type_list (void_type_node, NULL_TREE));
11937 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11938 NULL_TREE, void_type_node);
11939 TREE_PUBLIC (decl) = 1;
11940 TREE_STATIC (decl) = 1;
11941 DECL_IGNORED_P (decl) = 1;
11942
11943 #if TARGET_MACHO
11944 if (TARGET_MACHO)
11945 {
11946 switch_to_section (darwin_sections[picbase_thunk_section]);
11947 fputs ("\t.weak_definition\t", asm_out_file);
11948 assemble_name (asm_out_file, name);
11949 fputs ("\n\t.private_extern\t", asm_out_file);
11950 assemble_name (asm_out_file, name);
11951 putc ('\n', asm_out_file);
11952 ASM_OUTPUT_LABEL (asm_out_file, name);
11953 DECL_WEAK (decl) = 1;
11954 }
11955 else
11956 #endif
11957 if (USE_HIDDEN_LINKONCE)
11958 {
11959 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11960
11961 targetm.asm_out.unique_section (decl, 0);
11962 switch_to_section (get_named_section (decl, NULL, 0));
11963
11964 targetm.asm_out.globalize_label (asm_out_file, name);
11965 fputs ("\t.hidden\t", asm_out_file);
11966 assemble_name (asm_out_file, name);
11967 putc ('\n', asm_out_file);
11968 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11969 }
11970 else
11971 {
11972 switch_to_section (text_section);
11973 ASM_OUTPUT_LABEL (asm_out_file, name);
11974 }
11975
11976 DECL_INITIAL (decl) = make_node (BLOCK);
11977 current_function_decl = decl;
11978 allocate_struct_function (decl, false);
11979 init_function_start (decl);
11980 /* We're about to hide the function body from callees of final_* by
11981 emitting it directly; tell them we're a thunk, if they care. */
11982 cfun->is_thunk = true;
11983 first_function_block_is_cold = false;
11984 /* Make sure unwind info is emitted for the thunk if needed. */
11985 final_start_function (emit_barrier (), asm_out_file, 1);
11986
11987 /* Pad stack IP move with 4 instructions (two NOPs count
11988 as one instruction). */
11989 if (TARGET_PAD_SHORT_FUNCTION)
11990 {
11991 int i = 8;
11992
11993 while (i--)
11994 fputs ("\tnop\n", asm_out_file);
11995 }
11996
11997 xops[0] = gen_rtx_REG (Pmode, regno);
11998 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11999 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12000 output_asm_insn ("%!ret", NULL);
12001 final_end_function ();
12002 init_insn_lengths ();
12003 free_after_compilation (cfun);
12004 set_cfun (NULL);
12005 current_function_decl = NULL;
12006 }
12007
12008 if (flag_split_stack)
12009 file_end_indicate_split_stack ();
12010 }
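/* Illustrative sketch (not from the original source): for the %ebx case the
   thunk emitted above is simply

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   i.e. it copies the caller's return address from the stack into the
   requested register, which gives PIC code the address of the instruction
   following the call. */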
12011
12012 /* Emit code for the SET_GOT patterns. */
12013
12014 const char *
12015 output_set_got (rtx dest, rtx label)
12016 {
12017 rtx xops[3];
12018
12019 xops[0] = dest;
12020
12021 if (TARGET_VXWORKS_RTP && flag_pic)
12022 {
12023 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12024 xops[2] = gen_rtx_MEM (Pmode,
12025 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12026 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12027
12028 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12029 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12030 an unadorned address. */
12031 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12032 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12033 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12034 return "";
12035 }
12036
12037 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12038
12039 if (flag_pic)
12040 {
12041 char name[32];
12042 get_pc_thunk_name (name, REGNO (dest));
12043 pic_labels_used |= 1 << REGNO (dest);
12044
12045 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12046 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12047 output_asm_insn ("%!call\t%X2", xops);
12048
12049 #if TARGET_MACHO
12050 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12051 This is what will be referenced by the Mach-O PIC subsystem. */
12052 if (machopic_should_output_picbase_label () || !label)
12053 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12054
12055 /* When we are restoring the pic base at the site of a nonlocal label,
12056 and we decided to emit the pic base above, we will still output a
12057 local label used for calculating the correction offset (even though
12058 the offset will be 0 in that case). */
12059 if (label)
12060 targetm.asm_out.internal_label (asm_out_file, "L",
12061 CODE_LABEL_NUMBER (label));
12062 #endif
12063 }
12064 else
12065 {
12066 if (TARGET_MACHO)
12067 /* We don't need a pic base, we're not producing pic. */
12068 gcc_unreachable ();
12069
12070 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12071 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12072 targetm.asm_out.internal_label (asm_out_file, "L",
12073 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12074 }
12075
12076 if (!TARGET_MACHO)
12077 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12078
12079 return "";
12080 }
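/* Illustrative example (not from the original source): for the usual
   ELF -fpic case with DEST == %ebx, the sequence printed above is

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   where the call leaves the address of the add instruction in %ebx and the
   magic _GLOBAL_OFFSET_TABLE_ operand is resolved to the displacement from
   that point to the GOT. */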
12081
12082 /* Generate a "push" pattern for input ARG. */
12083
12084 static rtx
12085 gen_push (rtx arg)
12086 {
12087 struct machine_function *m = cfun->machine;
12088
12089 if (m->fs.cfa_reg == stack_pointer_rtx)
12090 m->fs.cfa_offset += UNITS_PER_WORD;
12091 m->fs.sp_offset += UNITS_PER_WORD;
12092
12093 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12094 arg = gen_rtx_REG (word_mode, REGNO (arg));
12095
12096 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12097 gen_rtx_PRE_DEC (Pmode,
12098 stack_pointer_rtx)),
12099 arg);
12100 }
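/* A minimal sketch of the RTL built above, assuming 64-bit where both
   word_mode and Pmode are DImode (not from the original source):

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI <arg>))

   i.e. a store through a pre-decremented stack pointer, matching the
   hardware push instruction. */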
12101
12102 /* Generate a "pop" pattern for input ARG. */
12103
12104 static rtx
12105 gen_pop (rtx arg)
12106 {
12107 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12108 arg = gen_rtx_REG (word_mode, REGNO (arg));
12109
12110 return gen_rtx_SET (arg,
12111 gen_rtx_MEM (word_mode,
12112 gen_rtx_POST_INC (Pmode,
12113 stack_pointer_rtx)));
12114 }
12115
12116 /* Return the number of an unused call-clobbered register if one is
12117 available for the entire function, otherwise INVALID_REGNUM. */
12118
12119 static unsigned int
12120 ix86_select_alt_pic_regnum (void)
12121 {
12122 if (ix86_use_pseudo_pic_reg ())
12123 return INVALID_REGNUM;
12124
12125 if (crtl->is_leaf
12126 && !crtl->profile
12127 && !ix86_current_function_calls_tls_descriptor)
12128 {
12129 int i, drap;
12130 /* Can't use the same register for both PIC and DRAP. */
12131 if (crtl->drap_reg)
12132 drap = REGNO (crtl->drap_reg);
12133 else
12134 drap = -1;
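/* Hard register numbers 0..2 are %eax, %edx and %ecx on IA-32; scan them
   for one that is never live in this function. */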
12135 for (i = 2; i >= 0; --i)
12136 if (i != drap && !df_regs_ever_live_p (i))
12137 return i;
12138 }
12139
12140 return INVALID_REGNUM;
12141 }
12142
12143 /* Return true if REGNO is used by the epilogue. */
12144
12145 bool
12146 ix86_epilogue_uses (int regno)
12147 {
12148 /* If there are no caller-saved registers, we preserve all registers,
12149 except for MMX and x87 registers which aren't supported when saving
12150 and restoring registers. Don't explicitly save SP register since
12151 it is always preserved. */
12152 return (epilogue_completed
12153 && cfun->machine->no_caller_saved_registers
12154 && !fixed_regs[regno]
12155 && !STACK_REGNO_P (regno)
12156 && !MMX_REGNO_P (regno));
12157 }
12158
12159 /* Return nonzero if register REGNO can be used as a scratch register
12160 in peephole2. */
12161
12162 static bool
12163 ix86_hard_regno_scratch_ok (unsigned int regno)
12164 {
12165 /* If there are no caller-saved registers, we can't use any register
12166 as a scratch register after epilogue and use REGNO as scratch
12167 register only if it has been used before to avoid saving and
12168 restoring it. */
12169 return (!cfun->machine->no_caller_saved_registers
12170 || (!epilogue_completed
12171 && df_regs_ever_live_p (regno)));
12172 }
12173
12174 /* Return true if register class CL should be an additional allocno
12175 class. */
12176
12177 static bool
12178 ix86_additional_allocno_class_p (reg_class_t cl)
12179 {
12180 return cl == MOD4_SSE_REGS;
12181 }
12182
12183 /* Return TRUE if we need to save REGNO. */
12184
12185 static bool
12186 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12187 {
12188 /* If there are no caller-saved registers, we preserve all registers,
12189 except for MMX and x87 registers which aren't supported when saving
12190 and restoring registers. Don't explicitly save SP register since
12191 it is always preserved. */
12192 if (cfun->machine->no_caller_saved_registers)
12193 {
12194 /* Don't preserve registers used for function return value. */
12195 rtx reg = crtl->return_rtx;
12196 if (reg)
12197 {
12198 unsigned int i = REGNO (reg);
12199 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12200 while (nregs-- > 0)
12201 if ((i + nregs) == regno)
12202 return false;
12203
12204 reg = crtl->return_bnd;
12205 if (reg)
12206 {
12207 i = REGNO (reg);
12208 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12209 while (nregs-- > 0)
12210 if ((i + nregs) == regno)
12211 return false;
12212 }
12213 }
12214
12215 return (df_regs_ever_live_p (regno)
12216 && !fixed_regs[regno]
12217 && !STACK_REGNO_P (regno)
12218 && !MMX_REGNO_P (regno)
12219 && (regno != HARD_FRAME_POINTER_REGNUM
12220 || !frame_pointer_needed));
12221 }
12222
12223 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12224 && pic_offset_table_rtx)
12225 {
12226 if (ix86_use_pseudo_pic_reg ())
12227 {
12228 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12229 _mcount in prologue. */
12230 if (!TARGET_64BIT && flag_pic && crtl->profile)
12231 return true;
12232 }
12233 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12234 || crtl->profile
12235 || crtl->calls_eh_return
12236 || crtl->uses_const_pool
12237 || cfun->has_nonlocal_label)
12238 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12239 }
12240
12241 if (crtl->calls_eh_return && maybe_eh_return)
12242 {
12243 unsigned i;
12244 for (i = 0; ; i++)
12245 {
12246 unsigned test = EH_RETURN_DATA_REGNO (i);
12247 if (test == INVALID_REGNUM)
12248 break;
12249 if (test == regno)
12250 return true;
12251 }
12252 }
12253
12254 if (crtl->drap_reg
12255 && regno == REGNO (crtl->drap_reg)
12256 && !cfun->machine->no_drap_save_restore)
12257 return true;
12258
12259 return (df_regs_ever_live_p (regno)
12260 && !call_used_regs[regno]
12261 && !fixed_regs[regno]
12262 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12263 }
12264
12265 /* Return the number of saved general purpose registers. */
12266
12267 static int
12268 ix86_nsaved_regs (void)
12269 {
12270 int nregs = 0;
12271 int regno;
12272
12273 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12274 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12275 nregs ++;
12276 return nregs;
12277 }
12278
12279 /* Return number of saved SSE registers. */
12280
12281 static int
12282 ix86_nsaved_sseregs (void)
12283 {
12284 int nregs = 0;
12285 int regno;
12286
12287 if (!TARGET_64BIT_MS_ABI)
12288 return 0;
12289 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12290 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12291 nregs ++;
12292 return nregs;
12293 }
12294
12295 /* Given FROM and TO register numbers, say whether this elimination is
12296 allowed. If stack alignment is needed, we can only replace argument
12297 pointer with hard frame pointer, or replace frame pointer with stack
12298 pointer. Otherwise, frame pointer elimination is automatically
12299 handled and all other eliminations are valid. */
12300
12301 static bool
12302 ix86_can_eliminate (const int from, const int to)
12303 {
12304 if (stack_realign_fp)
12305 return ((from == ARG_POINTER_REGNUM
12306 && to == HARD_FRAME_POINTER_REGNUM)
12307 || (from == FRAME_POINTER_REGNUM
12308 && to == STACK_POINTER_REGNUM));
12309 else
12310 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12311 }
12312
12313 /* Return the offset between two registers, one to be eliminated, and the other
12314 its replacement, at the start of a routine. */
12315
12316 HOST_WIDE_INT
12317 ix86_initial_elimination_offset (int from, int to)
12318 {
12319 struct ix86_frame frame;
12320 ix86_compute_frame_layout (&frame);
12321
12322 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12323 return frame.hard_frame_pointer_offset;
12324 else if (from == FRAME_POINTER_REGNUM
12325 && to == HARD_FRAME_POINTER_REGNUM)
12326 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12327 else
12328 {
12329 gcc_assert (to == STACK_POINTER_REGNUM);
12330
12331 if (from == ARG_POINTER_REGNUM)
12332 return frame.stack_pointer_offset;
12333
12334 gcc_assert (from == FRAME_POINTER_REGNUM);
12335 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12336 }
12337 }
12338
12339 /* In a dynamically-aligned function, we can't know the offset from
12340 stack pointer to frame pointer, so we must ensure that setjmp
12341 eliminates fp against the hard fp (%ebp) rather than trying to
12342 index from %esp up to the top of the frame across a gap that is
12343 of unknown (at compile-time) size. */
12344 static rtx
12345 ix86_builtin_setjmp_frame_value (void)
12346 {
12347 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12348 }
12349
12350 /* When using -fsplit-stack, the allocation routines set a field in
12351 the TCB to the bottom of the stack plus this much space, measured
12352 in bytes. */
12353
12354 #define SPLIT_STACK_AVAILABLE 256
12355
12356 /* Fill in the ix86_frame structure describing the frame of the currently compiled function. */
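/* Hedged worked example (not part of the original source): for a 32-bit
   function with a frame pointer, two saved general registers, 16 bytes of
   locals and 16-byte stack alignment, the walk below gives roughly

       return address            -> offset 4
       saved %ebp                -> hard_frame_pointer_offset = 8
       two saved registers       -> reg_save_offset = 16
       16 bytes of locals        -> frame_pointer_offset = 16,
                                    stack_pointer_offset = 32

   so the prologue must allocate stack_pointer_offset - sse_reg_save_offset
   = 16 bytes beyond the pushes. */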
12357
12358 static void
12359 ix86_compute_frame_layout (struct ix86_frame *frame)
12360 {
12361 unsigned HOST_WIDE_INT stack_alignment_needed;
12362 HOST_WIDE_INT offset;
12363 unsigned HOST_WIDE_INT preferred_alignment;
12364 HOST_WIDE_INT size = get_frame_size ();
12365 HOST_WIDE_INT to_allocate;
12366
12367 frame->nregs = ix86_nsaved_regs ();
12368 frame->nsseregs = ix86_nsaved_sseregs ();
12369
12370 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
12371 except for function prologues, leaf functions and when the default
12372 incoming stack boundary is overridden at the command line or via the
12373 force_align_arg_pointer attribute. */
12374 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12375 && (!crtl->is_leaf || cfun->calls_alloca != 0
12376 || ix86_current_function_calls_tls_descriptor
12377 || ix86_incoming_stack_boundary < 128))
12378 {
12379 crtl->preferred_stack_boundary = 128;
12380 crtl->stack_alignment_needed = 128;
12381 }
12382
12383 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12384 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12385
12386 gcc_assert (!size || stack_alignment_needed);
12387 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12388 gcc_assert (preferred_alignment <= stack_alignment_needed);
12389
12390 /* For SEH we have to limit the amount of code movement into the prologue.
12391 At present we do this via a BLOCKAGE, at which point there's very little
12392 scheduling that can be done, which means that there's very little point
12393 in doing anything except PUSHs. */
12394 if (TARGET_SEH)
12395 cfun->machine->use_fast_prologue_epilogue = false;
12396
12397 /* During a reload iteration the number of registers saved can change.
12398 Recompute the value as needed. Do not recompute when the number of registers
12399 didn't change, as reload makes multiple calls to this function and does not
12400 expect the decision to change within a single iteration. */
12401 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12402 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12403 {
12404 int count = frame->nregs;
12405 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12406
12407 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12408
12409 /* The fast prologue uses move instead of push to save registers. This
12410 is significantly longer, but also executes faster as modern hardware
12411 can execute the moves in parallel, but can't do that for push/pop.
12412
12413 Be careful about choosing which prologue to emit: when the function takes
12414 many instructions to execute we may as well use the slow version, likewise
12415 when the function is known to be outside a hot spot (this is known with
12416 feedback only). Weight the size of the function by the number of registers
12417 to save, as it is cheap to use one or two push instructions but very
12418 slow to use many of them. */
12419 if (count)
12420 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12421 if (node->frequency < NODE_FREQUENCY_NORMAL
12422 || (flag_branch_probabilities
12423 && node->frequency < NODE_FREQUENCY_HOT))
12424 cfun->machine->use_fast_prologue_epilogue = false;
12425 else
12426 cfun->machine->use_fast_prologue_epilogue
12427 = !expensive_function_p (count);
12428 }
12429
12430 frame->save_regs_using_mov
12431 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12432 /* If static stack checking is enabled and done with probes,
12433 the registers need to be saved before allocating the frame. */
12434 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12435
12436 /* Skip return address. */
12437 offset = UNITS_PER_WORD;
12438
12439 /* Skip pushed static chain. */
12440 if (ix86_static_chain_on_stack)
12441 offset += UNITS_PER_WORD;
12442
12443 /* Skip saved base pointer. */
12444 if (frame_pointer_needed)
12445 offset += UNITS_PER_WORD;
12446 frame->hfp_save_offset = offset;
12447
12448 /* The traditional frame pointer location is at the top of the frame. */
12449 frame->hard_frame_pointer_offset = offset;
12450
12451 /* Register save area */
12452 offset += frame->nregs * UNITS_PER_WORD;
12453 frame->reg_save_offset = offset;
12454
12455 /* On SEH target, registers are pushed just before the frame pointer
12456 location. */
12457 if (TARGET_SEH)
12458 frame->hard_frame_pointer_offset = offset;
12459
12460 /* Align and set SSE register save area. */
12461 if (frame->nsseregs)
12462 {
12463 /* The only ABI that has saved SSE registers (Win64) also has a
12464 16-byte aligned default stack, and thus we don't need to be
12465 within the re-aligned local stack frame to save them. If the
12466 incoming stack boundary is aligned to less than 16 bytes,
12467 unaligned moves of the SSE registers will be emitted, so there is
12468 no point in rounding the SSE register save area up to 16 bytes
12469 outside the re-aligned local stack frame. */
12470 if (ix86_incoming_stack_boundary >= 128)
12471 offset = ROUND_UP (offset, 16);
12472 offset += frame->nsseregs * 16;
12473 }
12474 frame->sse_reg_save_offset = offset;
12475
12476 /* The re-aligned stack starts here. Values before this point are not
12477 directly comparable with values below this point. In order to make
12478 sure that no value happens to be the same before and after, force
12479 the alignment computation below to add a non-zero value. */
12480 if (stack_realign_fp)
12481 offset = ROUND_UP (offset, stack_alignment_needed);
12482
12483 /* Va-arg area */
12484 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12485 offset += frame->va_arg_size;
12486
12487 /* Align start of frame for local function. */
12488 if (stack_realign_fp
12489 || offset != frame->sse_reg_save_offset
12490 || size != 0
12491 || !crtl->is_leaf
12492 || cfun->calls_alloca
12493 || ix86_current_function_calls_tls_descriptor)
12494 offset = ROUND_UP (offset, stack_alignment_needed);
12495
12496 /* Frame pointer points here. */
12497 frame->frame_pointer_offset = offset;
12498
12499 offset += size;
12500
12501 /* Add the outgoing arguments area. It can be skipped if we eliminated
12502 all the function calls as dead code.
12503 Skipping is however impossible when the function calls alloca, as the alloca
12504 expander assumes that the last crtl->outgoing_args_size bytes
12505 of the stack frame are unused. */
12506 if (ACCUMULATE_OUTGOING_ARGS
12507 && (!crtl->is_leaf || cfun->calls_alloca
12508 || ix86_current_function_calls_tls_descriptor))
12509 {
12510 offset += crtl->outgoing_args_size;
12511 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12512 }
12513 else
12514 frame->outgoing_arguments_size = 0;
12515
12516 /* Align stack boundary. Only needed if we're calling another function
12517 or using alloca. */
12518 if (!crtl->is_leaf || cfun->calls_alloca
12519 || ix86_current_function_calls_tls_descriptor)
12520 offset = ROUND_UP (offset, preferred_alignment);
12521
12522 /* We've reached end of stack frame. */
12523 frame->stack_pointer_offset = offset;
12524
12525 /* Size prologue needs to allocate. */
12526 to_allocate = offset - frame->sse_reg_save_offset;
12527
12528 if ((!to_allocate && frame->nregs <= 1)
12529 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12530 frame->save_regs_using_mov = false;
12531
12532 if (ix86_using_red_zone ()
12533 && crtl->sp_is_unchanging
12534 && crtl->is_leaf
12535 && !ix86_pc_thunk_call_expanded
12536 && !ix86_current_function_calls_tls_descriptor)
12537 {
12538 frame->red_zone_size = to_allocate;
12539 if (frame->save_regs_using_mov)
12540 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12541 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12542 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12543 }
12544 else
12545 frame->red_zone_size = 0;
12546 frame->stack_pointer_offset -= frame->red_zone_size;
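/* Background note (not from the original source): on the 64-bit SysV ABI
   the red zone is the 128-byte area below the stack pointer that leaf code
   may use without adjusting %rsp; folding small allocations into it here
   lets the prologue omit the stack adjustment entirely. */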
12547
12548 /* The SEH frame pointer location is near the bottom of the frame.
12549 This is enforced by the fact that the difference between the
12550 stack pointer and the frame pointer is limited to 240 bytes in
12551 the unwind data structure. */
12552 if (TARGET_SEH)
12553 {
12554 HOST_WIDE_INT diff;
12555
12556 /* If we can leave the frame pointer where it is, do so; this also returns
12557 the establisher frame for __builtin_frame_address (0). */
12558 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12559 if (diff <= SEH_MAX_FRAME_SIZE
12560 && (diff > 240 || (diff & 15) != 0)
12561 && !crtl->accesses_prior_frames)
12562 {
12563 /* Ideally we'd determine what portion of the local stack frame
12564 (within the constraint of the lowest 240) is most heavily used.
12565 But without that complication, simply bias the frame pointer
12566 by 128 bytes so as to maximize the amount of the local stack
12567 frame that is addressable with 8-bit offsets. */
12568 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
12569 }
12570 }
12571 }
12572
12573 /* This is semi-inlined memory_address_length, but simplified
12574 since we know that we're always dealing with reg+offset, and
12575 to avoid having to create and discard all that rtl. */
12576
12577 static inline int
12578 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12579 {
12580 int len = 4;
12581
12582 if (offset == 0)
12583 {
12584 /* EBP and R13 cannot be encoded without an offset. */
12585 len = (regno == BP_REG || regno == R13_REG);
12586 }
12587 else if (IN_RANGE (offset, -128, 127))
12588 len = 1;
12589
12590 /* ESP and R12 must be encoded with a SIB byte. */
12591 if (regno == SP_REG || regno == R12_REG)
12592 len++;
12593
12594 return len;
12595 }
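/* Illustrative encodings (not from the original source); the returned value
   counts the extra address bytes, i.e. displacement plus any SIB byte:

       0(%eax)     -> 0   (no displacement needed)
       0(%ebp)     -> 1   (%ebp always needs at least a disp8)
       16(%esp)    -> 2   (disp8 plus the mandatory SIB byte)
       1024(%ebx)  -> 4   (disp32) */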
12596
12597 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12598 The valid base registers are taken from CFUN->MACHINE->FS. */
12599
12600 static rtx
12601 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12602 {
12603 const struct machine_function *m = cfun->machine;
12604 rtx base_reg = NULL;
12605 HOST_WIDE_INT base_offset = 0;
12606
12607 if (m->use_fast_prologue_epilogue)
12608 {
12609 /* Choose the base register most likely to allow the most scheduling
12610 opportunities. Generally FP is valid throughout the function,
12611 while DRAP must be reloaded within the epilogue. But choose either
12612 over the SP due to increased encoding size. */
12613
12614 if (m->fs.fp_valid)
12615 {
12616 base_reg = hard_frame_pointer_rtx;
12617 base_offset = m->fs.fp_offset - cfa_offset;
12618 }
12619 else if (m->fs.drap_valid)
12620 {
12621 base_reg = crtl->drap_reg;
12622 base_offset = 0 - cfa_offset;
12623 }
12624 else if (m->fs.sp_valid)
12625 {
12626 base_reg = stack_pointer_rtx;
12627 base_offset = m->fs.sp_offset - cfa_offset;
12628 }
12629 }
12630 else
12631 {
12632 HOST_WIDE_INT toffset;
12633 int len = 16, tlen;
12634
12635 /* Choose the base register with the smallest address encoding.
12636 With a tie, choose FP > DRAP > SP. */
12637 if (m->fs.sp_valid)
12638 {
12639 base_reg = stack_pointer_rtx;
12640 base_offset = m->fs.sp_offset - cfa_offset;
12641 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12642 }
12643 if (m->fs.drap_valid)
12644 {
12645 toffset = 0 - cfa_offset;
12646 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12647 if (tlen <= len)
12648 {
12649 base_reg = crtl->drap_reg;
12650 base_offset = toffset;
12651 len = tlen;
12652 }
12653 }
12654 if (m->fs.fp_valid)
12655 {
12656 toffset = m->fs.fp_offset - cfa_offset;
12657 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12658 if (tlen <= len)
12659 {
12660 base_reg = hard_frame_pointer_rtx;
12661 base_offset = toffset;
12662 len = tlen;
12663 }
12664 }
12665 }
12666 gcc_assert (base_reg != NULL);
12667
12668 return plus_constant (Pmode, base_reg, base_offset);
12669 }
12670
12671 /* Emit code to save registers in the prologue. */
12672
12673 static void
12674 ix86_emit_save_regs (void)
12675 {
12676 unsigned int regno;
12677 rtx_insn *insn;
12678
12679 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12680 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12681 {
12682 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12683 RTX_FRAME_RELATED_P (insn) = 1;
12684 }
12685 }
12686
12687 /* Emit a single register save at CFA - CFA_OFFSET. */
12688
12689 static void
12690 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12691 HOST_WIDE_INT cfa_offset)
12692 {
12693 struct machine_function *m = cfun->machine;
12694 rtx reg = gen_rtx_REG (mode, regno);
12695 rtx mem, addr, base, insn;
12696 unsigned int align;
12697
12698 addr = choose_baseaddr (cfa_offset);
12699 mem = gen_frame_mem (mode, addr);
12700
12701 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12702 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12703 set_mem_align (mem, align);
12704
12705 insn = emit_insn (gen_rtx_SET (mem, reg));
12706 RTX_FRAME_RELATED_P (insn) = 1;
12707
12708 base = addr;
12709 if (GET_CODE (base) == PLUS)
12710 base = XEXP (base, 0);
12711 gcc_checking_assert (REG_P (base));
12712
12713 /* When saving registers into a re-aligned local stack frame, avoid
12714 any tricky guessing by dwarf2out. */
12715 if (m->fs.realigned)
12716 {
12717 gcc_checking_assert (stack_realign_drap);
12718
12719 if (regno == REGNO (crtl->drap_reg))
12720 {
12721 /* A bit of a hack. We force the DRAP register to be saved in
12722 the re-aligned stack frame, which provides us with a copy
12723 of the CFA that will last past the prologue. Install it. */
12724 gcc_checking_assert (cfun->machine->fs.fp_valid);
12725 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12726 cfun->machine->fs.fp_offset - cfa_offset);
12727 mem = gen_rtx_MEM (mode, addr);
12728 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12729 }
12730 else
12731 {
12732 /* The frame pointer is a stable reference within the
12733 aligned frame. Use it. */
12734 gcc_checking_assert (cfun->machine->fs.fp_valid);
12735 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12736 cfun->machine->fs.fp_offset - cfa_offset);
12737 mem = gen_rtx_MEM (mode, addr);
12738 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12739 }
12740 }
12741
12742 /* The memory may not be relative to the current CFA register,
12743 which means that we may need to generate a new pattern for
12744 use by the unwind info. */
12745 else if (base != m->fs.cfa_reg)
12746 {
12747 addr = plus_constant (Pmode, m->fs.cfa_reg,
12748 m->fs.cfa_offset - cfa_offset);
12749 mem = gen_rtx_MEM (mode, addr);
12750 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12751 }
12752 }
12753
12754 /* Emit code to save registers using MOV insns.
12755 First register is stored at CFA - CFA_OFFSET. */
12756 static void
12757 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12758 {
12759 unsigned int regno;
12760
12761 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12762 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12763 {
12764 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12765 cfa_offset -= UNITS_PER_WORD;
12766 }
12767 }
12768
12769 /* Emit code to save SSE registers using MOV insns.
12770 First register is stored at CFA - CFA_OFFSET. */
12771 static void
12772 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12773 {
12774 unsigned int regno;
12775
12776 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12777 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12778 {
12779 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12780 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12781 }
12782 }
12783
12784 static GTY(()) rtx queued_cfa_restores;
12785
12786 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12787 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12788 Don't add the note if the previously saved value will be left untouched
12789 within the stack red-zone till return, as unwinders can find the same value
12790 in the register and on the stack. */
12791
12792 static void
12793 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12794 {
12795 if (!crtl->shrink_wrapped
12796 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12797 return;
12798
12799 if (insn)
12800 {
12801 add_reg_note (insn, REG_CFA_RESTORE, reg);
12802 RTX_FRAME_RELATED_P (insn) = 1;
12803 }
12804 else
12805 queued_cfa_restores
12806 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12807 }
12808
12809 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12810
12811 static void
12812 ix86_add_queued_cfa_restore_notes (rtx insn)
12813 {
12814 rtx last;
12815 if (!queued_cfa_restores)
12816 return;
12817 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12818 ;
12819 XEXP (last, 1) = REG_NOTES (insn);
12820 REG_NOTES (insn) = queued_cfa_restores;
12821 queued_cfa_restores = NULL_RTX;
12822 RTX_FRAME_RELATED_P (insn) = 1;
12823 }
12824
12825 /* Expand prologue or epilogue stack adjustment.
12826 The pattern exists to put a dependency on all ebp-based memory accesses.
12827 STYLE should be negative if the instructions should be marked as frame
12828 related, zero if the %r11 register is live and cannot be freely used, and
12829 positive otherwise. */
12830
12831 static void
12832 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12833 int style, bool set_cfa)
12834 {
12835 struct machine_function *m = cfun->machine;
12836 rtx insn;
12837 bool add_frame_related_expr = false;
12838
12839 if (Pmode == SImode)
12840 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12841 else if (x86_64_immediate_operand (offset, DImode))
12842 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12843 else
12844 {
12845 rtx tmp;
12846 /* r11 is used by indirect sibcall return as well, set before the
12847 epilogue and used after the epilogue. */
12848 if (style)
12849 tmp = gen_rtx_REG (DImode, R11_REG);
12850 else
12851 {
12852 gcc_assert (src != hard_frame_pointer_rtx
12853 && dest != hard_frame_pointer_rtx);
12854 tmp = hard_frame_pointer_rtx;
12855 }
12856 insn = emit_insn (gen_rtx_SET (tmp, offset));
12857 if (style < 0)
12858 add_frame_related_expr = true;
12859
12860 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12861 }
12862
12863 insn = emit_insn (insn);
12864 if (style >= 0)
12865 ix86_add_queued_cfa_restore_notes (insn);
12866
12867 if (set_cfa)
12868 {
12869 rtx r;
12870
12871 gcc_assert (m->fs.cfa_reg == src);
12872 m->fs.cfa_offset += INTVAL (offset);
12873 m->fs.cfa_reg = dest;
12874
12875 r = gen_rtx_PLUS (Pmode, src, offset);
12876 r = gen_rtx_SET (dest, r);
12877 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12878 RTX_FRAME_RELATED_P (insn) = 1;
12879 }
12880 else if (style < 0)
12881 {
12882 RTX_FRAME_RELATED_P (insn) = 1;
12883 if (add_frame_related_expr)
12884 {
12885 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12886 r = gen_rtx_SET (dest, r);
12887 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12888 }
12889 }
12890
12891 if (dest == stack_pointer_rtx)
12892 {
12893 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12894 bool valid = m->fs.sp_valid;
12895
12896 if (src == hard_frame_pointer_rtx)
12897 {
12898 valid = m->fs.fp_valid;
12899 ooffset = m->fs.fp_offset;
12900 }
12901 else if (src == crtl->drap_reg)
12902 {
12903 valid = m->fs.drap_valid;
12904 ooffset = 0;
12905 }
12906 else
12907 {
12908 /* Else there are two possibilities: SP itself, which we set
12909 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12910 taken care of by hand along the eh_return path. */
12911 gcc_checking_assert (src == stack_pointer_rtx
12912 || offset == const0_rtx);
12913 }
12914
12915 m->fs.sp_offset = ooffset - INTVAL (offset);
12916 m->fs.sp_valid = valid;
12917 }
12918 }
12919
12920 /* Find an available register to be used as the dynamic realign argument
12921 pointer (DRAP) register. Such a register will be written in the prologue
12922 and used at the beginning of the body, so it must not be
12923 1. a parameter passing register, or
12924 2. the GOT pointer.
12925 We reuse the static-chain register if it is available. Otherwise, we
12926 use DI for i386 and R13 for x86-64. We chose R13 since it has a
12927 shorter encoding.
12928
12929 Return: the regno of the chosen register. */
12930
12931 static unsigned int
12932 find_drap_reg (void)
12933 {
12934 tree decl = cfun->decl;
12935
12936 /* Always use callee-saved register if there are no caller-saved
12937 registers. */
12938 if (TARGET_64BIT)
12939 {
12940 /* Use R13 for a nested function or a function that needs a static chain.
12941 Since a function with a tail call may use any caller-saved
12942 registers in the epilogue, the DRAP must not use a caller-saved
12943 register in that case. */
12944 if (DECL_STATIC_CHAIN (decl)
12945 || cfun->machine->no_caller_saved_registers
12946 || crtl->tail_call_emit)
12947 return R13_REG;
12948
12949 return R10_REG;
12950 }
12951 else
12952 {
12953 /* Use DI for a nested function or a function that needs a static chain.
12954 Since a function with a tail call may use any caller-saved
12955 registers in the epilogue, the DRAP must not use a caller-saved
12956 register in that case. */
12957 if (DECL_STATIC_CHAIN (decl)
12958 || cfun->machine->no_caller_saved_registers
12959 || crtl->tail_call_emit)
12960 return DI_REG;
12961
12962 /* Reuse static chain register if it isn't used for parameter
12963 passing. */
12964 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12965 {
12966 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12967 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12968 return CX_REG;
12969 }
12970 return DI_REG;
12971 }
12972 }
12973
12974 /* Handle a "force_align_arg_pointer" attribute. */
12975
12976 static tree
12977 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12978 tree, int, bool *no_add_attrs)
12979 {
12980 if (TREE_CODE (*node) != FUNCTION_TYPE
12981 && TREE_CODE (*node) != METHOD_TYPE
12982 && TREE_CODE (*node) != FIELD_DECL
12983 && TREE_CODE (*node) != TYPE_DECL)
12984 {
12985 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12986 name);
12987 *no_add_attrs = true;
12988 }
12989
12990 return NULL_TREE;
12991 }
12992
12993 /* Return minimum incoming stack alignment. */
12994
12995 static unsigned int
12996 ix86_minimum_incoming_stack_boundary (bool sibcall)
12997 {
12998 unsigned int incoming_stack_boundary;
12999
13000 /* The stack of an interrupt handler is always aligned to
13001 MIN_STACK_BOUNDARY. */
13002 if (cfun->machine->func_type != TYPE_NORMAL)
13003 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13004 /* Prefer the one specified at command line. */
13005 else if (ix86_user_incoming_stack_boundary)
13006 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13007 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13008 if -mstackrealign is used, this isn't the sibcall check, and the
13009 estimated stack alignment is 128 bits. */
13010 else if (!sibcall
13011 && ix86_force_align_arg_pointer
13012 && crtl->stack_alignment_estimated == 128)
13013 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13014 else
13015 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13016
13017 /* Incoming stack alignment can be changed on individual functions
13018 via force_align_arg_pointer attribute. We use the smallest
13019 incoming stack boundary. */
13020 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13021 && lookup_attribute (ix86_force_align_arg_pointer_string,
13022 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13023 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13024
13025 /* The incoming stack frame has to be aligned at least at
13026 parm_stack_boundary. */
13027 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13028 incoming_stack_boundary = crtl->parm_stack_boundary;
13029
13030 /* The stack at the entry to main is aligned by the runtime. We use the
13031 smallest incoming stack boundary. */
13032 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13033 && DECL_NAME (current_function_decl)
13034 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13035 && DECL_FILE_SCOPE_P (current_function_decl))
13036 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13037
13038 return incoming_stack_boundary;
13039 }
13040
13041 /* Update incoming stack boundary and estimated stack alignment. */
13042
13043 static void
13044 ix86_update_stack_boundary (void)
13045 {
13046 ix86_incoming_stack_boundary
13047 = ix86_minimum_incoming_stack_boundary (false);
13048
13049 /* x86_64 varargs need 16-byte stack alignment for the register save
13050 area. */
13051 if (TARGET_64BIT
13052 && cfun->stdarg
13053 && crtl->stack_alignment_estimated < 128)
13054 crtl->stack_alignment_estimated = 128;
13055
13056 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13057 if (ix86_tls_descriptor_calls_expanded_in_cfun
13058 && crtl->preferred_stack_boundary < 128)
13059 crtl->preferred_stack_boundary = 128;
13060 }
13061
13062 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13063 needed or an rtx for DRAP otherwise. */
13064
13065 static rtx
13066 ix86_get_drap_rtx (void)
13067 {
13068 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
13069 crtl->need_drap = true;
13070
13071 if (stack_realign_drap)
13072 {
13073 /* Assign DRAP to vDRAP and return vDRAP. */
13074 unsigned int regno = find_drap_reg ();
13075 rtx drap_vreg;
13076 rtx arg_ptr;
13077 rtx_insn *seq, *insn;
13078
13079 arg_ptr = gen_rtx_REG (Pmode, regno);
13080 crtl->drap_reg = arg_ptr;
13081
13082 start_sequence ();
13083 drap_vreg = copy_to_reg (arg_ptr);
13084 seq = get_insns ();
13085 end_sequence ();
13086
13087 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13088 if (!optimize)
13089 {
13090 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13091 RTX_FRAME_RELATED_P (insn) = 1;
13092 }
13093 return drap_vreg;
13094 }
13095 else
13096 return NULL;
13097 }
13098
13099 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13100
13101 static rtx
13102 ix86_internal_arg_pointer (void)
13103 {
13104 return virtual_incoming_args_rtx;
13105 }
13106
13107 struct scratch_reg {
13108 rtx reg;
13109 bool saved;
13110 };
13111
13112 /* Return a short-lived scratch register for use on function entry.
13113 In 32-bit mode, it is valid only after the registers are saved
13114 in the prologue. This register must be released by means of
13115 release_scratch_register_on_entry once it is dead. */
13116
13117 static void
13118 get_scratch_register_on_entry (struct scratch_reg *sr)
13119 {
13120 int regno;
13121
13122 sr->saved = false;
13123
13124 if (TARGET_64BIT)
13125 {
13126 /* We always use R11 in 64-bit mode. */
13127 regno = R11_REG;
13128 }
13129 else
13130 {
13131 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13132 bool fastcall_p
13133 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13134 bool thiscall_p
13135 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13136 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13137 int regparm = ix86_function_regparm (fntype, decl);
13138 int drap_regno
13139 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13140
13141 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13142 for the static chain register. */
13143 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13144 && drap_regno != AX_REG)
13145 regno = AX_REG;
13146 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13147 for the static chain register. */
13148 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13149 regno = AX_REG;
13150 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13151 regno = DX_REG;
13152 /* ecx is the static chain register. */
13153 else if (regparm < 3 && !fastcall_p && !thiscall_p
13154 && !static_chain_p
13155 && drap_regno != CX_REG)
13156 regno = CX_REG;
13157 else if (ix86_save_reg (BX_REG, true))
13158 regno = BX_REG;
13159 /* esi is the static chain register. */
13160 else if (!(regparm == 3 && static_chain_p)
13161 && ix86_save_reg (SI_REG, true))
13162 regno = SI_REG;
13163 else if (ix86_save_reg (DI_REG, true))
13164 regno = DI_REG;
13165 else
13166 {
13167 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13168 sr->saved = true;
13169 }
13170 }
13171
13172 sr->reg = gen_rtx_REG (Pmode, regno);
13173 if (sr->saved)
13174 {
13175 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13176 RTX_FRAME_RELATED_P (insn) = 1;
13177 }
13178 }
13179
13180 /* Release a scratch register obtained from the preceding function. */
13181
13182 static void
13183 release_scratch_register_on_entry (struct scratch_reg *sr)
13184 {
13185 if (sr->saved)
13186 {
13187 struct machine_function *m = cfun->machine;
13188 rtx x, insn = emit_insn (gen_pop (sr->reg));
13189
13190 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13191 RTX_FRAME_RELATED_P (insn) = 1;
13192 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13193 x = gen_rtx_SET (stack_pointer_rtx, x);
13194 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13195 m->fs.sp_offset -= UNITS_PER_WORD;
13196 }
13197 }
13198
13199 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
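/* Note (assumption, not stated in the original source): with the default
   STACK_CHECK_PROBE_INTERVAL_EXP of 12 this makes PROBE_INTERVAL 4096
   bytes, i.e. one probe per 4 KiB page. */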
13200
13201 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13202
13203 static void
13204 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13205 {
13206 /* We skip the probe for the first interval + a small dope of 4 words and
13207 probe that many bytes past the specified size to maintain a protection
13208 area at the bottom of the stack. */
13209 const int dope = 4 * UNITS_PER_WORD;
13210 rtx size_rtx = GEN_INT (size), last;
13211
13212 /* See if we have a constant small number of probes to generate. If so,
13213 that's the easy case. The run-time loop is made up of 9 insns in the
13214 generic case, while the compile-time loop is made up of 3+2*(n-1) insns
13215 for n intervals. */
13216 if (size <= 4 * PROBE_INTERVAL)
13217 {
13218 HOST_WIDE_INT i, adjust;
13219 bool first_probe = true;
13220
13221 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13222 values of N from 1 until it exceeds SIZE. If only one probe is
13223 needed, this will not generate any code. Then adjust and probe
13224 to PROBE_INTERVAL + SIZE. */
13225 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13226 {
13227 if (first_probe)
13228 {
13229 adjust = 2 * PROBE_INTERVAL + dope;
13230 first_probe = false;
13231 }
13232 else
13233 adjust = PROBE_INTERVAL;
13234
13235 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13236 plus_constant (Pmode, stack_pointer_rtx,
13237 -adjust)));
13238 emit_stack_probe (stack_pointer_rtx);
13239 }
13240
13241 if (first_probe)
13242 adjust = size + PROBE_INTERVAL + dope;
13243 else
13244 adjust = size + PROBE_INTERVAL - i;
13245
13246 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13247 plus_constant (Pmode, stack_pointer_rtx,
13248 -adjust)));
13249 emit_stack_probe (stack_pointer_rtx);
13250
13251 /* Adjust back to account for the additional first interval. */
13252 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13253 plus_constant (Pmode, stack_pointer_rtx,
13254 PROBE_INTERVAL + dope)));
13255 }
13256
13257 /* Otherwise, do the same as above, but in a loop. Note that we must be
13258 extra careful with variables wrapping around because we might be at
13259 the very top (or the very bottom) of the address space and we have
13260 to be able to handle this case properly; in particular, we use an
13261 equality test for the loop condition. */
13262 else
13263 {
13264 HOST_WIDE_INT rounded_size;
13265 struct scratch_reg sr;
13266
13267 get_scratch_register_on_entry (&sr);
13268
13269
13270 /* Step 1: round SIZE to the previous multiple of the interval. */
13271
13272 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13273
13274
13275 /* Step 2: compute initial and final value of the loop counter. */
13276
13277 /* SP = SP_0 + PROBE_INTERVAL. */
13278 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13279 plus_constant (Pmode, stack_pointer_rtx,
13280 - (PROBE_INTERVAL + dope))));
13281
13282 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13283 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13284 emit_insn (gen_rtx_SET (sr.reg,
13285 plus_constant (Pmode, stack_pointer_rtx,
13286 -rounded_size)));
13287 else
13288 {
13289 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13290 emit_insn (gen_rtx_SET (sr.reg,
13291 gen_rtx_PLUS (Pmode, sr.reg,
13292 stack_pointer_rtx)));
13293 }
13294
13295
13296 /* Step 3: the loop
13297
13298 do
13299 {
13300 SP = SP + PROBE_INTERVAL
13301 probe at SP
13302 }
13303 while (SP != LAST_ADDR)
13304
13305 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13306 values of N from 1 until it is equal to ROUNDED_SIZE. */
13307
13308 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13309
13310
13311 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13312 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13313
13314 if (size != rounded_size)
13315 {
13316 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13317 plus_constant (Pmode, stack_pointer_rtx,
13318 rounded_size - size)));
13319 emit_stack_probe (stack_pointer_rtx);
13320 }
13321
13322 /* Adjust back to account for the additional first interval. */
13323 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13324 plus_constant (Pmode, stack_pointer_rtx,
13325 PROBE_INTERVAL + dope)));
13326
13327 release_scratch_register_on_entry (&sr);
13328 }
13329
13330 /* Even if the stack pointer isn't the CFA register, we need to correctly
13331 describe the adjustments made to it, in particular differentiate the
13332 frame-related ones from the frame-unrelated ones. */
13333 if (size > 0)
13334 {
13335 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13336 XVECEXP (expr, 0, 0)
13337 = gen_rtx_SET (stack_pointer_rtx,
13338 plus_constant (Pmode, stack_pointer_rtx, -size));
13339 XVECEXP (expr, 0, 1)
13340 = gen_rtx_SET (stack_pointer_rtx,
13341 plus_constant (Pmode, stack_pointer_rtx,
13342 PROBE_INTERVAL + dope + size));
13343 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13344 RTX_FRAME_RELATED_P (last) = 1;
13345
13346 cfun->machine->fs.sp_offset += size;
13347 }
13348
13349 /* Make sure nothing is scheduled before we are done. */
13350 emit_insn (gen_blockage ());
13351 }
13352
13353 /* Adjust the stack pointer up to REG while probing it. */
13354
13355 const char *
13356 output_adjust_stack_and_probe (rtx reg)
13357 {
13358 static int labelno = 0;
13359 char loop_lab[32];
13360 rtx xops[2];
13361
13362 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13363
13364 /* Loop. */
13365 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13366
13367 /* SP = SP + PROBE_INTERVAL. */
13368 xops[0] = stack_pointer_rtx;
13369 xops[1] = GEN_INT (PROBE_INTERVAL);
13370 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13371
13372 /* Probe at SP. */
13373 xops[1] = const0_rtx;
13374 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13375
13376 /* Test if SP == LAST_ADDR. */
13377 xops[0] = stack_pointer_rtx;
13378 xops[1] = reg;
13379 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13380
13381 /* Branch. */
13382 fputs ("\tjne\t", asm_out_file);
13383 assemble_name_raw (asm_out_file, loop_lab);
13384 fputc ('\n', asm_out_file);
13385
13386 return "";
13387 }
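/* Illustrative 32-bit output of the loop above, assuming a 4096-byte
   PROBE_INTERVAL and LAST_ADDR in %eax (a sketch, not from the original
   source):

       .LPSRL0:
               subl    $4096, %esp
               orl     $0, (%esp)
               cmpl    %eax, %esp
               jne     .LPSRL0

   each iteration moves the stack pointer down one interval and touches the
   newly exposed page with a harmless "or $0" before testing for the final
   address. */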
13388
13389 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13390 inclusive. These are offsets from the current stack pointer. */
13391
13392 static void
13393 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13394 {
13395 /* See if we have a constant small number of probes to generate. If so,
13396 that's the easy case. The run-time loop is made up of 6 insns in the
13397 generic case, while the compile-time loop is made up of n insns for n
13398 intervals. */
13399 if (size <= 6 * PROBE_INTERVAL)
13400 {
13401 HOST_WIDE_INT i;
13402
13403 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13404 it exceeds SIZE. If only one probe is needed, this will not
13405 generate any code. Then probe at FIRST + SIZE. */
13406 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13407 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13408 -(first + i)));
13409
13410 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13411 -(first + size)));
13412 }
13413
13414 /* Otherwise, do the same as above, but in a loop. Note that we must be
13415 extra careful with variables wrapping around because we might be at
13416 the very top (or the very bottom) of the address space and we have
13417 to be able to handle this case properly; in particular, we use an
13418 equality test for the loop condition. */
13419 else
13420 {
13421 HOST_WIDE_INT rounded_size, last;
13422 struct scratch_reg sr;
13423
13424 get_scratch_register_on_entry (&sr);
13425
13426
13427 /* Step 1: round SIZE to the previous multiple of the interval. */
13428
13429 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13430
13431
13432 /* Step 2: compute initial and final value of the loop counter. */
13433
13434 /* TEST_OFFSET = FIRST. */
13435 emit_move_insn (sr.reg, GEN_INT (-first));
13436
13437 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13438 last = first + rounded_size;
13439
13440
13441 /* Step 3: the loop
13442
13443 do
13444 {
13445 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13446 probe at TEST_ADDR
13447 }
13448 while (TEST_ADDR != LAST_ADDR)
13449
13450 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13451 until it is equal to ROUNDED_SIZE. */
13452
13453 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13454
13455
13456 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13457 that SIZE is equal to ROUNDED_SIZE. */
13458
13459 if (size != rounded_size)
13460 emit_stack_probe (plus_constant (Pmode,
13461 gen_rtx_PLUS (Pmode,
13462 stack_pointer_rtx,
13463 sr.reg),
13464 rounded_size - size));
13465
13466 release_scratch_register_on_entry (&sr);
13467 }
13468
13469 /* Make sure nothing is scheduled before we are done. */
13470 emit_insn (gen_blockage ());
13471 }
13472
13473 /* Probe a range of stack addresses from REG to END, inclusive. These are
13474 offsets from the current stack pointer. */
13475
13476 const char *
13477 output_probe_stack_range (rtx reg, rtx end)
13478 {
13479 static int labelno = 0;
13480 char loop_lab[32];
13481 rtx xops[3];
13482
13483 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13484
13485 /* Loop. */
13486 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13487
13488 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13489 xops[0] = reg;
13490 xops[1] = GEN_INT (PROBE_INTERVAL);
13491 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13492
13493 /* Probe at TEST_ADDR. */
13494 xops[0] = stack_pointer_rtx;
13495 xops[1] = reg;
13496 xops[2] = const0_rtx;
13497 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13498
13499 /* Test if TEST_ADDR == LAST_ADDR. */
13500 xops[0] = reg;
13501 xops[1] = end;
13502 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13503
13504 /* Branch. */
13505 fputs ("\tjne\t", asm_out_file);
13506 assemble_name_raw (asm_out_file, loop_lab);
13507 fputc ('\n', asm_out_file);
13508
13509 return "";
13510 }
13511
13512 /* Finalize the stack_realign_needed flag, which guides the prologue and
13513 epilogue so that they are generated in the correct form. */
13514 static void
13515 ix86_finalize_stack_realign_flags (void)
13516 {
13517 /* Check whether stack realignment is really needed after reload, and
13518 store the result in cfun. */
13519 unsigned int incoming_stack_boundary
13520 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13521 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13522 unsigned int stack_realign
13523 = (incoming_stack_boundary
13524 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13525 ? crtl->max_used_stack_slot_alignment
13526 : crtl->stack_alignment_needed));
13527
13528 if (crtl->stack_realign_finalized)
13529 {
13530 /* After stack_realign_needed is finalized, we can no longer
13531 change it. */
13532 gcc_assert (crtl->stack_realign_needed == stack_realign);
13533 return;
13534 }
13535
13536 /* If the only reason for frame_pointer_needed is that we conservatively
13537 assumed stack realignment might be needed, but in the end nothing that
13538 needed the stack alignment had been spilled, clear frame_pointer_needed
13539 and say we don't need stack realignment. */
13540 if (stack_realign
13541 && frame_pointer_needed
13542 && crtl->is_leaf
13543 && flag_omit_frame_pointer
13544 && crtl->sp_is_unchanging
13545 && !ix86_current_function_calls_tls_descriptor
13546 && !crtl->accesses_prior_frames
13547 && !cfun->calls_alloca
13548 && !crtl->calls_eh_return
13549 /* See ira_setup_eliminable_regset for the rationale. */
13550 && !(STACK_CHECK_MOVING_SP
13551 && flag_stack_check
13552 && flag_exceptions
13553 && cfun->can_throw_non_call_exceptions)
13554 && !ix86_frame_pointer_required ()
13555 && get_frame_size () == 0
13556 && ix86_nsaved_sseregs () == 0
13557 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13558 {
13559 HARD_REG_SET set_up_by_prologue, prologue_used;
13560 basic_block bb;
13561
13562 CLEAR_HARD_REG_SET (prologue_used);
13563 CLEAR_HARD_REG_SET (set_up_by_prologue);
13564 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13565 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13566 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13567 HARD_FRAME_POINTER_REGNUM);
13568 FOR_EACH_BB_FN (bb, cfun)
13569 {
13570 rtx_insn *insn;
13571 FOR_BB_INSNS (bb, insn)
13572 if (NONDEBUG_INSN_P (insn)
13573 && requires_stack_frame_p (insn, prologue_used,
13574 set_up_by_prologue))
13575 {
13576 crtl->stack_realign_needed = stack_realign;
13577 crtl->stack_realign_finalized = true;
13578 return;
13579 }
13580 }
13581
13582 /* If drap has been set, but it actually isn't live at the start
13583 of the function, there is no reason to set it up. */
13584 if (crtl->drap_reg)
13585 {
13586 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13587 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13588 {
13589 crtl->drap_reg = NULL_RTX;
13590 crtl->need_drap = false;
13591 }
13592 }
13593 else
13594 cfun->machine->no_drap_save_restore = true;
13595
13596 frame_pointer_needed = false;
13597 stack_realign = false;
13598 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13599 crtl->stack_alignment_needed = incoming_stack_boundary;
13600 crtl->stack_alignment_estimated = incoming_stack_boundary;
13601 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13602 crtl->preferred_stack_boundary = incoming_stack_boundary;
13603 df_finish_pass (true);
13604 df_scan_alloc (NULL);
13605 df_scan_blocks ();
13606 df_compute_regs_ever_live (true);
13607 df_analyze ();
13608 }
13609
13610 crtl->stack_realign_needed = stack_realign;
13611 crtl->stack_realign_finalized = true;
13612 }
13613
13614 /* Delete the SET_GOT insn right after the entry block if it is allocated to REG. */
13615
13616 static void
13617 ix86_elim_entry_set_got (rtx reg)
13618 {
13619 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13620 rtx_insn *c_insn = BB_HEAD (bb);
13621 if (!NONDEBUG_INSN_P (c_insn))
13622 c_insn = next_nonnote_nondebug_insn (c_insn);
13623 if (c_insn && NONJUMP_INSN_P (c_insn))
13624 {
13625 rtx pat = PATTERN (c_insn);
13626 if (GET_CODE (pat) == PARALLEL)
13627 {
13628 rtx vec = XVECEXP (pat, 0, 0);
13629 if (GET_CODE (vec) == SET
13630 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13631 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13632 delete_insn (c_insn);
13633 }
13634 }
13635 }
13636
13637 /* Expand the prologue into a bunch of separate insns. */
13638
13639 void
13640 ix86_expand_prologue (void)
13641 {
13642 struct machine_function *m = cfun->machine;
13643 rtx insn, t;
13644 struct ix86_frame frame;
13645 HOST_WIDE_INT allocate;
13646 bool int_registers_saved;
13647 bool sse_registers_saved;
13648 rtx static_chain = NULL_RTX;
13649
13650 ix86_finalize_stack_realign_flags ();
13651
13652 /* DRAP should not coexist with stack_realign_fp */
13653 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13654
13655 memset (&m->fs, 0, sizeof (m->fs));
13656
13657 /* Initialize CFA state for before the prologue. */
13658 m->fs.cfa_reg = stack_pointer_rtx;
13659 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13660
13661 /* Track SP offset to the CFA. We continue tracking this after we've
13662 swapped the CFA register away from SP. In the case of re-alignment
13663 this is fudged; we're interested in offsets within the local frame. */
13664 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13665 m->fs.sp_valid = true;
13666
13667 ix86_compute_frame_layout (&frame);
13668
13669 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13670 {
13671 /* We should have already generated an error for any use of
13672 ms_hook on a nested function. */
13673 gcc_checking_assert (!ix86_static_chain_on_stack);
13674
13675 /* Check whether profiling is active and we should use the
13676 profiling-before-prologue variant. If so, sorry. */
13677 if (crtl->profile && flag_fentry != 0)
13678 sorry ("ms_hook_prologue attribute isn%'t compatible "
13679 "with -mfentry for 32-bit");
13680
13681 /* In ix86_asm_output_function_label we emitted:
13682 8b ff movl.s %edi,%edi
13683 55 push %ebp
13684 8b ec movl.s %esp,%ebp
13685
13686 This matches the hookable function prologue in Win32 API
13687 functions in Microsoft Windows XP Service Pack 2 and newer.
13688 Wine uses this to enable Windows apps to hook the Win32 API
13689 functions provided by Wine.
13690
13691 What that means is that we've already set up the frame pointer. */
13692
13693 if (frame_pointer_needed
13694 && !(crtl->drap_reg && crtl->stack_realign_needed))
13695 {
13696 rtx push, mov;
13697
13698 /* We've decided to use the frame pointer already set up.
13699 Describe this to the unwinder by pretending that both
13700 push and mov insns happen right here.
13701
13702 Putting the unwind info here at the end of the ms_hook
13703 is done so that we can make absolutely certain we get
13704 the required byte sequence at the start of the function,
13705 rather than relying on an assembler that can produce
13706 the exact encoding required.
13707
13708 However it does mean (in the unpatched case) that we have
13709 a 1 insn window where the asynchronous unwind info is
13710 incorrect. However, if we placed the unwind info at
13711 its correct location we would have incorrect unwind info
13712 in the patched case. Which is probably all moot since
13713 I don't expect Wine generates dwarf2 unwind info for the
13714 system libraries that use this feature. */
13715
13716 insn = emit_insn (gen_blockage ());
13717
13718 push = gen_push (hard_frame_pointer_rtx);
13719 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13720 stack_pointer_rtx);
13721 RTX_FRAME_RELATED_P (push) = 1;
13722 RTX_FRAME_RELATED_P (mov) = 1;
13723
13724 RTX_FRAME_RELATED_P (insn) = 1;
13725 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13726 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13727
13728 /* Note that gen_push incremented m->fs.cfa_offset, even
13729 though we didn't emit the push insn here. */
13730 m->fs.cfa_reg = hard_frame_pointer_rtx;
13731 m->fs.fp_offset = m->fs.cfa_offset;
13732 m->fs.fp_valid = true;
13733 }
13734 else
13735 {
13736 /* The frame pointer is not needed so pop %ebp again.
13737 This leaves us with a pristine state. */
13738 emit_insn (gen_pop (hard_frame_pointer_rtx));
13739 }
13740 }
13741
13742 /* The first insn of a function that accepts its static chain on the
13743 stack is to push the register that would be filled in by a direct
13744 call. This insn will be skipped by the trampoline. */
13745 else if (ix86_static_chain_on_stack)
13746 {
13747 static_chain = ix86_static_chain (cfun->decl, false);
13748 insn = emit_insn (gen_push (static_chain));
13749 emit_insn (gen_blockage ());
13750
13751 /* We don't want to interpret this push insn as a register save,
13752 only as a stack adjustment. The real copy of the register as
13753 a save will be done later, if needed. */
13754 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13755 t = gen_rtx_SET (stack_pointer_rtx, t);
13756 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13757 RTX_FRAME_RELATED_P (insn) = 1;
13758 }
13759
13760 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13761 DRAP is needed and stack realignment is really needed after reload. */
13762 if (stack_realign_drap)
13763 {
13764 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13765
13766 /* Can't use DRAP in interrupt function. */
13767 if (cfun->machine->func_type != TYPE_NORMAL)
13768 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13769 "in interrupt service routine. This may be worked "
13770 "around by avoiding functions with aggregate return.");
13771
13772 /* Only need to push parameter pointer reg if it is caller saved. */
13773 if (!call_used_regs[REGNO (crtl->drap_reg)])
13774 {
13775 /* Push arg pointer reg */
13776 insn = emit_insn (gen_push (crtl->drap_reg));
13777 RTX_FRAME_RELATED_P (insn) = 1;
13778 }
13779
13780 /* Grab the argument pointer. */
13781 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13782 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13783 RTX_FRAME_RELATED_P (insn) = 1;
13784 m->fs.cfa_reg = crtl->drap_reg;
13785 m->fs.cfa_offset = 0;
13786
13787 /* Align the stack. */
13788 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13789 stack_pointer_rtx,
13790 GEN_INT (-align_bytes)));
13791 RTX_FRAME_RELATED_P (insn) = 1;
13792
13793 /* Replicate the return address on the stack so that return
13794 address can be reached via the (argp - 1) slot. This is needed
13795 to implement the RETURN_ADDR_RTX macro and the intrinsic function
13796 expand_builtin_return_addr, etc. */
13797 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13798 t = gen_frame_mem (word_mode, t);
13799 insn = emit_insn (gen_push (t));
13800 RTX_FRAME_RELATED_P (insn) = 1;
13801
13802 /* For the purposes of frame and register save area addressing,
13803 we've started over with a new frame. */
13804 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13805 m->fs.realigned = true;
13806
13807 if (static_chain)
13808 {
13809 /* Replicate the static chain on the stack so that the static chain
13810 can be reached via the (argp - 2) slot. This is needed for a
13811 nested function with stack realignment. */
13812 insn = emit_insn (gen_push (static_chain));
13813 RTX_FRAME_RELATED_P (insn) = 1;
13814 }
13815 }
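/* Illustrative summary of the DRAP path above: after the "and" the stack
   is aligned, crtl->drap_reg still points at the incoming argument area,
   the return address has been replicated so it is reachable as the
   (argp - 1) slot, and, for nested functions, the static chain as the
   (argp - 2) slot.  */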
13816
13817 int_registers_saved = (frame.nregs == 0);
13818 sse_registers_saved = (frame.nsseregs == 0);
13819
13820 if (frame_pointer_needed && !m->fs.fp_valid)
13821 {
13822 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13823 slower on all targets. Also sdb doesn't like it. */
13824 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13825 RTX_FRAME_RELATED_P (insn) = 1;
13826
13827 /* Push registers now, before setting the frame pointer
13828 on SEH target. */
13829 if (!int_registers_saved
13830 && TARGET_SEH
13831 && !frame.save_regs_using_mov)
13832 {
13833 ix86_emit_save_regs ();
13834 int_registers_saved = true;
13835 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13836 }
13837
13838 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13839 {
13840 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13841 RTX_FRAME_RELATED_P (insn) = 1;
13842
13843 if (m->fs.cfa_reg == stack_pointer_rtx)
13844 m->fs.cfa_reg = hard_frame_pointer_rtx;
13845 m->fs.fp_offset = m->fs.sp_offset;
13846 m->fs.fp_valid = true;
13847 }
13848 }
13849
13850 if (!int_registers_saved)
13851 {
13852 /* If saving registers via PUSH, do so now. */
13853 if (!frame.save_regs_using_mov)
13854 {
13855 ix86_emit_save_regs ();
13856 int_registers_saved = true;
13857 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13858 }
13859
13860 /* When using the red zone we may start register saving before allocating
13861 the stack frame, saving one cycle of the prologue. However, avoid
13862 doing this if we have to probe the stack; at least on x86_64 the
13863 stack probe can turn into a call that clobbers a red zone location. */
13864 else if (ix86_using_red_zone ()
13865 && (! TARGET_STACK_PROBE
13866 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13867 {
13868 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13869 int_registers_saved = true;
13870 }
13871 }
13872
13873 if (stack_realign_fp)
13874 {
13875 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13876 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13877
13878 /* The computation of the size of the re-aligned stack frame means
13879 that we must allocate the size of the register save area before
13880 performing the actual alignment. Otherwise we cannot guarantee
13881 that there's enough storage above the realignment point. */
13882 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13883 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13884 GEN_INT (m->fs.sp_offset
13885 - frame.sse_reg_save_offset),
13886 -1, false);
13887
13888 /* Align the stack. */
13889 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13890 stack_pointer_rtx,
13891 GEN_INT (-align_bytes)));
13892
13893 /* For the purposes of register save area addressing, the stack
13894 pointer is no longer valid. As for the value of sp_offset,
13895 see ix86_compute_frame_layout, which we need to match in order
13896 to pass verification of stack_pointer_offset at the end. */
13897 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13898 m->fs.sp_valid = false;
13899 }
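/* For example (illustrative), with a required alignment of 32 bytes the
   insn emitted above is "andq $-32, %rsp" (or "andl $-32, %esp"), and
   sp_offset is rounded up to the next multiple of 32 so that it matches
   the layout computed by ix86_compute_frame_layout.  */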
13900
13901 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13902
13903 if (flag_stack_usage_info)
13904 {
13905 /* We start to count from ARG_POINTER. */
13906 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13907
13908 /* If it was realigned, take into account the fake frame. */
13909 if (stack_realign_drap)
13910 {
13911 if (ix86_static_chain_on_stack)
13912 stack_size += UNITS_PER_WORD;
13913
13914 if (!call_used_regs[REGNO (crtl->drap_reg)])
13915 stack_size += UNITS_PER_WORD;
13916
13917 /* This over-estimates by 1 minimal-stack-alignment-unit but
13918 mitigates that by counting in the new return address slot. */
13919 current_function_dynamic_stack_size
13920 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13921 }
13922
13923 current_function_static_stack_size = stack_size;
13924 }
13925
13926 /* On SEH target with very large frame size, allocate an area to save
13927 SSE registers (as the very large allocation won't be described). */
13928 if (TARGET_SEH
13929 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13930 && !sse_registers_saved)
13931 {
13932 HOST_WIDE_INT sse_size =
13933 frame.sse_reg_save_offset - frame.reg_save_offset;
13934
13935 gcc_assert (int_registers_saved);
13936
13937 /* No need to do stack checking as the area will be immediately
13938 written. */
13939 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13940 GEN_INT (-sse_size), -1,
13941 m->fs.cfa_reg == stack_pointer_rtx);
13942 allocate -= sse_size;
13943 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13944 sse_registers_saved = true;
13945 }
13946
13947 /* The stack has already been decremented by the instruction calling us,
13948 so probe if the size is non-negative to preserve the protection area. */
13949 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
13950 {
13951 /* We expect the registers to be saved when probes are used. */
13952 gcc_assert (int_registers_saved);
13953
13954 if (STACK_CHECK_MOVING_SP)
13955 {
13956 if (!(crtl->is_leaf && !cfun->calls_alloca
13957 && allocate <= PROBE_INTERVAL))
13958 {
13959 ix86_adjust_stack_and_probe (allocate);
13960 allocate = 0;
13961 }
13962 }
13963 else
13964 {
13965 HOST_WIDE_INT size = allocate;
13966
13967 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13968 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
13969
13970 if (TARGET_STACK_PROBE)
13971 {
13972 if (crtl->is_leaf && !cfun->calls_alloca)
13973 {
13974 if (size > PROBE_INTERVAL)
13975 ix86_emit_probe_stack_range (0, size);
13976 }
13977 else
13978 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
13979 }
13980 else
13981 {
13982 if (crtl->is_leaf && !cfun->calls_alloca)
13983 {
13984 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
13985 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
13986 size - STACK_CHECK_PROTECT);
13987 }
13988 else
13989 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
13990 }
13991 }
13992 }
13993
13994 if (allocate == 0)
13995 ;
13996 else if (!ix86_target_stack_probe ()
13997 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13998 {
13999 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14000 GEN_INT (-allocate), -1,
14001 m->fs.cfa_reg == stack_pointer_rtx);
14002 }
14003 else
14004 {
14005 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14006 rtx r10 = NULL;
14007 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14008 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14009 bool eax_live = ix86_eax_live_at_start_p ();
14010 bool r10_live = false;
14011
14012 if (TARGET_64BIT)
14013 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14014
14015 if (eax_live)
14016 {
14017 insn = emit_insn (gen_push (eax));
14018 allocate -= UNITS_PER_WORD;
14019 /* Note that SEH directives need to continue tracking the stack
14020 pointer even after the frame pointer has been set up. */
14021 if (sp_is_cfa_reg || TARGET_SEH)
14022 {
14023 if (sp_is_cfa_reg)
14024 m->fs.cfa_offset += UNITS_PER_WORD;
14025 RTX_FRAME_RELATED_P (insn) = 1;
14026 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14027 gen_rtx_SET (stack_pointer_rtx,
14028 plus_constant (Pmode, stack_pointer_rtx,
14029 -UNITS_PER_WORD)));
14030 }
14031 }
14032
14033 if (r10_live)
14034 {
14035 r10 = gen_rtx_REG (Pmode, R10_REG);
14036 insn = emit_insn (gen_push (r10));
14037 allocate -= UNITS_PER_WORD;
14038 if (sp_is_cfa_reg || TARGET_SEH)
14039 {
14040 if (sp_is_cfa_reg)
14041 m->fs.cfa_offset += UNITS_PER_WORD;
14042 RTX_FRAME_RELATED_P (insn) = 1;
14043 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14044 gen_rtx_SET (stack_pointer_rtx,
14045 plus_constant (Pmode, stack_pointer_rtx,
14046 -UNITS_PER_WORD)));
14047 }
14048 }
14049
14050 emit_move_insn (eax, GEN_INT (allocate));
14051 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14052
14053 /* Use the fact that AX still contains ALLOCATE. */
14054 adjust_stack_insn = (Pmode == DImode
14055 ? gen_pro_epilogue_adjust_stack_di_sub
14056 : gen_pro_epilogue_adjust_stack_si_sub);
14057
14058 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14059 stack_pointer_rtx, eax));
14060
14061 if (sp_is_cfa_reg || TARGET_SEH)
14062 {
14063 if (sp_is_cfa_reg)
14064 m->fs.cfa_offset += allocate;
14065 RTX_FRAME_RELATED_P (insn) = 1;
14066 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14067 gen_rtx_SET (stack_pointer_rtx,
14068 plus_constant (Pmode, stack_pointer_rtx,
14069 -allocate)));
14070 }
14071 m->fs.sp_offset += allocate;
14072
14073 /* Use stack_pointer_rtx for relative addressing so that code
14074 works for realigned stack, too. */
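/* Illustrative note: %eax/%rax still holds ALLOCATE here, so sp + eax
   addresses the most recently pushed save slot (r10 when both registers
   are live) and sp + eax + UNITS_PER_WORD the slot pushed before it.  */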
14075 if (r10_live && eax_live)
14076 {
14077 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14078 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14079 gen_frame_mem (word_mode, t));
14080 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14081 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14082 gen_frame_mem (word_mode, t));
14083 }
14084 else if (eax_live || r10_live)
14085 {
14086 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14087 emit_move_insn (gen_rtx_REG (word_mode,
14088 (eax_live ? AX_REG : R10_REG)),
14089 gen_frame_mem (word_mode, t));
14090 }
14091 }
14092 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14093
14094 /* If we haven't already set up the frame pointer, do so now. */
14095 if (frame_pointer_needed && !m->fs.fp_valid)
14096 {
14097 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14098 GEN_INT (frame.stack_pointer_offset
14099 - frame.hard_frame_pointer_offset));
14100 insn = emit_insn (insn);
14101 RTX_FRAME_RELATED_P (insn) = 1;
14102 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14103
14104 if (m->fs.cfa_reg == stack_pointer_rtx)
14105 m->fs.cfa_reg = hard_frame_pointer_rtx;
14106 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14107 m->fs.fp_valid = true;
14108 }
14109
14110 if (!int_registers_saved)
14111 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14112 if (!sse_registers_saved)
14113 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14114
14115 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14116 in PROLOGUE. */
14117 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14118 {
14119 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14120 insn = emit_insn (gen_set_got (pic));
14121 RTX_FRAME_RELATED_P (insn) = 1;
14122 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14123 emit_insn (gen_prologue_use (pic));
14124 /* Delete the already emitted SET_GOT if it exists and is allocated to
14125 REAL_PIC_OFFSET_TABLE_REGNUM. */
14126 ix86_elim_entry_set_got (pic);
14127 }
14128
14129 if (crtl->drap_reg && !crtl->stack_realign_needed)
14130 {
14131 /* vDRAP is set up, but after reload it turns out stack realignment
14132 isn't necessary; here we emit prologue code to set up DRAP
14133 without the stack realignment adjustment. */
14134 t = choose_baseaddr (0);
14135 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14136 }
14137
14138 /* Prevent instructions from being scheduled into the register save push
14139 sequence when access to the redzone area is done through the frame pointer.
14140 The offset between the frame pointer and the stack pointer is calculated
14141 relative to the value of the stack pointer at the end of the function
14142 prologue, and moving instructions that access the redzone area via the
14143 frame pointer inside the push sequence violates this assumption. */
14144 if (frame_pointer_needed && frame.red_zone_size)
14145 emit_insn (gen_memory_blockage ());
14146
14147 /* SEH requires that the prologue end within 256 bytes of the start of
14148 the function. Prevent instruction schedules that would extend that.
14149 Further, prevent alloca modifications to the stack pointer from being
14150 combined with prologue modifications. */
14151 if (TARGET_SEH)
14152 emit_insn (gen_prologue_use (stack_pointer_rtx));
14153 }
14154
14155 /* Emit code to restore REG using a POP insn. */
14156
14157 static void
14158 ix86_emit_restore_reg_using_pop (rtx reg)
14159 {
14160 struct machine_function *m = cfun->machine;
14161 rtx_insn *insn = emit_insn (gen_pop (reg));
14162
14163 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14164 m->fs.sp_offset -= UNITS_PER_WORD;
14165
14166 if (m->fs.cfa_reg == crtl->drap_reg
14167 && REGNO (reg) == REGNO (crtl->drap_reg))
14168 {
14169 /* Previously we'd represented the CFA as an expression
14170 like *(%ebp - 8). We've just popped that value from
14171 the stack, which means we need to reset the CFA to
14172 the drap register. This will remain until we restore
14173 the stack pointer. */
14174 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14175 RTX_FRAME_RELATED_P (insn) = 1;
14176
14177 /* This means that the DRAP register is valid for addressing too. */
14178 m->fs.drap_valid = true;
14179 return;
14180 }
14181
14182 if (m->fs.cfa_reg == stack_pointer_rtx)
14183 {
14184 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14185 x = gen_rtx_SET (stack_pointer_rtx, x);
14186 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14187 RTX_FRAME_RELATED_P (insn) = 1;
14188
14189 m->fs.cfa_offset -= UNITS_PER_WORD;
14190 }
14191
14192 /* When the frame pointer is the CFA, and we pop it, we are
14193 swapping back to the stack pointer as the CFA. This happens
14194 for stack frames that don't allocate other data, so we assume
14195 the stack pointer is now pointing at the return address, i.e.
14196 the function entry state, which makes the offset be 1 word. */
14197 if (reg == hard_frame_pointer_rtx)
14198 {
14199 m->fs.fp_valid = false;
14200 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14201 {
14202 m->fs.cfa_reg = stack_pointer_rtx;
14203 m->fs.cfa_offset -= UNITS_PER_WORD;
14204
14205 add_reg_note (insn, REG_CFA_DEF_CFA,
14206 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14207 GEN_INT (m->fs.cfa_offset)));
14208 RTX_FRAME_RELATED_P (insn) = 1;
14209 }
14210 }
14211 }
14212
14213 /* Emit code to restore saved registers using POP insns. */
14214
14215 static void
14216 ix86_emit_restore_regs_using_pop (void)
14217 {
14218 unsigned int regno;
14219
14220 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14221 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14222 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14223 }
14224
14225 /* Emit code and notes for the LEAVE instruction. */
14226
14227 static void
14228 ix86_emit_leave (void)
14229 {
14230 struct machine_function *m = cfun->machine;
14231 rtx_insn *insn = emit_insn (ix86_gen_leave ());
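/* "leave" is equivalent to "mov %ebp, %esp" followed by "pop %ebp", so
   after it the stack pointer is valid again and points one word above
   the address the frame pointer held, which is what the state updates
   below record.  */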
14232
14233 ix86_add_queued_cfa_restore_notes (insn);
14234
14235 gcc_assert (m->fs.fp_valid);
14236 m->fs.sp_valid = true;
14237 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14238 m->fs.fp_valid = false;
14239
14240 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14241 {
14242 m->fs.cfa_reg = stack_pointer_rtx;
14243 m->fs.cfa_offset = m->fs.sp_offset;
14244
14245 add_reg_note (insn, REG_CFA_DEF_CFA,
14246 plus_constant (Pmode, stack_pointer_rtx,
14247 m->fs.sp_offset));
14248 RTX_FRAME_RELATED_P (insn) = 1;
14249 }
14250 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14251 m->fs.fp_offset);
14252 }
14253
14254 /* Emit code to restore saved registers using MOV insns.
14255 First register is restored from CFA - CFA_OFFSET. */
14256 static void
14257 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14258 bool maybe_eh_return)
14259 {
14260 struct machine_function *m = cfun->machine;
14261 unsigned int regno;
14262
14263 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14264 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14265 {
14266 rtx reg = gen_rtx_REG (word_mode, regno);
14267 rtx mem;
14268 rtx_insn *insn;
14269
14270 mem = choose_baseaddr (cfa_offset);
14271 mem = gen_frame_mem (word_mode, mem);
14272 insn = emit_move_insn (reg, mem);
14273
14274 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14275 {
14276 /* Previously we'd represented the CFA as an expression
14277 like *(%ebp - 8). We've just popped that value from
14278 the stack, which means we need to reset the CFA to
14279 the drap register. This will remain until we restore
14280 the stack pointer. */
14281 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14282 RTX_FRAME_RELATED_P (insn) = 1;
14283
14284 /* This means that the DRAP register is valid for addressing. */
14285 m->fs.drap_valid = true;
14286 }
14287 else
14288 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14289
14290 cfa_offset -= UNITS_PER_WORD;
14291 }
14292 }
14293
14294 /* Emit code to restore saved SSE registers using MOV insns.
14295 First register is restored from CFA - CFA_OFFSET. */
14296 static void
14297 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14298 bool maybe_eh_return)
14299 {
14300 unsigned int regno;
14301
14302 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14303 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14304 {
14305 rtx reg = gen_rtx_REG (V4SFmode, regno);
14306 rtx mem;
14307 unsigned int align;
14308
14309 mem = choose_baseaddr (cfa_offset);
14310 mem = gen_rtx_MEM (V4SFmode, mem);
14311
14312 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14313 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14314 set_mem_align (mem, align);
14315 emit_insn (gen_rtx_SET (reg, mem));
14316
14317 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14318
14319 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14320 }
14321 }
14322
14323 /* Restore function stack, frame, and registers. */
14324
14325 void
14326 ix86_expand_epilogue (int style)
14327 {
14328 struct machine_function *m = cfun->machine;
14329 struct machine_frame_state frame_state_save = m->fs;
14330 struct ix86_frame frame;
14331 bool restore_regs_via_mov;
14332 bool using_drap;
14333
14334 ix86_finalize_stack_realign_flags ();
14335 ix86_compute_frame_layout (&frame);
14336
14337 m->fs.sp_valid = (!frame_pointer_needed
14338 || (crtl->sp_is_unchanging
14339 && !stack_realign_fp));
14340 gcc_assert (!m->fs.sp_valid
14341 || m->fs.sp_offset == frame.stack_pointer_offset);
14342
14343 /* The FP must be valid if the frame pointer is present. */
14344 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14345 gcc_assert (!m->fs.fp_valid
14346 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14347
14348 /* We must have *some* valid pointer to the stack frame. */
14349 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14350
14351 /* The DRAP is never valid at this point. */
14352 gcc_assert (!m->fs.drap_valid);
14353
14354 /* See the comment about red zone and frame
14355 pointer usage in ix86_expand_prologue. */
14356 if (frame_pointer_needed && frame.red_zone_size)
14357 emit_insn (gen_memory_blockage ());
14358
14359 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14360 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14361
14362 /* Determine the CFA offset of the end of the red-zone. */
14363 m->fs.red_zone_offset = 0;
14364 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14365 {
14366 /* The red-zone begins below the return address. */
14367 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14368
14369 /* When the register save area is in the aligned portion of
14370 the stack, determine the maximum runtime displacement that
14371 matches up with the aligned frame. */
14372 if (stack_realign_drap)
14373 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14374 + UNITS_PER_WORD);
14375 }
14376
14377 /* Special care must be taken for the normal return case of a function
14378 using eh_return: the eax and edx registers are marked as saved, but
14379 not restored along this path. Adjust the save location to match. */
14380 if (crtl->calls_eh_return && style != 2)
14381 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14382
14383 /* EH_RETURN requires the use of moves to function properly. */
14384 if (crtl->calls_eh_return)
14385 restore_regs_via_mov = true;
14386 /* SEH requires the use of pops to identify the epilogue. */
14387 else if (TARGET_SEH)
14388 restore_regs_via_mov = false;
14389 /* If we're only restoring one register and sp is not valid, then
14390 use a move instruction to restore the register, since that is
14391 less work than reloading sp and popping the register. */
14392 else if (!m->fs.sp_valid && frame.nregs <= 1)
14393 restore_regs_via_mov = true;
14394 else if (TARGET_EPILOGUE_USING_MOVE
14395 && cfun->machine->use_fast_prologue_epilogue
14396 && (frame.nregs > 1
14397 || m->fs.sp_offset != frame.reg_save_offset))
14398 restore_regs_via_mov = true;
14399 else if (frame_pointer_needed
14400 && !frame.nregs
14401 && m->fs.sp_offset != frame.reg_save_offset)
14402 restore_regs_via_mov = true;
14403 else if (frame_pointer_needed
14404 && TARGET_USE_LEAVE
14405 && cfun->machine->use_fast_prologue_epilogue
14406 && frame.nregs == 1)
14407 restore_regs_via_mov = true;
14408 else
14409 restore_regs_via_mov = false;
14410
14411 if (restore_regs_via_mov || frame.nsseregs)
14412 {
14413 /* Ensure that the entire register save area is addressable via
14414 the stack pointer, if we will restore via sp. */
14415 if (TARGET_64BIT
14416 && m->fs.sp_offset > 0x7fffffff
14417 && !(m->fs.fp_valid || m->fs.drap_valid)
14418 && (frame.nsseregs + frame.nregs) != 0)
14419 {
14420 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14421 GEN_INT (m->fs.sp_offset
14422 - frame.sse_reg_save_offset),
14423 style,
14424 m->fs.cfa_reg == stack_pointer_rtx);
14425 }
14426 }
14427
14428 /* If there are any SSE registers to restore, then we have to do it
14429 via moves, since there's obviously no pop for SSE regs. */
14430 if (frame.nsseregs)
14431 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14432 style == 2);
14433
14434 if (restore_regs_via_mov)
14435 {
14436 rtx t;
14437
14438 if (frame.nregs)
14439 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14440
14441 /* eh_return epilogues need %ecx added to the stack pointer. */
14442 if (style == 2)
14443 {
14444 rtx sa = EH_RETURN_STACKADJ_RTX;
14445 rtx_insn *insn;
14446
14447 /* %ecx can't be used for both DRAP register and eh_return. */
14448 if (crtl->drap_reg)
14449 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14450
14451 /* regparm nested functions don't work with eh_return. */
14452 gcc_assert (!ix86_static_chain_on_stack);
14453
14454 if (frame_pointer_needed)
14455 {
14456 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14457 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14458 emit_insn (gen_rtx_SET (sa, t));
14459
14460 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14461 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14462
14463 /* Note that we use SA as a temporary CFA, as the return
14464 address is at the proper place relative to it. We
14465 pretend this happens at the FP restore insn because
14466 prior to this insn the FP would be stored at the wrong
14467 offset relative to SA, and after this insn we have no
14468 other reasonable register to use for the CFA. We don't
14469 bother resetting the CFA to the SP for the duration of
14470 the return insn. */
14471 add_reg_note (insn, REG_CFA_DEF_CFA,
14472 plus_constant (Pmode, sa, UNITS_PER_WORD));
14473 ix86_add_queued_cfa_restore_notes (insn);
14474 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14475 RTX_FRAME_RELATED_P (insn) = 1;
14476
14477 m->fs.cfa_reg = sa;
14478 m->fs.cfa_offset = UNITS_PER_WORD;
14479 m->fs.fp_valid = false;
14480
14481 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14482 const0_rtx, style, false);
14483 }
14484 else
14485 {
14486 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14487 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14488 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14489 ix86_add_queued_cfa_restore_notes (insn);
14490
14491 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14492 if (m->fs.cfa_offset != UNITS_PER_WORD)
14493 {
14494 m->fs.cfa_offset = UNITS_PER_WORD;
14495 add_reg_note (insn, REG_CFA_DEF_CFA,
14496 plus_constant (Pmode, stack_pointer_rtx,
14497 UNITS_PER_WORD));
14498 RTX_FRAME_RELATED_P (insn) = 1;
14499 }
14500 }
14501 m->fs.sp_offset = UNITS_PER_WORD;
14502 m->fs.sp_valid = true;
14503 }
14504 }
14505 else
14506 {
14507 /* SEH requires that the function end with (1) a stack adjustment
14508 if necessary, (2) a sequence of pops, and (3) a return or
14509 jump instruction. Prevent insns from the function body from
14510 being scheduled into this sequence. */
14511 if (TARGET_SEH)
14512 {
14513 /* Prevent a catch region from being adjacent to the standard
14514 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14515 several other flags that would be interesting to test are
14516 yet set up. */
14517 if (flag_non_call_exceptions)
14518 emit_insn (gen_nops (const1_rtx));
14519 else
14520 emit_insn (gen_blockage ());
14521 }
14522
14523 /* First step is to deallocate the stack frame so that we can
14524 pop the registers. Also do it on SEH target for very large
14525 frame as the emitted instructions aren't allowed by the ABI in
14526 epilogues. */
14527 if (!m->fs.sp_valid
14528 || (TARGET_SEH
14529 && (m->fs.sp_offset - frame.reg_save_offset
14530 >= SEH_MAX_FRAME_SIZE)))
14531 {
14532 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14533 GEN_INT (m->fs.fp_offset
14534 - frame.reg_save_offset),
14535 style, false);
14536 }
14537 else if (m->fs.sp_offset != frame.reg_save_offset)
14538 {
14539 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14540 GEN_INT (m->fs.sp_offset
14541 - frame.reg_save_offset),
14542 style,
14543 m->fs.cfa_reg == stack_pointer_rtx);
14544 }
14545
14546 ix86_emit_restore_regs_using_pop ();
14547 }
14548
14549 /* If we used a frame pointer and haven't already got rid of it,
14550 then do so now. */
14551 if (m->fs.fp_valid)
14552 {
14553 /* If the stack pointer is valid and pointing at the frame
14554 pointer store address, then we only need a pop. */
14555 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14556 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14557 /* Leave results in shorter dependency chains on CPUs that are
14558 able to grok it fast. */
14559 else if (TARGET_USE_LEAVE
14560 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14561 || !cfun->machine->use_fast_prologue_epilogue)
14562 ix86_emit_leave ();
14563 else
14564 {
14565 pro_epilogue_adjust_stack (stack_pointer_rtx,
14566 hard_frame_pointer_rtx,
14567 const0_rtx, style, !using_drap);
14568 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14569 }
14570 }
14571
14572 if (using_drap)
14573 {
14574 int param_ptr_offset = UNITS_PER_WORD;
14575 rtx_insn *insn;
14576
14577 gcc_assert (stack_realign_drap);
14578
14579 if (ix86_static_chain_on_stack)
14580 param_ptr_offset += UNITS_PER_WORD;
14581 if (!call_used_regs[REGNO (crtl->drap_reg)])
14582 param_ptr_offset += UNITS_PER_WORD;
14583
14584 insn = emit_insn (gen_rtx_SET
14585 (stack_pointer_rtx,
14586 gen_rtx_PLUS (Pmode,
14587 crtl->drap_reg,
14588 GEN_INT (-param_ptr_offset))));
14589 m->fs.cfa_reg = stack_pointer_rtx;
14590 m->fs.cfa_offset = param_ptr_offset;
14591 m->fs.sp_offset = param_ptr_offset;
14592 m->fs.realigned = false;
14593
14594 add_reg_note (insn, REG_CFA_DEF_CFA,
14595 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14596 GEN_INT (param_ptr_offset)));
14597 RTX_FRAME_RELATED_P (insn) = 1;
14598
14599 if (!call_used_regs[REGNO (crtl->drap_reg)])
14600 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14601 }
14602
14603 /* At this point the stack pointer must be valid, and we must have
14604 restored all of the registers. We may not have deallocated the
14605 entire stack frame. We've delayed this until now because it may
14606 be possible to merge the local stack deallocation with the
14607 deallocation forced by ix86_static_chain_on_stack. */
14608 gcc_assert (m->fs.sp_valid);
14609 gcc_assert (!m->fs.fp_valid);
14610 gcc_assert (!m->fs.realigned);
14611 if (m->fs.sp_offset != UNITS_PER_WORD)
14612 {
14613 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14614 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14615 style, true);
14616 }
14617 else
14618 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14619
14620 /* Sibcall epilogues don't want a return instruction. */
14621 if (style == 0)
14622 {
14623 m->fs = frame_state_save;
14624 return;
14625 }
14626
14627 if (cfun->machine->func_type != TYPE_NORMAL)
14628 {
14629 /* Return with the "IRET" instruction from an interrupt handler.
14630 Pop the 'ERROR_CODE' off the stack before the 'IRET'
14631 instruction in an exception handler. */
14632 if (cfun->machine->func_type == TYPE_EXCEPTION)
14633 {
14634 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14635 UNITS_PER_WORD);
14636 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14637 }
14638 emit_jump_insn (gen_interrupt_return ());
14639 }
14640 else if (crtl->args.pops_args && crtl->args.size)
14641 {
14642 rtx popc = GEN_INT (crtl->args.pops_args);
14643
14644 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14645 address, do an explicit add, and jump indirectly to the caller. */
14646
14647 if (crtl->args.pops_args >= 65536)
14648 {
14649 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14650 rtx_insn *insn;
14651
14652 /* There is no "pascal" calling convention in any 64bit ABI. */
14653 gcc_assert (!TARGET_64BIT);
14654
14655 insn = emit_insn (gen_pop (ecx));
14656 m->fs.cfa_offset -= UNITS_PER_WORD;
14657 m->fs.sp_offset -= UNITS_PER_WORD;
14658
14659 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14660 x = gen_rtx_SET (stack_pointer_rtx, x);
14661 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14662 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14663 RTX_FRAME_RELATED_P (insn) = 1;
14664
14665 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14666 popc, -1, true);
14667 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14668 }
14669 else
14670 emit_jump_insn (gen_simple_return_pop_internal (popc));
14671 }
14672 else
14673 emit_jump_insn (gen_simple_return_internal ());
14674
14675 /* Restore the state back to the state from the prologue,
14676 so that it's correct for the next epilogue. */
14677 m->fs = frame_state_save;
14678 }
14679
14680 /* Reset from the function's potential modifications. */
14681
14682 static void
14683 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14684 {
14685 if (pic_offset_table_rtx
14686 && !ix86_use_pseudo_pic_reg ())
14687 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14688
14689 if (TARGET_MACHO)
14690 {
14691 rtx_insn *insn = get_last_insn ();
14692 rtx_insn *deleted_debug_label = NULL;
14693
14694 /* Mach-O doesn't support labels at the end of objects, so if
14695 it looks like we might want one, take special action.
14696 First, collect any sequence of deleted debug labels. */
14697 while (insn
14698 && NOTE_P (insn)
14699 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14700 {
14701 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14702 notes only, instead set their CODE_LABEL_NUMBER to -1,
14703 otherwise there would be code generation differences
14704 in between -g and -g0. */
14705 if (NOTE_P (insn) && NOTE_KIND (insn)
14706 == NOTE_INSN_DELETED_DEBUG_LABEL)
14707 deleted_debug_label = insn;
14708 insn = PREV_INSN (insn);
14709 }
14710
14711 /* If we have:
14712 label:
14713 barrier
14714 then this needs to be detected, so skip past the barrier. */
14715
14716 if (insn && BARRIER_P (insn))
14717 insn = PREV_INSN (insn);
14718
14719 /* Up to now we've only seen notes or barriers. */
14720 if (insn)
14721 {
14722 if (LABEL_P (insn)
14723 || (NOTE_P (insn)
14724 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14725 /* Trailing label. */
14726 fputs ("\tnop\n", file);
14727 else if (cfun && ! cfun->is_thunk)
14728 {
14729 /* See if we have a completely empty function body, skipping
14730 the special case of the picbase thunk emitted as asm. */
14731 while (insn && ! INSN_P (insn))
14732 insn = PREV_INSN (insn);
14733 /* If we don't find any insns, we've got an empty function body;
14734 i.e. completely empty, without a return or branch. This is
14735 taken as the case where a function body has been removed
14736 because it contains an inline __builtin_unreachable(). GCC
14737 declares that reaching __builtin_unreachable() means UB so
14738 we're not obliged to do anything special; however, we want
14739 non-zero-sized function bodies. To meet this, and help the
14740 user out, let's trap the case. */
14741 if (insn == NULL)
14742 fputs ("\tud2\n", file);
14743 }
14744 }
14745 else if (deleted_debug_label)
14746 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14747 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14748 CODE_LABEL_NUMBER (insn) = -1;
14749 }
14750 }
14751
14752 /* Return a scratch register to use in the split stack prologue. The
14753 split stack prologue is used for -fsplit-stack. It is the first
14754 instructions in the function, even before the regular prologue.
14755 The scratch register can be any caller-saved register which is not
14756 used for parameters or for the static chain. */
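/* Summarizing the body below: %r11 in 64-bit mode; %eax for fastcall
   (sorry () if a static chain is needed); %edx for thiscall, or %eax when
   there is a static chain; %ecx for regparm < 3, or %edx when there is a
   static chain (sorry () if two register parameters are used as well);
   otherwise sorry ().  */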
14757
14758 static unsigned int
14759 split_stack_prologue_scratch_regno (void)
14760 {
14761 if (TARGET_64BIT)
14762 return R11_REG;
14763 else
14764 {
14765 bool is_fastcall, is_thiscall;
14766 int regparm;
14767
14768 is_fastcall = (lookup_attribute ("fastcall",
14769 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14770 != NULL);
14771 is_thiscall = (lookup_attribute ("thiscall",
14772 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14773 != NULL);
14774 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14775
14776 if (is_fastcall)
14777 {
14778 if (DECL_STATIC_CHAIN (cfun->decl))
14779 {
14780 sorry ("-fsplit-stack does not support fastcall with "
14781 "nested function");
14782 return INVALID_REGNUM;
14783 }
14784 return AX_REG;
14785 }
14786 else if (is_thiscall)
14787 {
14788 if (!DECL_STATIC_CHAIN (cfun->decl))
14789 return DX_REG;
14790 return AX_REG;
14791 }
14792 else if (regparm < 3)
14793 {
14794 if (!DECL_STATIC_CHAIN (cfun->decl))
14795 return CX_REG;
14796 else
14797 {
14798 if (regparm >= 2)
14799 {
14800 sorry ("-fsplit-stack does not support 2 register "
14801 "parameters for a nested function");
14802 return INVALID_REGNUM;
14803 }
14804 return DX_REG;
14805 }
14806 }
14807 else
14808 {
14809 /* FIXME: We could make this work by pushing a register
14810 around the addition and comparison. */
14811 sorry ("-fsplit-stack does not support 3 register parameters");
14812 return INVALID_REGNUM;
14813 }
14814 }
14815 }
14816
14817 /* A SYMBOL_REF for the function which allocates new stack space for
14818 -fsplit-stack. */
14819
14820 static GTY(()) rtx split_stack_fn;
14821
14822 /* A SYMBOL_REF for the variant of __morestack used with the large
14823 code model. */
14824
14825 static GTY(()) rtx split_stack_fn_large;
14826
14827 /* Handle -fsplit-stack. These are the first instructions in the
14828 function, even before the regular prologue. */
14829
14830 void
14831 ix86_expand_split_stack_prologue (void)
14832 {
14833 struct ix86_frame frame;
14834 HOST_WIDE_INT allocate;
14835 unsigned HOST_WIDE_INT args_size;
14836 rtx_code_label *label;
14837 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14838 rtx scratch_reg = NULL_RTX;
14839 rtx_code_label *varargs_label = NULL;
14840 rtx fn;
14841
14842 gcc_assert (flag_split_stack && reload_completed);
14843
14844 ix86_finalize_stack_realign_flags ();
14845 ix86_compute_frame_layout (&frame);
14846 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14847
14848 /* This is the label we will branch to if we have enough stack
14849 space. We expect the basic block reordering pass to reverse this
14850 branch if optimizing, so that we branch in the unlikely case. */
14851 label = gen_label_rtx ();
14852
14853 /* We need to compare the stack pointer minus the frame size with
14854 the stack boundary in the TCB. The stack boundary always gives
14855 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14856 can compare directly. Otherwise we need to do an addition. */
14857
14858 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14859 UNSPEC_STACK_CHECK);
14860 limit = gen_rtx_CONST (Pmode, limit);
14861 limit = gen_rtx_MEM (Pmode, limit);
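/* Illustrative note: the UNSPEC_STACK_CHECK wrapper is presumably what
   later makes the address-printing code emit this as a TLS
   (segment-relative) reference, since the boundary lives in the TCB as
   described above; the exact slot is determined elsewhere.  */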
14862 if (allocate < SPLIT_STACK_AVAILABLE)
14863 current = stack_pointer_rtx;
14864 else
14865 {
14866 unsigned int scratch_regno;
14867 rtx offset;
14868
14869 /* We need a scratch register to hold the stack pointer minus
14870 the required frame size. Since this is the very start of the
14871 function, the scratch register can be any caller-saved
14872 register which is not used for parameters. */
14873 offset = GEN_INT (- allocate);
14874 scratch_regno = split_stack_prologue_scratch_regno ();
14875 if (scratch_regno == INVALID_REGNUM)
14876 return;
14877 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14878 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14879 {
14880 /* We don't use ix86_gen_add3 in this case because it will
14881 want to split to lea, but when not optimizing the insn
14882 will not be split after this point. */
14883 emit_insn (gen_rtx_SET (scratch_reg,
14884 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14885 offset)));
14886 }
14887 else
14888 {
14889 emit_move_insn (scratch_reg, offset);
14890 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14891 stack_pointer_rtx));
14892 }
14893 current = scratch_reg;
14894 }
14895
14896 ix86_expand_branch (GEU, current, limit, label);
14897 rtx_insn *jump_insn = get_last_insn ();
14898 JUMP_LABEL (jump_insn) = label;
14899
14900 /* Mark the jump as very likely to be taken. */
14901 add_int_reg_note (jump_insn, REG_BR_PROB,
14902 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
14903
14904 if (split_stack_fn == NULL_RTX)
14905 {
14906 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14907 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14908 }
14909 fn = split_stack_fn;
14910
14911 /* Get more stack space. We pass in the desired stack space and the
14912 size of the arguments to copy to the new stack. In 32-bit mode
14913 we push the parameters; __morestack will return on a new stack
14914 anyhow. In 64-bit mode we pass the parameters in r10 and
14915 r11. */
14916 allocate_rtx = GEN_INT (allocate);
14917 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14918 call_fusage = NULL_RTX;
14919 if (TARGET_64BIT)
14920 {
14921 rtx reg10, reg11;
14922
14923 reg10 = gen_rtx_REG (Pmode, R10_REG);
14924 reg11 = gen_rtx_REG (Pmode, R11_REG);
14925
14926 /* If this function uses a static chain, it will be in %r10.
14927 Preserve it across the call to __morestack. */
14928 if (DECL_STATIC_CHAIN (cfun->decl))
14929 {
14930 rtx rax;
14931
14932 rax = gen_rtx_REG (word_mode, AX_REG);
14933 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14934 use_reg (&call_fusage, rax);
14935 }
14936
14937 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14938 && !TARGET_PECOFF)
14939 {
14940 HOST_WIDE_INT argval;
14941
14942 gcc_assert (Pmode == DImode);
14943 /* When using the large model we need to load the address
14944 into a register, and we've run out of registers. So we
14945 switch to a different calling convention, and we call a
14946 different function: __morestack_large_model. We pass the
14947 argument size in the upper 32 bits of r10 and pass the
14948 frame size in the lower 32 bits. */
14949 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14950 gcc_assert ((args_size & 0xffffffff) == args_size);
14951
14952 if (split_stack_fn_large == NULL_RTX)
14953 {
14954 split_stack_fn_large =
14955 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14956 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14957 }
14958 if (ix86_cmodel == CM_LARGE_PIC)
14959 {
14960 rtx_code_label *label;
14961 rtx x;
14962
14963 label = gen_label_rtx ();
14964 emit_label (label);
14965 LABEL_PRESERVE_P (label) = 1;
14966 emit_insn (gen_set_rip_rex64 (reg10, label));
14967 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14968 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14969 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14970 UNSPEC_GOT);
14971 x = gen_rtx_CONST (Pmode, x);
14972 emit_move_insn (reg11, x);
14973 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14974 x = gen_const_mem (Pmode, x);
14975 emit_move_insn (reg11, x);
14976 }
14977 else
14978 emit_move_insn (reg11, split_stack_fn_large);
14979
14980 fn = reg11;
14981
14982 argval = ((args_size << 16) << 16) + allocate;
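/* Packing example (illustrative): args_size == 0x20 and allocate ==
   0x1000 give argval == 0x0000002000001000, i.e. the argument size in
   the upper 32 bits and the frame size in the lower 32 bits, as
   described above.  */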
14983 emit_move_insn (reg10, GEN_INT (argval));
14984 }
14985 else
14986 {
14987 emit_move_insn (reg10, allocate_rtx);
14988 emit_move_insn (reg11, GEN_INT (args_size));
14989 use_reg (&call_fusage, reg11);
14990 }
14991
14992 use_reg (&call_fusage, reg10);
14993 }
14994 else
14995 {
14996 emit_insn (gen_push (GEN_INT (args_size)));
14997 emit_insn (gen_push (allocate_rtx));
14998 }
14999 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15000 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15001 NULL_RTX, false);
15002 add_function_usage_to (call_insn, call_fusage);
15003
15004 /* In order to make call/return prediction work right, we now need
15005 to execute a return instruction. See
15006 libgcc/config/i386/morestack.S for the details on how this works.
15007
15008 For flow purposes gcc must not see this as a return
15009 instruction--we need control flow to continue at the subsequent
15010 label. Therefore, we use an unspec. */
15011 gcc_assert (crtl->args.pops_args < 65536);
15012 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15013
15014 /* If we are in 64-bit mode and this function uses a static chain,
15015 we saved %r10 in %rax before calling __morestack. */
15016 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15017 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15018 gen_rtx_REG (word_mode, AX_REG));
15019
15020 /* If this function calls va_start, we need to store a pointer to
15021 the arguments on the old stack, because they may not have been
15022 all copied to the new stack. At this point the old stack can be
15023 found at the frame pointer value used by __morestack, because
15024 __morestack has set that up before calling back to us. Here we
15025 store that pointer in a scratch register, and in
15026 ix86_expand_prologue we store the scratch register in a stack
15027 slot. */
15028 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15029 {
15030 unsigned int scratch_regno;
15031 rtx frame_reg;
15032 int words;
15033
15034 scratch_regno = split_stack_prologue_scratch_regno ();
15035 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15036 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15037
15038 /* 64-bit:
15039 fp -> old fp value
15040 return address within this function
15041 return address of caller of this function
15042 stack arguments
15043 So we add three words to get to the stack arguments.
15044
15045 32-bit:
15046 fp -> old fp value
15047 return address within this function
15048 first argument to __morestack
15049 second argument to __morestack
15050 return address of caller of this function
15051 stack arguments
15052 So we add five words to get to the stack arguments.
15053 */
15054 words = TARGET_64BIT ? 3 : 5;
15055 emit_insn (gen_rtx_SET (scratch_reg,
15056 gen_rtx_PLUS (Pmode, frame_reg,
15057 GEN_INT (words * UNITS_PER_WORD))));
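/* E.g. on x86-64 this computes scratch_reg = %rbp + 24 (3 words of
   8 bytes); in 32-bit mode it is %ebp + 20 (5 words of 4 bytes).  */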
15058
15059 varargs_label = gen_label_rtx ();
15060 emit_jump_insn (gen_jump (varargs_label));
15061 JUMP_LABEL (get_last_insn ()) = varargs_label;
15062
15063 emit_barrier ();
15064 }
15065
15066 emit_label (label);
15067 LABEL_NUSES (label) = 1;
15068
15069 /* If this function calls va_start, we now have to set the scratch
15070 register for the case where we do not call __morestack. In this
15071 case we need to set it based on the stack pointer. */
15072 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15073 {
15074 emit_insn (gen_rtx_SET (scratch_reg,
15075 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15076 GEN_INT (UNITS_PER_WORD))));
15077
15078 emit_label (varargs_label);
15079 LABEL_NUSES (varargs_label) = 1;
15080 }
15081 }
15082
15083 /* We may have to tell the dataflow pass that the split stack prologue
15084 is initializing a scratch register. */
15085
15086 static void
15087 ix86_live_on_entry (bitmap regs)
15088 {
15089 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15090 {
15091 gcc_assert (flag_split_stack);
15092 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15093 }
15094 }
15095 \f
15096 /* Extract the parts of an RTL expression that is a valid memory address
15097 for an instruction. Return 0 if the structure of the address is
15098 grossly off. Return -1 if the address contains ASHIFT, so it is not
15099 strictly valid, but still used for computing the length of the lea instruction. */
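/* For example (an illustrative sketch), the address
   (plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 8))
   decomposes into out->base = A, out->index = B, out->scale = 4 and
   out->disp = (const_int 8), with out->seg left as ADDR_SPACE_GENERIC.  */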
15100
15101 int
15102 ix86_decompose_address (rtx addr, struct ix86_address *out)
15103 {
15104 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15105 rtx base_reg, index_reg;
15106 HOST_WIDE_INT scale = 1;
15107 rtx scale_rtx = NULL_RTX;
15108 rtx tmp;
15109 int retval = 1;
15110 addr_space_t seg = ADDR_SPACE_GENERIC;
15111
15112 /* Allow zero-extended SImode addresses;
15113 they will be emitted with the addr32 prefix. */
15114 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15115 {
15116 if (GET_CODE (addr) == ZERO_EXTEND
15117 && GET_MODE (XEXP (addr, 0)) == SImode)
15118 {
15119 addr = XEXP (addr, 0);
15120 if (CONST_INT_P (addr))
15121 return 0;
15122 }
15123 else if (GET_CODE (addr) == AND
15124 && const_32bit_mask (XEXP (addr, 1), DImode))
15125 {
15126 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15127 if (addr == NULL_RTX)
15128 return 0;
15129
15130 if (CONST_INT_P (addr))
15131 return 0;
15132 }
15133 }
15134
15135 /* Allow SImode subregs of DImode addresses;
15136 they will be emitted with the addr32 prefix. */
15137 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15138 {
15139 if (SUBREG_P (addr)
15140 && GET_MODE (SUBREG_REG (addr)) == DImode)
15141 {
15142 addr = SUBREG_REG (addr);
15143 if (CONST_INT_P (addr))
15144 return 0;
15145 }
15146 }
15147
15148 if (REG_P (addr))
15149 base = addr;
15150 else if (SUBREG_P (addr))
15151 {
15152 if (REG_P (SUBREG_REG (addr)))
15153 base = addr;
15154 else
15155 return 0;
15156 }
15157 else if (GET_CODE (addr) == PLUS)
15158 {
15159 rtx addends[4], op;
15160 int n = 0, i;
15161
15162 op = addr;
15163 do
15164 {
15165 if (n >= 4)
15166 return 0;
15167 addends[n++] = XEXP (op, 1);
15168 op = XEXP (op, 0);
15169 }
15170 while (GET_CODE (op) == PLUS);
15171 if (n >= 4)
15172 return 0;
15173 addends[n] = op;
15174
15175 for (i = n; i >= 0; --i)
15176 {
15177 op = addends[i];
15178 switch (GET_CODE (op))
15179 {
15180 case MULT:
15181 if (index)
15182 return 0;
15183 index = XEXP (op, 0);
15184 scale_rtx = XEXP (op, 1);
15185 break;
15186
15187 case ASHIFT:
15188 if (index)
15189 return 0;
15190 index = XEXP (op, 0);
15191 tmp = XEXP (op, 1);
15192 if (!CONST_INT_P (tmp))
15193 return 0;
15194 scale = INTVAL (tmp);
15195 if ((unsigned HOST_WIDE_INT) scale > 3)
15196 return 0;
15197 scale = 1 << scale;
15198 break;
15199
15200 case ZERO_EXTEND:
15201 op = XEXP (op, 0);
15202 if (GET_CODE (op) != UNSPEC)
15203 return 0;
15204 /* FALLTHRU */
15205
15206 case UNSPEC:
15207 if (XINT (op, 1) == UNSPEC_TP
15208 && TARGET_TLS_DIRECT_SEG_REFS
15209 && seg == ADDR_SPACE_GENERIC)
15210 seg = DEFAULT_TLS_SEG_REG;
15211 else
15212 return 0;
15213 break;
15214
15215 case SUBREG:
15216 if (!REG_P (SUBREG_REG (op)))
15217 return 0;
15218 /* FALLTHRU */
15219
15220 case REG:
15221 if (!base)
15222 base = op;
15223 else if (!index)
15224 index = op;
15225 else
15226 return 0;
15227 break;
15228
15229 case CONST:
15230 case CONST_INT:
15231 case SYMBOL_REF:
15232 case LABEL_REF:
15233 if (disp)
15234 return 0;
15235 disp = op;
15236 break;
15237
15238 default:
15239 return 0;
15240 }
15241 }
15242 }
15243 else if (GET_CODE (addr) == MULT)
15244 {
15245 index = XEXP (addr, 0); /* index*scale */
15246 scale_rtx = XEXP (addr, 1);
15247 }
15248 else if (GET_CODE (addr) == ASHIFT)
15249 {
15250 /* We're called for lea too, which implements ashift on occasion. */
15251 index = XEXP (addr, 0);
15252 tmp = XEXP (addr, 1);
15253 if (!CONST_INT_P (tmp))
15254 return 0;
15255 scale = INTVAL (tmp);
15256 if ((unsigned HOST_WIDE_INT) scale > 3)
15257 return 0;
15258 scale = 1 << scale;
15259 retval = -1;
15260 }
15261 else
15262 disp = addr; /* displacement */
15263
15264 if (index)
15265 {
15266 if (REG_P (index))
15267 ;
15268 else if (SUBREG_P (index)
15269 && REG_P (SUBREG_REG (index)))
15270 ;
15271 else
15272 return 0;
15273 }
15274
15275 /* Extract the integral value of scale. */
15276 if (scale_rtx)
15277 {
15278 if (!CONST_INT_P (scale_rtx))
15279 return 0;
15280 scale = INTVAL (scale_rtx);
15281 }
15282
15283 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15284 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15285
15286 /* Avoid useless 0 displacement. */
15287 if (disp == const0_rtx && (base || index))
15288 disp = NULL_RTX;
15289
15290 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
15291 if (base_reg && index_reg && scale == 1
15292 && (index_reg == arg_pointer_rtx
15293 || index_reg == frame_pointer_rtx
15294 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15295 {
15296 std::swap (base, index);
15297 std::swap (base_reg, index_reg);
15298 }
15299
15300 /* Special case: %ebp cannot be encoded as a base without a displacement.
15301 Similarly %r13. */
15302 if (!disp
15303 && base_reg
15304 && (base_reg == hard_frame_pointer_rtx
15305 || base_reg == frame_pointer_rtx
15306 || base_reg == arg_pointer_rtx
15307 || (REG_P (base_reg)
15308 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15309 || REGNO (base_reg) == R13_REG))))
15310 disp = const0_rtx;
15311
15312 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
15313 Avoid this by transforming to [%esi+0].
15314 Reload calls address legitimization without cfun defined, so we need
15315 to test cfun for being non-NULL. */
15316 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15317 && base_reg && !index_reg && !disp
15318 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15319 disp = const0_rtx;
15320
15321 /* Special case: encode reg+reg instead of reg*2. */
15322 if (!base && index && scale == 2)
15323 base = index, base_reg = index_reg, scale = 1;
15324
15325 /* Special case: scaling cannot be encoded without base or displacement. */
15326 if (!base && !disp && index && scale != 1)
15327 disp = const0_rtx;
15328
15329 out->base = base;
15330 out->index = index;
15331 out->disp = disp;
15332 out->scale = scale;
15333 out->seg = seg;
15334
15335 return retval;
15336 }
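/* Illustrative note, not part of the original source: for a typical
   scaled-index address such as 8(%ebx,%eax,4), i.e. the RTL
   (plus (plus (mult (reg eax) (const_int 4)) (reg ebx)) (const_int 8)),
   the decomposition above yields base = %ebx, index = %eax, scale = 4,
   disp = (const_int 8) and seg = ADDR_SPACE_GENERIC, with a nonzero
   return value.  */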
15337 \f
15338 /* Return cost of the memory address x.
15339 For i386, it is better to use a complex address than let gcc copy
15340 the address into a reg and make a new pseudo. But not if the address
15341 requires two regs - that would mean more pseudos with longer
15342 lifetimes. */
15343 static int
15344 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15345 {
15346 struct ix86_address parts;
15347 int cost = 1;
15348 int ok = ix86_decompose_address (x, &parts);
15349
15350 gcc_assert (ok);
15351
15352 if (parts.base && SUBREG_P (parts.base))
15353 parts.base = SUBREG_REG (parts.base);
15354 if (parts.index && SUBREG_P (parts.index))
15355 parts.index = SUBREG_REG (parts.index);
15356
15357 /* Attempt to minimize number of registers in the address by increasing
15358 address cost for each used register. We don't increase address cost
15359 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15360 is not invariant itself it most likely means that base or index is not
15361 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15362 which is not profitable for x86. */
15363 if (parts.base
15364 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15365 && (current_pass->type == GIMPLE_PASS
15366 || !pic_offset_table_rtx
15367 || !REG_P (parts.base)
15368 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15369 cost++;
15370
15371 if (parts.index
15372 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15373 && (current_pass->type == GIMPLE_PASS
15374 || !pic_offset_table_rtx
15375 || !REG_P (parts.index)
15376 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15377 cost++;
15378
15379 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15380 since its predecode logic can't detect the length of instructions
15381 and it degenerates to vector decoding. Increase the cost of such
15382 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15383 to split such addresses or even refuse such addresses at all.
15384
15385 The following addressing modes are affected:
15386 [base+scale*index]
15387 [scale*index+disp]
15388 [base+index]
15389
15390 The first and last cases may be avoidable by explicitly coding the zero
15391 in the memory address, but I don't have an AMD-K6 machine handy to check
15392 this theory. */
15393
15394 if (TARGET_K6
15395 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15396 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15397 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15398 cost += 10;
15399
15400 return cost;
15401 }
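/* Rough illustration, an assumption about typical pseudo-register operands
   rather than text from the original source: ignoring the AMD-K6 penalty
   above, an address using two pseudo registers such as (%reg1,%reg2,4)
   costs 1 + 1 + 1 = 3, a plain (%reg1) costs 2, and a bare symbolic
   displacement keeps the base cost of 1.  */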
15402 \f
15403 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15404 this is used to form addresses to local data when -fPIC is in
15405 use. */
15406
15407 static bool
15408 darwin_local_data_pic (rtx disp)
15409 {
15410 return (GET_CODE (disp) == UNSPEC
15411 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15412 }
15413
15414 /* True if operand X should be loaded from GOT. */
15415
15416 bool
15417 ix86_force_load_from_GOT_p (rtx x)
15418 {
15419 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15420 && !TARGET_PECOFF && !TARGET_MACHO
15421 && !flag_plt && !flag_pic
15422 && ix86_cmodel != CM_LARGE
15423 && GET_CODE (x) == SYMBOL_REF
15424 && SYMBOL_REF_FUNCTION_P (x)
15425 && !SYMBOL_REF_LOCAL_P (x));
15426 }
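/* Sketch of the intended effect, not from the original source: with
   -fno-plt -fno-pic on a 64-bit ELF target, a call to a non-local external
   function is expected to be emitted as
       call    *foo@GOTPCREL(%rip)
   instead of a direct call that would require a PLT entry.  */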
15427
15428 /* Determine if a given RTX is a valid constant. We already know this
15429 satisfies CONSTANT_P. */
15430
15431 static bool
15432 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15433 {
15434 /* Pointer bounds constants are not valid. */
15435 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15436 return false;
15437
15438 switch (GET_CODE (x))
15439 {
15440 case CONST:
15441 x = XEXP (x, 0);
15442
15443 if (GET_CODE (x) == PLUS)
15444 {
15445 if (!CONST_INT_P (XEXP (x, 1)))
15446 return false;
15447 x = XEXP (x, 0);
15448 }
15449
15450 if (TARGET_MACHO && darwin_local_data_pic (x))
15451 return true;
15452
15453 /* Only some unspecs are valid as "constants". */
15454 if (GET_CODE (x) == UNSPEC)
15455 switch (XINT (x, 1))
15456 {
15457 case UNSPEC_GOT:
15458 case UNSPEC_GOTOFF:
15459 case UNSPEC_PLTOFF:
15460 return TARGET_64BIT;
15461 case UNSPEC_TPOFF:
15462 case UNSPEC_NTPOFF:
15463 x = XVECEXP (x, 0, 0);
15464 return (GET_CODE (x) == SYMBOL_REF
15465 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15466 case UNSPEC_DTPOFF:
15467 x = XVECEXP (x, 0, 0);
15468 return (GET_CODE (x) == SYMBOL_REF
15469 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15470 default:
15471 return false;
15472 }
15473
15474 /* We must have drilled down to a symbol. */
15475 if (GET_CODE (x) == LABEL_REF)
15476 return true;
15477 if (GET_CODE (x) != SYMBOL_REF)
15478 return false;
15479 /* FALLTHRU */
15480
15481 case SYMBOL_REF:
15482 /* TLS symbols are never valid. */
15483 if (SYMBOL_REF_TLS_MODEL (x))
15484 return false;
15485
15486 /* DLLIMPORT symbols are never valid. */
15487 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15488 && SYMBOL_REF_DLLIMPORT_P (x))
15489 return false;
15490
15491 #if TARGET_MACHO
15492 /* mdynamic-no-pic */
15493 if (MACHO_DYNAMIC_NO_PIC_P)
15494 return machopic_symbol_defined_p (x);
15495 #endif
15496
15497 /* External function address should be loaded
15498 via the GOT slot to avoid PLT. */
15499 if (ix86_force_load_from_GOT_p (x))
15500 return false;
15501
15502 break;
15503
15504 CASE_CONST_SCALAR_INT:
15505 switch (mode)
15506 {
15507 case TImode:
15508 if (TARGET_64BIT)
15509 return true;
15510 /* FALLTHRU */
15511 case OImode:
15512 case XImode:
15513 if (!standard_sse_constant_p (x, mode))
15514 return false;
15515 default:
15516 break;
15517 }
15518 break;
15519
15520 case CONST_VECTOR:
15521 if (!standard_sse_constant_p (x, mode))
15522 return false;
15523
15524 default:
15525 break;
15526 }
15527
15528 /* Otherwise we handle everything else in the move patterns. */
15529 return true;
15530 }
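/* Illustrative examples, not from the original source: the address of a
   __thread variable (a SYMBOL_REF with a TLS model) is rejected above, as
   is a dllimport symbol, while an ordinary (const (plus (symbol_ref)
   (const_int))) expression falls through and is accepted.  */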
15531
15532 /* Determine if it's legal to put X into the constant pool. This
15533 is not possible for the addresses of thread-local symbols, which
15534 are checked above. */
15535
15536 static bool
15537 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15538 {
15539 /* We can put any immediate constant in memory. */
15540 switch (GET_CODE (x))
15541 {
15542 CASE_CONST_ANY:
15543 return false;
15544
15545 default:
15546 break;
15547 }
15548
15549 return !ix86_legitimate_constant_p (mode, x);
15550 }
15551
15552 /* Nonzero if the symbol is marked as dllimport, or as a stub-variable,
15553 otherwise zero. */
15554
15555 static bool
15556 is_imported_p (rtx x)
15557 {
15558 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15559 || GET_CODE (x) != SYMBOL_REF)
15560 return false;
15561
15562 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15563 }
15564
15565
15566 /* Nonzero if the constant value X is a legitimate general operand
15567 when generating PIC code. It is given that flag_pic is on and
15568 that X satisfies CONSTANT_P. */
15569
15570 bool
15571 legitimate_pic_operand_p (rtx x)
15572 {
15573 rtx inner;
15574
15575 switch (GET_CODE (x))
15576 {
15577 case CONST:
15578 inner = XEXP (x, 0);
15579 if (GET_CODE (inner) == PLUS
15580 && CONST_INT_P (XEXP (inner, 1)))
15581 inner = XEXP (inner, 0);
15582
15583 /* Only some unspecs are valid as "constants". */
15584 if (GET_CODE (inner) == UNSPEC)
15585 switch (XINT (inner, 1))
15586 {
15587 case UNSPEC_GOT:
15588 case UNSPEC_GOTOFF:
15589 case UNSPEC_PLTOFF:
15590 return TARGET_64BIT;
15591 case UNSPEC_TPOFF:
15592 x = XVECEXP (inner, 0, 0);
15593 return (GET_CODE (x) == SYMBOL_REF
15594 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15595 case UNSPEC_MACHOPIC_OFFSET:
15596 return legitimate_pic_address_disp_p (x);
15597 default:
15598 return false;
15599 }
15600 /* FALLTHRU */
15601
15602 case SYMBOL_REF:
15603 case LABEL_REF:
15604 return legitimate_pic_address_disp_p (x);
15605
15606 default:
15607 return true;
15608 }
15609 }
15610
15611 /* Determine if a given CONST RTX is a valid memory displacement
15612 in PIC mode. */
15613
15614 bool
15615 legitimate_pic_address_disp_p (rtx disp)
15616 {
15617 bool saw_plus;
15618
15619 /* In 64bit mode we can allow direct addresses of symbols and labels
15620 when they are not dynamic symbols. */
15621 if (TARGET_64BIT)
15622 {
15623 rtx op0 = disp, op1;
15624
15625 switch (GET_CODE (disp))
15626 {
15627 case LABEL_REF:
15628 return true;
15629
15630 case CONST:
15631 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15632 break;
15633 op0 = XEXP (XEXP (disp, 0), 0);
15634 op1 = XEXP (XEXP (disp, 0), 1);
15635 if (!CONST_INT_P (op1)
15636 || INTVAL (op1) >= 16*1024*1024
15637 || INTVAL (op1) < -16*1024*1024)
15638 break;
15639 if (GET_CODE (op0) == LABEL_REF)
15640 return true;
15641 if (GET_CODE (op0) == CONST
15642 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15643 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15644 return true;
15645 if (GET_CODE (op0) == UNSPEC
15646 && XINT (op0, 1) == UNSPEC_PCREL)
15647 return true;
15648 if (GET_CODE (op0) != SYMBOL_REF)
15649 break;
15650 /* FALLTHRU */
15651
15652 case SYMBOL_REF:
15653 /* TLS references should always be enclosed in UNSPEC.
15654 A dllimported symbol always needs to be resolved. */
15655 if (SYMBOL_REF_TLS_MODEL (op0)
15656 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15657 return false;
15658
15659 if (TARGET_PECOFF)
15660 {
15661 if (is_imported_p (op0))
15662 return true;
15663
15664 if (SYMBOL_REF_FAR_ADDR_P (op0)
15665 || !SYMBOL_REF_LOCAL_P (op0))
15666 break;
15667
15668 /* Function symbols need to be resolved only for
15669 the large model.
15670 For the small model we don't need to resolve
15671 anything here. */
15672 if ((ix86_cmodel != CM_LARGE_PIC
15673 && SYMBOL_REF_FUNCTION_P (op0))
15674 || ix86_cmodel == CM_SMALL_PIC)
15675 return true;
15676 /* Non-external symbols don't need to be resolved for
15677 the large and medium models. */
15678 if ((ix86_cmodel == CM_LARGE_PIC
15679 || ix86_cmodel == CM_MEDIUM_PIC)
15680 && !SYMBOL_REF_EXTERNAL_P (op0))
15681 return true;
15682 }
15683 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15684 && (SYMBOL_REF_LOCAL_P (op0)
15685 || (HAVE_LD_PIE_COPYRELOC
15686 && flag_pie
15687 && !SYMBOL_REF_WEAK (op0)
15688 && !SYMBOL_REF_FUNCTION_P (op0)))
15689 && ix86_cmodel != CM_LARGE_PIC)
15690 return true;
15691 break;
15692
15693 default:
15694 break;
15695 }
15696 }
15697 if (GET_CODE (disp) != CONST)
15698 return false;
15699 disp = XEXP (disp, 0);
15700
15701 if (TARGET_64BIT)
15702 {
15703 /* It is unsafe to allow PLUS expressions; this limits the allowed
15704 distance of GOT table references. We should not need these anyway. */
15705 if (GET_CODE (disp) != UNSPEC
15706 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15707 && XINT (disp, 1) != UNSPEC_GOTOFF
15708 && XINT (disp, 1) != UNSPEC_PCREL
15709 && XINT (disp, 1) != UNSPEC_PLTOFF))
15710 return false;
15711
15712 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15713 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15714 return false;
15715 return true;
15716 }
15717
15718 saw_plus = false;
15719 if (GET_CODE (disp) == PLUS)
15720 {
15721 if (!CONST_INT_P (XEXP (disp, 1)))
15722 return false;
15723 disp = XEXP (disp, 0);
15724 saw_plus = true;
15725 }
15726
15727 if (TARGET_MACHO && darwin_local_data_pic (disp))
15728 return true;
15729
15730 if (GET_CODE (disp) != UNSPEC)
15731 return false;
15732
15733 switch (XINT (disp, 1))
15734 {
15735 case UNSPEC_GOT:
15736 if (saw_plus)
15737 return false;
15738 /* We need to check for both symbols and labels because VxWorks loads
15739 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15740 details. */
15741 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15742 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15743 case UNSPEC_GOTOFF:
15744 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15745 While the ABI also specifies a 32bit relocation, we don't produce it in
15746 the small PIC model at all. */
15747 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15748 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15749 && !TARGET_64BIT)
15750 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15751 return false;
15752 case UNSPEC_GOTTPOFF:
15753 case UNSPEC_GOTNTPOFF:
15754 case UNSPEC_INDNTPOFF:
15755 if (saw_plus)
15756 return false;
15757 disp = XVECEXP (disp, 0, 0);
15758 return (GET_CODE (disp) == SYMBOL_REF
15759 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15760 case UNSPEC_NTPOFF:
15761 disp = XVECEXP (disp, 0, 0);
15762 return (GET_CODE (disp) == SYMBOL_REF
15763 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15764 case UNSPEC_DTPOFF:
15765 disp = XVECEXP (disp, 0, 0);
15766 return (GET_CODE (disp) == SYMBOL_REF
15767 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15768 }
15769
15770 return false;
15771 }
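/* Illustrative examples, not from the original source: in 32-bit PIC code
   the displacements accepted above correspond to assembler operands such
   as foo@GOTOFF, foo@GOT, foo@ntpoff or foo@dtpoff, while the 64-bit CONST
   path only admits the GOTPCREL, GOTOFF, PCREL and PLTOFF unspecs.  */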
15772
15773 /* Determine if OP is a suitable RTX for an address register.
15774 Return the naked register if a register or a register subreg is
15775 found, otherwise return NULL_RTX. */
15776
15777 static rtx
15778 ix86_validate_address_register (rtx op)
15779 {
15780 machine_mode mode = GET_MODE (op);
15781
15782 /* Only SImode or DImode registers can form the address. */
15783 if (mode != SImode && mode != DImode)
15784 return NULL_RTX;
15785
15786 if (REG_P (op))
15787 return op;
15788 else if (SUBREG_P (op))
15789 {
15790 rtx reg = SUBREG_REG (op);
15791
15792 if (!REG_P (reg))
15793 return NULL_RTX;
15794
15795 mode = GET_MODE (reg);
15796
15797 /* Don't allow SUBREGs that span more than a word. It can
15798 lead to spill failures when the register is one word out
15799 of a two word structure. */
15800 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15801 return NULL_RTX;
15802
15803 /* Allow only SUBREGs of non-eliminable hard registers. */
15804 if (register_no_elim_operand (reg, mode))
15805 return reg;
15806 }
15807
15808 /* Op is not a register. */
15809 return NULL_RTX;
15810 }
15811
15812 /* Recognizes RTL expressions that are valid memory addresses for an
15813 instruction. The MODE argument is the machine mode for the MEM
15814 expression that wants to use this address.
15815
15816 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15817 convert common non-canonical forms to canonical form so that they will
15818 be recognized. */
15819
15820 static bool
15821 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15822 {
15823 struct ix86_address parts;
15824 rtx base, index, disp;
15825 HOST_WIDE_INT scale;
15826 addr_space_t seg;
15827
15828 if (ix86_decompose_address (addr, &parts) <= 0)
15829 /* Decomposition failed. */
15830 return false;
15831
15832 base = parts.base;
15833 index = parts.index;
15834 disp = parts.disp;
15835 scale = parts.scale;
15836 seg = parts.seg;
15837
15838 /* Validate base register. */
15839 if (base)
15840 {
15841 rtx reg = ix86_validate_address_register (base);
15842
15843 if (reg == NULL_RTX)
15844 return false;
15845
15846 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15847 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15848 /* Base is not valid. */
15849 return false;
15850 }
15851
15852 /* Validate index register. */
15853 if (index)
15854 {
15855 rtx reg = ix86_validate_address_register (index);
15856
15857 if (reg == NULL_RTX)
15858 return false;
15859
15860 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15861 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15862 /* Index is not valid. */
15863 return false;
15864 }
15865
15866 /* Index and base should have the same mode. */
15867 if (base && index
15868 && GET_MODE (base) != GET_MODE (index))
15869 return false;
15870
15871 /* Address override works only on the (%reg) part of %fs:(%reg). */
15872 if (seg != ADDR_SPACE_GENERIC
15873 && ((base && GET_MODE (base) != word_mode)
15874 || (index && GET_MODE (index) != word_mode)))
15875 return false;
15876
15877 /* Validate scale factor. */
15878 if (scale != 1)
15879 {
15880 if (!index)
15881 /* Scale without index. */
15882 return false;
15883
15884 if (scale != 2 && scale != 4 && scale != 8)
15885 /* Scale is not a valid multiplier. */
15886 return false;
15887 }
15888
15889 /* Validate displacement. */
15890 if (disp)
15891 {
15892 if (GET_CODE (disp) == CONST
15893 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15894 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15895 switch (XINT (XEXP (disp, 0), 1))
15896 {
15897 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15898 when used. While the ABI also specifies 32bit relocations, we
15899 don't produce them at all and use IP relative addressing instead.
15900 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15901 should be loaded via the GOT. */
15902 case UNSPEC_GOT:
15903 if (!TARGET_64BIT
15904 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15905 goto is_legitimate_pic;
15906 /* FALLTHRU */
15907 case UNSPEC_GOTOFF:
15908 gcc_assert (flag_pic);
15909 if (!TARGET_64BIT)
15910 goto is_legitimate_pic;
15911
15912 /* 64bit address unspec. */
15913 return false;
15914
15915 case UNSPEC_GOTPCREL:
15916 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15917 goto is_legitimate_pic;
15918 /* FALLTHRU */
15919 case UNSPEC_PCREL:
15920 gcc_assert (flag_pic);
15921 goto is_legitimate_pic;
15922
15923 case UNSPEC_GOTTPOFF:
15924 case UNSPEC_GOTNTPOFF:
15925 case UNSPEC_INDNTPOFF:
15926 case UNSPEC_NTPOFF:
15927 case UNSPEC_DTPOFF:
15928 break;
15929
15930 case UNSPEC_STACK_CHECK:
15931 gcc_assert (flag_split_stack);
15932 break;
15933
15934 default:
15935 /* Invalid address unspec. */
15936 return false;
15937 }
15938
15939 else if (SYMBOLIC_CONST (disp)
15940 && (flag_pic
15941 || (TARGET_MACHO
15942 #if TARGET_MACHO
15943 && MACHOPIC_INDIRECT
15944 && !machopic_operand_p (disp)
15945 #endif
15946 )))
15947 {
15948
15949 is_legitimate_pic:
15950 if (TARGET_64BIT && (index || base))
15951 {
15952 /* foo@dtpoff(%rX) is ok. */
15953 if (GET_CODE (disp) != CONST
15954 || GET_CODE (XEXP (disp, 0)) != PLUS
15955 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15956 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15957 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15958 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15959 /* Non-constant pic memory reference. */
15960 return false;
15961 }
15962 else if ((!TARGET_MACHO || flag_pic)
15963 && ! legitimate_pic_address_disp_p (disp))
15964 /* Displacement is an invalid pic construct. */
15965 return false;
15966 #if TARGET_MACHO
15967 else if (MACHO_DYNAMIC_NO_PIC_P
15968 && !ix86_legitimate_constant_p (Pmode, disp))
15969 /* The displacement must be referenced via a non_lazy_pointer. */
15970 return false;
15971 #endif
15972
15973 /* This code used to verify that a symbolic pic displacement
15974 includes the pic_offset_table_rtx register.
15975
15976 While this is a good idea, unfortunately these constructs may
15977 be created by the "adds using lea" optimization for incorrect
15978 code like:
15979
15980 int a;
15981 int foo(int i)
15982 {
15983 return *(&a+i);
15984 }
15985
15986 This code is nonsensical, but results in addressing the
15987 GOT table with a pic_offset_table_rtx base. We can't
15988 just refuse it easily, since it gets matched by the
15989 "addsi3" pattern, which later gets split to lea when the
15990 output register differs from the input. While this
15991 could be handled by a separate addsi pattern for this case
15992 that never results in lea, disabling this test seems to be
15993 the easier and correct fix for the crash. */
15994 }
15995 else if (GET_CODE (disp) != LABEL_REF
15996 && !CONST_INT_P (disp)
15997 && (GET_CODE (disp) != CONST
15998 || !ix86_legitimate_constant_p (Pmode, disp))
15999 && (GET_CODE (disp) != SYMBOL_REF
16000 || !ix86_legitimate_constant_p (Pmode, disp)))
16001 /* Displacement is not constant. */
16002 return false;
16003 else if (TARGET_64BIT
16004 && !x86_64_immediate_operand (disp, VOIDmode))
16005 /* Displacement is out of range. */
16006 return false;
16007 /* In x32 mode, constant addresses are sign extended to 64bit, so
16008 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
16009 else if (TARGET_X32 && !(index || base)
16010 && CONST_INT_P (disp)
16011 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16012 return false;
16013 }
16014
16015 /* Everything looks valid. */
16016 return true;
16017 }
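/* Illustration, not from the original source: under the checks above,
   (plus (reg:SI bx) (mult (reg:SI ax) (const_int 4))), i.e. (%ebx,%eax,4),
   is accepted, whereas a scale of 3, a base or index that is not an SImode
   or DImode register, or mismatched base and index modes make the function
   return false.  */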
16018
16019 /* Determine if a given RTX is a valid constant address. */
16020
16021 bool
16022 constant_address_p (rtx x)
16023 {
16024 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16025 }
16026 \f
16027 /* Return a unique alias set for the GOT. */
16028
16029 static alias_set_type
16030 ix86_GOT_alias_set (void)
16031 {
16032 static alias_set_type set = -1;
16033 if (set == -1)
16034 set = new_alias_set ();
16035 return set;
16036 }
16037
16038 /* Return a legitimate reference for ORIG (an address) using the
16039 register REG. If REG is 0, a new pseudo is generated.
16040
16041 There are two types of references that must be handled:
16042
16043 1. Global data references must load the address from the GOT, via
16044 the PIC reg. An insn is emitted to do this load, and the reg is
16045 returned.
16046
16047 2. Static data references, constant pool addresses, and code labels
16048 compute the address as an offset from the GOT, whose base is in
16049 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16050 differentiate them from global data objects. The returned
16051 address is the PIC reg + an unspec constant.
16052
16053 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16054 reg also appears in the address. */
16055
16056 static rtx
16057 legitimize_pic_address (rtx orig, rtx reg)
16058 {
16059 rtx addr = orig;
16060 rtx new_rtx = orig;
16061
16062 #if TARGET_MACHO
16063 if (TARGET_MACHO && !TARGET_64BIT)
16064 {
16065 if (reg == 0)
16066 reg = gen_reg_rtx (Pmode);
16067 /* Use the generic Mach-O PIC machinery. */
16068 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16069 }
16070 #endif
16071
16072 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16073 {
16074 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16075 if (tmp)
16076 return tmp;
16077 }
16078
16079 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16080 new_rtx = addr;
16081 else if ((!TARGET_64BIT
16082 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16083 && !TARGET_PECOFF
16084 && gotoff_operand (addr, Pmode))
16085 {
16086 /* This symbol may be referenced via a displacement
16087 from the PIC base address (@GOTOFF). */
16088 if (GET_CODE (addr) == CONST)
16089 addr = XEXP (addr, 0);
16090
16091 if (GET_CODE (addr) == PLUS)
16092 {
16093 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16094 UNSPEC_GOTOFF);
16095 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16096 }
16097 else
16098 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16099
16100 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16101
16102 if (TARGET_64BIT)
16103 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16104
16105 if (reg != 0)
16106 {
16107 gcc_assert (REG_P (reg));
16108 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16109 new_rtx, reg, 1, OPTAB_DIRECT);
16110 }
16111 else
16112 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16113 }
16114 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16115 /* We can't use @GOTOFF for text labels
16116 on VxWorks, see gotoff_operand. */
16117 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16118 {
16119 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16120 if (tmp)
16121 return tmp;
16122
16123 /* For x64 PE-COFF there is no GOT table,
16124 so we use the address directly. */
16125 if (TARGET_64BIT && TARGET_PECOFF)
16126 {
16127 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16128 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16129 }
16130 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16131 {
16132 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16133 UNSPEC_GOTPCREL);
16134 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16135 new_rtx = gen_const_mem (Pmode, new_rtx);
16136 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16137 }
16138 else
16139 {
16140 /* This symbol must be referenced via a load
16141 from the Global Offset Table (@GOT). */
16142 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16143 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16144 if (TARGET_64BIT)
16145 new_rtx = force_reg (Pmode, new_rtx);
16146 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16147 new_rtx = gen_const_mem (Pmode, new_rtx);
16148 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16149 }
16150
16151 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16152 }
16153 else
16154 {
16155 if (CONST_INT_P (addr)
16156 && !x86_64_immediate_operand (addr, VOIDmode))
16157 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16158 else if (GET_CODE (addr) == CONST)
16159 {
16160 addr = XEXP (addr, 0);
16161
16162 /* We must match stuff we generate before. Assume the only
16163 unspecs that can get here are ours. Not that we could do
16164 anything with them anyway.... */
16165 if (GET_CODE (addr) == UNSPEC
16166 || (GET_CODE (addr) == PLUS
16167 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16168 return orig;
16169 gcc_assert (GET_CODE (addr) == PLUS);
16170 }
16171
16172 if (GET_CODE (addr) == PLUS)
16173 {
16174 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16175
16176 /* Check first to see if this is a constant
16177 offset from a @GOTOFF symbol reference. */
16178 if (!TARGET_PECOFF
16179 && gotoff_operand (op0, Pmode)
16180 && CONST_INT_P (op1))
16181 {
16182 if (!TARGET_64BIT)
16183 {
16184 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16185 UNSPEC_GOTOFF);
16186 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16187 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16188
16189 if (reg != 0)
16190 {
16191 gcc_assert (REG_P (reg));
16192 new_rtx = expand_simple_binop (Pmode, PLUS,
16193 pic_offset_table_rtx,
16194 new_rtx, reg, 1,
16195 OPTAB_DIRECT);
16196 }
16197 else
16198 new_rtx
16199 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16200 }
16201 else
16202 {
16203 if (INTVAL (op1) < -16*1024*1024
16204 || INTVAL (op1) >= 16*1024*1024)
16205 {
16206 if (!x86_64_immediate_operand (op1, Pmode))
16207 op1 = force_reg (Pmode, op1);
16208
16209 new_rtx
16210 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16211 }
16212 }
16213 }
16214 else
16215 {
16216 rtx base = legitimize_pic_address (op0, reg);
16217 machine_mode mode = GET_MODE (base);
16218 new_rtx
16219 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16220
16221 if (CONST_INT_P (new_rtx))
16222 {
16223 if (INTVAL (new_rtx) < -16*1024*1024
16224 || INTVAL (new_rtx) >= 16*1024*1024)
16225 {
16226 if (!x86_64_immediate_operand (new_rtx, mode))
16227 new_rtx = force_reg (mode, new_rtx);
16228
16229 new_rtx
16230 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16231 }
16232 else
16233 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16234 }
16235 else
16236 {
16237 /* For %rip addressing, we have to use
16238 just disp32, with no base or index. */
16239 if (TARGET_64BIT
16240 && (GET_CODE (base) == SYMBOL_REF
16241 || GET_CODE (base) == LABEL_REF))
16242 base = force_reg (mode, base);
16243 if (GET_CODE (new_rtx) == PLUS
16244 && CONSTANT_P (XEXP (new_rtx, 1)))
16245 {
16246 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16247 new_rtx = XEXP (new_rtx, 1);
16248 }
16249 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16250 }
16251 }
16252 }
16253 }
16254 return new_rtx;
16255 }
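/* Worked example, a sketch rather than text from the original source: for
   32-bit -fpic code referencing a preemptible global "foo", the UNSPEC_GOT
   branch above builds roughly
   (mem (plus (reg pic) (const (unspec [foo] UNSPEC_GOT)))),
   i.e. a load from foo@GOT relative to the PIC register, and copies it
   into a register; a local symbol instead takes the earlier @GOTOFF path,
   which adds an offset to the PIC register without a memory load.  */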
16256 \f
16257 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16258
16259 static rtx
16260 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16261 {
16262 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16263
16264 if (GET_MODE (tp) != tp_mode)
16265 {
16266 gcc_assert (GET_MODE (tp) == SImode);
16267 gcc_assert (tp_mode == DImode);
16268
16269 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16270 }
16271
16272 if (to_reg)
16273 tp = copy_to_mode_reg (tp_mode, tp);
16274
16275 return tp;
16276 }
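/* Note for illustration, not from the original source: the UNSPEC_TP value
   built here is what later prints as a thread-pointer segment access,
   e.g. %gs:0 on 32-bit and %fs:0 on 64-bit GNU/Linux targets.  */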
16277
16278 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16279
16280 static GTY(()) rtx ix86_tls_symbol;
16281
16282 static rtx
16283 ix86_tls_get_addr (void)
16284 {
16285 if (!ix86_tls_symbol)
16286 {
16287 const char *sym
16288 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16289 ? "___tls_get_addr" : "__tls_get_addr");
16290
16291 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16292 }
16293
16294 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16295 {
16296 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16297 UNSPEC_PLTOFF);
16298 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16299 gen_rtx_CONST (Pmode, unspec));
16300 }
16301
16302 return ix86_tls_symbol;
16303 }
16304
16305 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16306
16307 static GTY(()) rtx ix86_tls_module_base_symbol;
16308
16309 rtx
16310 ix86_tls_module_base (void)
16311 {
16312 if (!ix86_tls_module_base_symbol)
16313 {
16314 ix86_tls_module_base_symbol
16315 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16316
16317 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16318 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16319 }
16320
16321 return ix86_tls_module_base_symbol;
16322 }
16323
16324 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16325 false if we expect this to be used for a memory address and true if
16326 we expect to load the address into a register. */
16327
16328 static rtx
16329 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16330 {
16331 rtx dest, base, off;
16332 rtx pic = NULL_RTX, tp = NULL_RTX;
16333 machine_mode tp_mode = Pmode;
16334 int type;
16335
16336 /* Fall back to the global dynamic model if the tool chain cannot
16337 support local dynamic. */
16338 if (TARGET_SUN_TLS && !TARGET_64BIT
16339 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16340 && model == TLS_MODEL_LOCAL_DYNAMIC)
16341 model = TLS_MODEL_GLOBAL_DYNAMIC;
16342
16343 switch (model)
16344 {
16345 case TLS_MODEL_GLOBAL_DYNAMIC:
16346 dest = gen_reg_rtx (Pmode);
16347
16348 if (!TARGET_64BIT)
16349 {
16350 if (flag_pic && !TARGET_PECOFF)
16351 pic = pic_offset_table_rtx;
16352 else
16353 {
16354 pic = gen_reg_rtx (Pmode);
16355 emit_insn (gen_set_got (pic));
16356 }
16357 }
16358
16359 if (TARGET_GNU2_TLS)
16360 {
16361 if (TARGET_64BIT)
16362 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16363 else
16364 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16365
16366 tp = get_thread_pointer (Pmode, true);
16367 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16368
16369 if (GET_MODE (x) != Pmode)
16370 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16371
16372 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16373 }
16374 else
16375 {
16376 rtx caddr = ix86_tls_get_addr ();
16377
16378 if (TARGET_64BIT)
16379 {
16380 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16381 rtx_insn *insns;
16382
16383 start_sequence ();
16384 emit_call_insn
16385 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16386 insns = get_insns ();
16387 end_sequence ();
16388
16389 if (GET_MODE (x) != Pmode)
16390 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16391
16392 RTL_CONST_CALL_P (insns) = 1;
16393 emit_libcall_block (insns, dest, rax, x);
16394 }
16395 else
16396 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16397 }
16398 break;
16399
16400 case TLS_MODEL_LOCAL_DYNAMIC:
16401 base = gen_reg_rtx (Pmode);
16402
16403 if (!TARGET_64BIT)
16404 {
16405 if (flag_pic)
16406 pic = pic_offset_table_rtx;
16407 else
16408 {
16409 pic = gen_reg_rtx (Pmode);
16410 emit_insn (gen_set_got (pic));
16411 }
16412 }
16413
16414 if (TARGET_GNU2_TLS)
16415 {
16416 rtx tmp = ix86_tls_module_base ();
16417
16418 if (TARGET_64BIT)
16419 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16420 else
16421 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16422
16423 tp = get_thread_pointer (Pmode, true);
16424 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16425 gen_rtx_MINUS (Pmode, tmp, tp));
16426 }
16427 else
16428 {
16429 rtx caddr = ix86_tls_get_addr ();
16430
16431 if (TARGET_64BIT)
16432 {
16433 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16434 rtx_insn *insns;
16435 rtx eqv;
16436
16437 start_sequence ();
16438 emit_call_insn
16439 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16440 insns = get_insns ();
16441 end_sequence ();
16442
16443 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16444 share the LD_BASE result with other LD model accesses. */
16445 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16446 UNSPEC_TLS_LD_BASE);
16447
16448 RTL_CONST_CALL_P (insns) = 1;
16449 emit_libcall_block (insns, base, rax, eqv);
16450 }
16451 else
16452 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16453 }
16454
16455 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16456 off = gen_rtx_CONST (Pmode, off);
16457
16458 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16459
16460 if (TARGET_GNU2_TLS)
16461 {
16462 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16463
16464 if (GET_MODE (x) != Pmode)
16465 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16466
16467 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16468 }
16469 break;
16470
16471 case TLS_MODEL_INITIAL_EXEC:
16472 if (TARGET_64BIT)
16473 {
16474 if (TARGET_SUN_TLS && !TARGET_X32)
16475 {
16476 /* The Sun linker took the AMD64 TLS spec literally
16477 and can only handle %rax as the destination of the
16478 initial executable code sequence. */
16479
16480 dest = gen_reg_rtx (DImode);
16481 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16482 return dest;
16483 }
16484
16485 /* Generate DImode references to avoid %fs:(%reg32)
16486 problems and the linker IE->LE relaxation bug. */
16487 tp_mode = DImode;
16488 pic = NULL;
16489 type = UNSPEC_GOTNTPOFF;
16490 }
16491 else if (flag_pic)
16492 {
16493 pic = pic_offset_table_rtx;
16494 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16495 }
16496 else if (!TARGET_ANY_GNU_TLS)
16497 {
16498 pic = gen_reg_rtx (Pmode);
16499 emit_insn (gen_set_got (pic));
16500 type = UNSPEC_GOTTPOFF;
16501 }
16502 else
16503 {
16504 pic = NULL;
16505 type = UNSPEC_INDNTPOFF;
16506 }
16507
16508 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16509 off = gen_rtx_CONST (tp_mode, off);
16510 if (pic)
16511 off = gen_rtx_PLUS (tp_mode, pic, off);
16512 off = gen_const_mem (tp_mode, off);
16513 set_mem_alias_set (off, ix86_GOT_alias_set ());
16514
16515 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16516 {
16517 base = get_thread_pointer (tp_mode,
16518 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16519 off = force_reg (tp_mode, off);
16520 dest = gen_rtx_PLUS (tp_mode, base, off);
16521 if (tp_mode != Pmode)
16522 dest = convert_to_mode (Pmode, dest, 1);
16523 }
16524 else
16525 {
16526 base = get_thread_pointer (Pmode, true);
16527 dest = gen_reg_rtx (Pmode);
16528 emit_insn (ix86_gen_sub3 (dest, base, off));
16529 }
16530 break;
16531
16532 case TLS_MODEL_LOCAL_EXEC:
16533 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16534 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16535 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16536 off = gen_rtx_CONST (Pmode, off);
16537
16538 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16539 {
16540 base = get_thread_pointer (Pmode,
16541 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16542 return gen_rtx_PLUS (Pmode, base, off);
16543 }
16544 else
16545 {
16546 base = get_thread_pointer (Pmode, true);
16547 dest = gen_reg_rtx (Pmode);
16548 emit_insn (ix86_gen_sub3 (dest, base, off));
16549 }
16550 break;
16551
16552 default:
16553 gcc_unreachable ();
16554 }
16555
16556 return dest;
16557 }
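/* Worked example, a sketch rather than text from the original source: for
   a local-exec access to a __thread variable "t" on a 64-bit GNU/Linux
   target, the TLS_MODEL_LOCAL_EXEC case above yields
   (plus (unspec [0] UNSPEC_TP) (const (unspec [t] UNSPEC_NTPOFF))),
   which is ultimately addressed as %fs:t@tpoff.  */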
16558
16559 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16560 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16561 unique refptr-DECL symbol corresponding to symbol DECL. */
16562
16563 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16564 {
16565 static inline hashval_t hash (tree_map *m) { return m->hash; }
16566 static inline bool
16567 equal (tree_map *a, tree_map *b)
16568 {
16569 return a->base.from == b->base.from;
16570 }
16571
16572 static int
16573 keep_cache_entry (tree_map *&m)
16574 {
16575 return ggc_marked_p (m->base.from);
16576 }
16577 };
16578
16579 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16580
16581 static tree
16582 get_dllimport_decl (tree decl, bool beimport)
16583 {
16584 struct tree_map *h, in;
16585 const char *name;
16586 const char *prefix;
16587 size_t namelen, prefixlen;
16588 char *imp_name;
16589 tree to;
16590 rtx rtl;
16591
16592 if (!dllimport_map)
16593 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16594
16595 in.hash = htab_hash_pointer (decl);
16596 in.base.from = decl;
16597 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16598 h = *loc;
16599 if (h)
16600 return h->to;
16601
16602 *loc = h = ggc_alloc<tree_map> ();
16603 h->hash = in.hash;
16604 h->base.from = decl;
16605 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16606 VAR_DECL, NULL, ptr_type_node);
16607 DECL_ARTIFICIAL (to) = 1;
16608 DECL_IGNORED_P (to) = 1;
16609 DECL_EXTERNAL (to) = 1;
16610 TREE_READONLY (to) = 1;
16611
16612 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16613 name = targetm.strip_name_encoding (name);
16614 if (beimport)
16615 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16616 ? "*__imp_" : "*__imp__";
16617 else
16618 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16619 namelen = strlen (name);
16620 prefixlen = strlen (prefix);
16621 imp_name = (char *) alloca (namelen + prefixlen + 1);
16622 memcpy (imp_name, prefix, prefixlen);
16623 memcpy (imp_name + prefixlen, name, namelen + 1);
16624
16625 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16626 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16627 SET_SYMBOL_REF_DECL (rtl, to);
16628 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16629 if (!beimport)
16630 {
16631 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16632 #ifdef SUB_TARGET_RECORD_STUB
16633 SUB_TARGET_RECORD_STUB (name);
16634 #endif
16635 }
16636
16637 rtl = gen_const_mem (Pmode, rtl);
16638 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16639
16640 SET_DECL_RTL (to, rtl);
16641 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16642
16643 return to;
16644 }
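/* For example, an assumption about typical mingw naming rather than text
   from the original source: for a dllimported function "foo" this creates
   an artificial decl whose RTL is a load from "*__imp__foo" (or
   "*__imp_foo" when there is no user label prefix), i.e. the import-table
   slot for the symbol.  */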
16645
16646 /* Expand SYMBOL into its corresponding far-address symbol.
16647 WANT_REG is true if we require the result be a register. */
16648
16649 static rtx
16650 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16651 {
16652 tree imp_decl;
16653 rtx x;
16654
16655 gcc_assert (SYMBOL_REF_DECL (symbol));
16656 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16657
16658 x = DECL_RTL (imp_decl);
16659 if (want_reg)
16660 x = force_reg (Pmode, x);
16661 return x;
16662 }
16663
16664 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16665 true if we require the result be a register. */
16666
16667 static rtx
16668 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16669 {
16670 tree imp_decl;
16671 rtx x;
16672
16673 gcc_assert (SYMBOL_REF_DECL (symbol));
16674 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16675
16676 x = DECL_RTL (imp_decl);
16677 if (want_reg)
16678 x = force_reg (Pmode, x);
16679 return x;
16680 }
16681
16682 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16683 is true if we require the result to be a register. */
16684
16685 static rtx
16686 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16687 {
16688 if (!TARGET_PECOFF)
16689 return NULL_RTX;
16690
16691 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16692 {
16693 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16694 return legitimize_dllimport_symbol (addr, inreg);
16695 if (GET_CODE (addr) == CONST
16696 && GET_CODE (XEXP (addr, 0)) == PLUS
16697 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16698 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16699 {
16700 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16701 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16702 }
16703 }
16704
16705 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16706 return NULL_RTX;
16707 if (GET_CODE (addr) == SYMBOL_REF
16708 && !is_imported_p (addr)
16709 && SYMBOL_REF_EXTERNAL_P (addr)
16710 && SYMBOL_REF_DECL (addr))
16711 return legitimize_pe_coff_extern_decl (addr, inreg);
16712
16713 if (GET_CODE (addr) == CONST
16714 && GET_CODE (XEXP (addr, 0)) == PLUS
16715 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16716 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16717 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16718 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16719 {
16720 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16721 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16722 }
16723 return NULL_RTX;
16724 }
16725
16726 /* Try machine-dependent ways of modifying an illegitimate address
16727 to be legitimate. If we find one, return the new, valid address.
16728 This macro is used in only one place: `memory_address' in explow.c.
16729
16730 OLDX is the address as it was before break_out_memory_refs was called.
16731 In some cases it is useful to look at this to decide what needs to be done.
16732
16733 It is always safe for this macro to do nothing. It exists to recognize
16734 opportunities to optimize the output.
16735
16736 For the 80386, we handle X+REG by loading X into a register R and
16737 using R+REG. R will go in a general reg and indexing will be used.
16738 However, if REG is a broken-out memory address or multiplication,
16739 nothing needs to be done because REG can certainly go in a general reg.
16740
16741 When -fpic is used, special handling is needed for symbolic references.
16742 See comments by legitimize_pic_address in i386.c for details. */
16743
16744 static rtx
16745 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16746 {
16747 bool changed = false;
16748 unsigned log;
16749
16750 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16751 if (log)
16752 return legitimize_tls_address (x, (enum tls_model) log, false);
16753 if (GET_CODE (x) == CONST
16754 && GET_CODE (XEXP (x, 0)) == PLUS
16755 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16756 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16757 {
16758 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16759 (enum tls_model) log, false);
16760 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16761 }
16762
16763 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16764 {
16765 rtx tmp = legitimize_pe_coff_symbol (x, true);
16766 if (tmp)
16767 return tmp;
16768 }
16769
16770 if (flag_pic && SYMBOLIC_CONST (x))
16771 return legitimize_pic_address (x, 0);
16772
16773 #if TARGET_MACHO
16774 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16775 return machopic_indirect_data_reference (x, 0);
16776 #endif
16777
16778 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16779 if (GET_CODE (x) == ASHIFT
16780 && CONST_INT_P (XEXP (x, 1))
16781 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16782 {
16783 changed = true;
16784 log = INTVAL (XEXP (x, 1));
16785 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16786 GEN_INT (1 << log));
16787 }
16788
16789 if (GET_CODE (x) == PLUS)
16790 {
16791 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16792
16793 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16794 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16795 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16796 {
16797 changed = true;
16798 log = INTVAL (XEXP (XEXP (x, 0), 1));
16799 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16800 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16801 GEN_INT (1 << log));
16802 }
16803
16804 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16805 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16806 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16807 {
16808 changed = true;
16809 log = INTVAL (XEXP (XEXP (x, 1), 1));
16810 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16811 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16812 GEN_INT (1 << log));
16813 }
16814
16815 /* Put multiply first if it isn't already. */
16816 if (GET_CODE (XEXP (x, 1)) == MULT)
16817 {
16818 std::swap (XEXP (x, 0), XEXP (x, 1));
16819 changed = true;
16820 }
16821
16822 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16823 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16824 created by virtual register instantiation, register elimination, and
16825 similar optimizations. */
16826 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16827 {
16828 changed = true;
16829 x = gen_rtx_PLUS (Pmode,
16830 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16831 XEXP (XEXP (x, 1), 0)),
16832 XEXP (XEXP (x, 1), 1));
16833 }
16834
16835 /* Canonicalize
16836 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16837 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16838 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16839 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16840 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16841 && CONSTANT_P (XEXP (x, 1)))
16842 {
16843 rtx constant;
16844 rtx other = NULL_RTX;
16845
16846 if (CONST_INT_P (XEXP (x, 1)))
16847 {
16848 constant = XEXP (x, 1);
16849 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16850 }
16851 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16852 {
16853 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16854 other = XEXP (x, 1);
16855 }
16856 else
16857 constant = 0;
16858
16859 if (constant)
16860 {
16861 changed = true;
16862 x = gen_rtx_PLUS (Pmode,
16863 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16864 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16865 plus_constant (Pmode, other,
16866 INTVAL (constant)));
16867 }
16868 }
16869
16870 if (changed && ix86_legitimate_address_p (mode, x, false))
16871 return x;
16872
16873 if (GET_CODE (XEXP (x, 0)) == MULT)
16874 {
16875 changed = true;
16876 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16877 }
16878
16879 if (GET_CODE (XEXP (x, 1)) == MULT)
16880 {
16881 changed = true;
16882 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16883 }
16884
16885 if (changed
16886 && REG_P (XEXP (x, 1))
16887 && REG_P (XEXP (x, 0)))
16888 return x;
16889
16890 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16891 {
16892 changed = true;
16893 x = legitimize_pic_address (x, 0);
16894 }
16895
16896 if (changed && ix86_legitimate_address_p (mode, x, false))
16897 return x;
16898
16899 if (REG_P (XEXP (x, 0)))
16900 {
16901 rtx temp = gen_reg_rtx (Pmode);
16902 rtx val = force_operand (XEXP (x, 1), temp);
16903 if (val != temp)
16904 {
16905 val = convert_to_mode (Pmode, val, 1);
16906 emit_move_insn (temp, val);
16907 }
16908
16909 XEXP (x, 1) = temp;
16910 return x;
16911 }
16912
16913 else if (REG_P (XEXP (x, 1)))
16914 {
16915 rtx temp = gen_reg_rtx (Pmode);
16916 rtx val = force_operand (XEXP (x, 0), temp);
16917 if (val != temp)
16918 {
16919 val = convert_to_mode (Pmode, val, 1);
16920 emit_move_insn (temp, val);
16921 }
16922
16923 XEXP (x, 0) = temp;
16924 return x;
16925 }
16926 }
16927
16928 return x;
16929 }
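/* Illustration, not from the original source: given the non-canonical
   address (plus (reg) (ashift (reg) (const_int 2))), the code above first
   rewrites the shift as (mult (reg) (const_int 4)) and then swaps the
   multiply to the front, producing the canonical
   (plus (mult (reg) (const_int 4)) (reg)) form that
   ix86_decompose_address recognizes as (%reg1,%reg2,4).  */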
16930 \f
16931 /* Print an integer constant expression in assembler syntax. Addition
16932 and subtraction are the only arithmetic that may appear in these
16933 expressions. FILE is the stdio stream to write to, X is the rtx, and
16934 CODE is the operand print code from the output string. */
16935
16936 static void
16937 output_pic_addr_const (FILE *file, rtx x, int code)
16938 {
16939 char buf[256];
16940
16941 switch (GET_CODE (x))
16942 {
16943 case PC:
16944 gcc_assert (flag_pic);
16945 putc ('.', file);
16946 break;
16947
16948 case SYMBOL_REF:
16949 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16950 output_addr_const (file, x);
16951 else
16952 {
16953 const char *name = XSTR (x, 0);
16954
16955 /* Mark the decl as referenced so that cgraph will
16956 output the function. */
16957 if (SYMBOL_REF_DECL (x))
16958 mark_decl_referenced (SYMBOL_REF_DECL (x));
16959
16960 #if TARGET_MACHO
16961 if (MACHOPIC_INDIRECT
16962 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16963 name = machopic_indirection_name (x, /*stub_p=*/true);
16964 #endif
16965 assemble_name (file, name);
16966 }
16967 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16968 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16969 fputs ("@PLT", file);
16970 break;
16971
16972 case LABEL_REF:
16973 x = XEXP (x, 0);
16974 /* FALLTHRU */
16975 case CODE_LABEL:
16976 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16977 assemble_name (asm_out_file, buf);
16978 break;
16979
16980 case CONST_INT:
16981 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16982 break;
16983
16984 case CONST:
16985 /* This used to output parentheses around the expression,
16986 but that does not work on the 386 (either ATT or BSD assembler). */
16987 output_pic_addr_const (file, XEXP (x, 0), code);
16988 break;
16989
16990 case CONST_DOUBLE:
16991 /* We can't handle floating point constants;
16992 TARGET_PRINT_OPERAND must handle them. */
16993 output_operand_lossage ("floating constant misused");
16994 break;
16995
16996 case PLUS:
16997 /* Some assemblers need integer constants to appear first. */
16998 if (CONST_INT_P (XEXP (x, 0)))
16999 {
17000 output_pic_addr_const (file, XEXP (x, 0), code);
17001 putc ('+', file);
17002 output_pic_addr_const (file, XEXP (x, 1), code);
17003 }
17004 else
17005 {
17006 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17007 output_pic_addr_const (file, XEXP (x, 1), code);
17008 putc ('+', file);
17009 output_pic_addr_const (file, XEXP (x, 0), code);
17010 }
17011 break;
17012
17013 case MINUS:
17014 if (!TARGET_MACHO)
17015 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17016 output_pic_addr_const (file, XEXP (x, 0), code);
17017 putc ('-', file);
17018 output_pic_addr_const (file, XEXP (x, 1), code);
17019 if (!TARGET_MACHO)
17020 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17021 break;
17022
17023 case UNSPEC:
17024 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17025 {
17026 bool f = i386_asm_output_addr_const_extra (file, x);
17027 gcc_assert (f);
17028 break;
17029 }
17030
17031 gcc_assert (XVECLEN (x, 0) == 1);
17032 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17033 switch (XINT (x, 1))
17034 {
17035 case UNSPEC_GOT:
17036 fputs ("@GOT", file);
17037 break;
17038 case UNSPEC_GOTOFF:
17039 fputs ("@GOTOFF", file);
17040 break;
17041 case UNSPEC_PLTOFF:
17042 fputs ("@PLTOFF", file);
17043 break;
17044 case UNSPEC_PCREL:
17045 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17046 "(%rip)" : "[rip]", file);
17047 break;
17048 case UNSPEC_GOTPCREL:
17049 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17050 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17051 break;
17052 case UNSPEC_GOTTPOFF:
17053 /* FIXME: This might be @TPOFF in Sun ld too. */
17054 fputs ("@gottpoff", file);
17055 break;
17056 case UNSPEC_TPOFF:
17057 fputs ("@tpoff", file);
17058 break;
17059 case UNSPEC_NTPOFF:
17060 if (TARGET_64BIT)
17061 fputs ("@tpoff", file);
17062 else
17063 fputs ("@ntpoff", file);
17064 break;
17065 case UNSPEC_DTPOFF:
17066 fputs ("@dtpoff", file);
17067 break;
17068 case UNSPEC_GOTNTPOFF:
17069 if (TARGET_64BIT)
17070 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17071 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17072 else
17073 fputs ("@gotntpoff", file);
17074 break;
17075 case UNSPEC_INDNTPOFF:
17076 fputs ("@indntpoff", file);
17077 break;
17078 #if TARGET_MACHO
17079 case UNSPEC_MACHOPIC_OFFSET:
17080 putc ('-', file);
17081 machopic_output_function_base_name (file);
17082 break;
17083 #endif
17084 default:
17085 output_operand_lossage ("invalid UNSPEC as operand");
17086 break;
17087 }
17088 break;
17089
17090 default:
17091 output_operand_lossage ("invalid expression as operand");
17092 }
17093 }
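/* Example of the output produced above, illustrative rather than from the
   original source: a (const (unspec [foo] UNSPEC_GOTOFF)) operand prints
   as "foo@GOTOFF", and with operand code 'P' a non-local SYMBOL_REF gains
   an "@PLT" suffix on ELF targets.  */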
17094
17095 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17096 We need to emit DTP-relative relocations. */
17097
17098 static void ATTRIBUTE_UNUSED
17099 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17100 {
17101 fputs (ASM_LONG, file);
17102 output_addr_const (file, x);
17103 fputs ("@dtpoff", file);
17104 switch (size)
17105 {
17106 case 4:
17107 break;
17108 case 8:
17109 fputs (", 0", file);
17110 break;
17111 default:
17112 gcc_unreachable ();
17113 }
17114 }
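/* For illustration, assuming ASM_LONG expands to ".long" as it usually
   does on this target; not from the original source: a 4-byte DTP-relative
   reference to "x" is emitted as
       .long x@dtpoff
   and an 8-byte one as
       .long x@dtpoff, 0
   with the upper half filled by zero.  */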
17115
17116 /* Return true if X is a representation of the PIC register. This copes
17117 with calls from ix86_find_base_term, where the register might have
17118 been replaced by a cselib value. */
17119
17120 static bool
17121 ix86_pic_register_p (rtx x)
17122 {
17123 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17124 return (pic_offset_table_rtx
17125 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17126 else if (!REG_P (x))
17127 return false;
17128 else if (pic_offset_table_rtx)
17129 {
17130 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17131 return true;
17132 if (HARD_REGISTER_P (x)
17133 && !HARD_REGISTER_P (pic_offset_table_rtx)
17134 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17135 return true;
17136 return false;
17137 }
17138 else
17139 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17140 }
17141
17142 /* Helper function for ix86_delegitimize_address.
17143 Attempt to delegitimize TLS local-exec accesses. */
17144
17145 static rtx
17146 ix86_delegitimize_tls_address (rtx orig_x)
17147 {
17148 rtx x = orig_x, unspec;
17149 struct ix86_address addr;
17150
17151 if (!TARGET_TLS_DIRECT_SEG_REFS)
17152 return orig_x;
17153 if (MEM_P (x))
17154 x = XEXP (x, 0);
17155 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17156 return orig_x;
17157 if (ix86_decompose_address (x, &addr) == 0
17158 || addr.seg != DEFAULT_TLS_SEG_REG
17159 || addr.disp == NULL_RTX
17160 || GET_CODE (addr.disp) != CONST)
17161 return orig_x;
17162 unspec = XEXP (addr.disp, 0);
17163 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17164 unspec = XEXP (unspec, 0);
17165 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17166 return orig_x;
17167 x = XVECEXP (unspec, 0, 0);
17168 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17169 if (unspec != XEXP (addr.disp, 0))
17170 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17171 if (addr.index)
17172 {
17173 rtx idx = addr.index;
17174 if (addr.scale != 1)
17175 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17176 x = gen_rtx_PLUS (Pmode, idx, x);
17177 }
17178 if (addr.base)
17179 x = gen_rtx_PLUS (Pmode, addr.base, x);
17180 if (MEM_P (orig_x))
17181 x = replace_equiv_address_nv (orig_x, x);
17182 return x;
17183 }
17184
17185 /* In the name of slightly smaller debug output, and to cater to
17186 general assembler lossage, recognize PIC+GOTOFF and turn it back
17187 into a direct symbol reference.
17188
17189 On Darwin, this is necessary to avoid a crash, because Darwin
17190 has a different PIC label for each routine but the DWARF debugging
17191 information is not associated with any particular routine, so it's
17192 necessary to remove references to the PIC label from RTL stored by
17193 the DWARF output code. */
17194
17195 static rtx
17196 ix86_delegitimize_address (rtx x)
17197 {
17198 rtx orig_x = delegitimize_mem_from_attrs (x);
17199 /* addend is NULL or some rtx if x is something+GOTOFF where
17200 something doesn't include the PIC register. */
17201 rtx addend = NULL_RTX;
17202 /* reg_addend is NULL or a multiple of some register. */
17203 rtx reg_addend = NULL_RTX;
17204 /* const_addend is NULL or a const_int. */
17205 rtx const_addend = NULL_RTX;
17206 /* This is the result, or NULL. */
17207 rtx result = NULL_RTX;
17208
17209 x = orig_x;
17210
17211 if (MEM_P (x))
17212 x = XEXP (x, 0);
17213
17214 if (TARGET_64BIT)
17215 {
17216 if (GET_CODE (x) == CONST
17217 && GET_CODE (XEXP (x, 0)) == PLUS
17218 && GET_MODE (XEXP (x, 0)) == Pmode
17219 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17220 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17221 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17222 {
17223 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17224 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17225 if (MEM_P (orig_x))
17226 x = replace_equiv_address_nv (orig_x, x);
17227 return x;
17228 }
17229
17230 if (GET_CODE (x) == CONST
17231 && GET_CODE (XEXP (x, 0)) == UNSPEC
17232 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17233 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17234 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17235 {
17236 x = XVECEXP (XEXP (x, 0), 0, 0);
17237 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17238 {
17239 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17240 if (x == NULL_RTX)
17241 return orig_x;
17242 }
17243 return x;
17244 }
17245
17246 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17247 return ix86_delegitimize_tls_address (orig_x);
17248
17249 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17250 and -mcmodel=medium -fpic. */
17251 }
17252
17253 if (GET_CODE (x) != PLUS
17254 || GET_CODE (XEXP (x, 1)) != CONST)
17255 return ix86_delegitimize_tls_address (orig_x);
17256
17257 if (ix86_pic_register_p (XEXP (x, 0)))
17258 /* %ebx + GOT/GOTOFF */
17259 ;
17260 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17261 {
17262 /* %ebx + %reg * scale + GOT/GOTOFF */
17263 reg_addend = XEXP (x, 0);
17264 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17265 reg_addend = XEXP (reg_addend, 1);
17266 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17267 reg_addend = XEXP (reg_addend, 0);
17268 else
17269 {
17270 reg_addend = NULL_RTX;
17271 addend = XEXP (x, 0);
17272 }
17273 }
17274 else
17275 addend = XEXP (x, 0);
17276
17277 x = XEXP (XEXP (x, 1), 0);
17278 if (GET_CODE (x) == PLUS
17279 && CONST_INT_P (XEXP (x, 1)))
17280 {
17281 const_addend = XEXP (x, 1);
17282 x = XEXP (x, 0);
17283 }
17284
17285 if (GET_CODE (x) == UNSPEC
17286 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17287 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17288 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17289 && !MEM_P (orig_x) && !addend)))
17290 result = XVECEXP (x, 0, 0);
17291
17292 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17293 && !MEM_P (orig_x))
17294 result = XVECEXP (x, 0, 0);
17295
17296 if (! result)
17297 return ix86_delegitimize_tls_address (orig_x);
17298
17299 if (const_addend)
17300 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17301 if (reg_addend)
17302 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17303 if (addend)
17304 {
17305 /* If the rest of original X doesn't involve the PIC register, add
17306 addend and subtract pic_offset_table_rtx. This can happen e.g.
17307 for code like:
17308 leal (%ebx, %ecx, 4), %ecx
17309 ...
17310 movl foo@GOTOFF(%ecx), %edx
17311 in which case we return (%ecx - %ebx) + foo
17312 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17313 and reload has completed. */
17314 if (pic_offset_table_rtx
17315 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17316 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17317 pic_offset_table_rtx),
17318 result);
17319 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17320 {
17321 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17322 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17323 result = gen_rtx_PLUS (Pmode, tmp, result);
17324 }
17325 else
17326 return orig_x;
17327 }
17328 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17329 {
17330 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17331 if (result == NULL_RTX)
17332 return orig_x;
17333 }
17334 return result;
17335 }
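
/* A minimal example of the common -m32 case handled above: the legitimized
   form (plus (reg:SI ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is turned back into the bare symbol_ref "foo" when the operand is not a
   MEM, so the DWARF output refers to the symbol rather than to the PIC
   base register.  */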
17336
17337 /* If X is a machine specific address (i.e. a symbol or label being
17338 referenced as a displacement from the GOT implemented using an
17339 UNSPEC), then return the base term. Otherwise return X. */
17340
17341 rtx
17342 ix86_find_base_term (rtx x)
17343 {
17344 rtx term;
17345
17346 if (TARGET_64BIT)
17347 {
17348 if (GET_CODE (x) != CONST)
17349 return x;
17350 term = XEXP (x, 0);
17351 if (GET_CODE (term) == PLUS
17352 && CONST_INT_P (XEXP (term, 1)))
17353 term = XEXP (term, 0);
17354 if (GET_CODE (term) != UNSPEC
17355 || (XINT (term, 1) != UNSPEC_GOTPCREL
17356 && XINT (term, 1) != UNSPEC_PCREL))
17357 return x;
17358
17359 return XVECEXP (term, 0, 0);
17360 }
17361
17362 return ix86_delegitimize_address (x);
17363 }
17364 \f
17365 static void
17366 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17367 bool fp, FILE *file)
17368 {
17369 const char *suffix;
17370
17371 if (mode == CCFPmode || mode == CCFPUmode)
17372 {
17373 code = ix86_fp_compare_code_to_integer (code);
17374 mode = CCmode;
17375 }
17376 if (reverse)
17377 code = reverse_condition (code);
17378
17379 switch (code)
17380 {
17381 case EQ:
17382 switch (mode)
17383 {
17384 case CCAmode:
17385 suffix = "a";
17386 break;
17387 case CCCmode:
17388 suffix = "c";
17389 break;
17390 case CCOmode:
17391 suffix = "o";
17392 break;
17393 case CCPmode:
17394 suffix = "p";
17395 break;
17396 case CCSmode:
17397 suffix = "s";
17398 break;
17399 default:
17400 suffix = "e";
17401 break;
17402 }
17403 break;
17404 case NE:
17405 switch (mode)
17406 {
17407 case CCAmode:
17408 suffix = "na";
17409 break;
17410 case CCCmode:
17411 suffix = "nc";
17412 break;
17413 case CCOmode:
17414 suffix = "no";
17415 break;
17416 case CCPmode:
17417 suffix = "np";
17418 break;
17419 case CCSmode:
17420 suffix = "ns";
17421 break;
17422 default:
17423 suffix = "ne";
17424 break;
17425 }
17426 break;
17427 case GT:
17428 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17429 suffix = "g";
17430 break;
17431 case GTU:
17432 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17433 Those same assemblers have the same but opposite lossage on cmov. */
17434 if (mode == CCmode)
17435 suffix = fp ? "nbe" : "a";
17436 else
17437 gcc_unreachable ();
17438 break;
17439 case LT:
17440 switch (mode)
17441 {
17442 case CCNOmode:
17443 case CCGOCmode:
17444 suffix = "s";
17445 break;
17446
17447 case CCmode:
17448 case CCGCmode:
17449 suffix = "l";
17450 break;
17451
17452 default:
17453 gcc_unreachable ();
17454 }
17455 break;
17456 case LTU:
17457 if (mode == CCmode)
17458 suffix = "b";
17459 else if (mode == CCCmode)
17460 suffix = fp ? "b" : "c";
17461 else
17462 gcc_unreachable ();
17463 break;
17464 case GE:
17465 switch (mode)
17466 {
17467 case CCNOmode:
17468 case CCGOCmode:
17469 suffix = "ns";
17470 break;
17471
17472 case CCmode:
17473 case CCGCmode:
17474 suffix = "ge";
17475 break;
17476
17477 default:
17478 gcc_unreachable ();
17479 }
17480 break;
17481 case GEU:
17482 if (mode == CCmode)
17483 suffix = "nb";
17484 else if (mode == CCCmode)
17485 suffix = fp ? "nb" : "nc";
17486 else
17487 gcc_unreachable ();
17488 break;
17489 case LE:
17490 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17491 suffix = "le";
17492 break;
17493 case LEU:
17494 if (mode == CCmode)
17495 suffix = "be";
17496 else
17497 gcc_unreachable ();
17498 break;
17499 case UNORDERED:
17500 suffix = fp ? "u" : "p";
17501 break;
17502 case ORDERED:
17503 suffix = fp ? "nu" : "np";
17504 break;
17505 default:
17506 gcc_unreachable ();
17507 }
17508 fputs (suffix, file);
17509 }
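
/* For instance, with integer CCmode and no reversal: GT prints "g", LTU
   prints "b" and GEU prints "nb"; with REVERSE set, GT is first turned into
   LE and prints "le".  The FP flag only changes the handful of suffixes
   noted above, e.g. GTU prints "nbe" instead of "a".  */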
17510
17511 /* Print the name of register X to FILE based on its machine mode and number.
17512 If CODE is 'w', pretend the mode is HImode.
17513 If CODE is 'b', pretend the mode is QImode.
17514 If CODE is 'k', pretend the mode is SImode.
17515 If CODE is 'q', pretend the mode is DImode.
17516 If CODE is 'x', pretend the mode is V4SFmode.
17517 If CODE is 't', pretend the mode is V8SFmode.
17518 If CODE is 'g', pretend the mode is V16SFmode.
17519 If CODE is 'h', pretend the reg is the 'high' byte register.
17520 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17521 If CODE is 'd', duplicate the operand for AVX instruction.
17522 */
17523
17524 void
17525 print_reg (rtx x, int code, FILE *file)
17526 {
17527 const char *reg;
17528 int msize;
17529 unsigned int regno;
17530 bool duplicated;
17531
17532 if (ASSEMBLER_DIALECT == ASM_ATT)
17533 putc ('%', file);
17534
17535 if (x == pc_rtx)
17536 {
17537 gcc_assert (TARGET_64BIT);
17538 fputs ("rip", file);
17539 return;
17540 }
17541
17542 if (code == 'y' && STACK_TOP_P (x))
17543 {
17544 fputs ("st(0)", file);
17545 return;
17546 }
17547
17548 if (code == 'w')
17549 msize = 2;
17550 else if (code == 'b')
17551 msize = 1;
17552 else if (code == 'k')
17553 msize = 4;
17554 else if (code == 'q')
17555 msize = 8;
17556 else if (code == 'h')
17557 msize = 0;
17558 else if (code == 'x')
17559 msize = 16;
17560 else if (code == 't')
17561 msize = 32;
17562 else if (code == 'g')
17563 msize = 64;
17564 else
17565 msize = GET_MODE_SIZE (GET_MODE (x));
17566
17567 regno = true_regnum (x);
17568
17569 gcc_assert (regno != ARG_POINTER_REGNUM
17570 && regno != FRAME_POINTER_REGNUM
17571 && regno != FPSR_REG
17572 && regno != FPCR_REG);
17573
17574 if (regno == FLAGS_REG)
17575 {
17576 output_operand_lossage ("invalid use of asm flag output");
17577 return;
17578 }
17579
17580 duplicated = code == 'd' && TARGET_AVX;
17581
17582 switch (msize)
17583 {
17584 case 8:
17585 case 4:
17586 if (LEGACY_INT_REGNO_P (regno))
17587 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17588 /* FALLTHRU */
17589 case 16:
17590 case 12:
17591 case 2:
17592 normal:
17593 reg = hi_reg_name[regno];
17594 break;
17595 case 1:
17596 if (regno >= ARRAY_SIZE (qi_reg_name))
17597 goto normal;
17598 reg = qi_reg_name[regno];
17599 break;
17600 case 0:
17601 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17602 goto normal;
17603 reg = qi_high_reg_name[regno];
17604 break;
17605 case 32:
17606 case 64:
17607 if (SSE_REGNO_P (regno))
17608 {
17609 gcc_assert (!duplicated);
17610 putc (msize == 32 ? 'y' : 'z', file);
17611 reg = hi_reg_name[regno] + 1;
17612 break;
17613 }
17614 goto normal;
17615 default:
17616 gcc_unreachable ();
17617 }
17618
17619 fputs (reg, file);
17620
17621 /* Irritatingly, AMD extended registers use a different naming
17622 convention: "r%d[bwd]". */
17623 if (REX_INT_REGNO_P (regno))
17624 {
17625 gcc_assert (TARGET_64BIT);
17626 switch (msize)
17627 {
17628 case 0:
17629 error ("extended registers have no high halves");
17630 break;
17631 case 1:
17632 putc ('b', file);
17633 break;
17634 case 2:
17635 putc ('w', file);
17636 break;
17637 case 4:
17638 putc ('d', file);
17639 break;
17640 case 8:
17641 /* no suffix */
17642 break;
17643 default:
17644 error ("unsupported operand size for extended register");
17645 break;
17646 }
17647 return;
17648 }
17649
17650 if (duplicated)
17651 {
17652 if (ASSEMBLER_DIALECT == ASM_ATT)
17653 fprintf (file, ", %%%s", reg);
17654 else
17655 fprintf (file, ", %s", reg);
17656 }
17657 }
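
/* Examples of the above (AT&T dialect, so a leading '%' is printed):
   (reg:SI ax) with code 'w' gives "%ax", with code 'h' gives "%ah" and with
   code 'b' gives "%al"; (reg:DI ax) with no code gives "%rax" on 64-bit
   targets.  A REX register such as r10 printed with code 'k' gives "%r10d",
   following the "r%d[bwd]" convention handled at the end of the function.  */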
17658
17659 /* Meaning of CODE:
17660 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17661 C -- print opcode suffix for set/cmov insn.
17662 c -- like C, but print reversed condition
17663 F,f -- likewise, but for floating-point.
17664 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17665 otherwise nothing
17666 R -- print embedded rounding and sae.
17667 r -- print only sae.
17668 z -- print the opcode suffix for the size of the current operand.
17669 Z -- likewise, with special suffixes for x87 instructions.
17670 * -- print a star (in certain assembler syntax)
17671 A -- print an absolute memory reference.
17672 E -- print address with DImode register names if TARGET_64BIT.
17673 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17674 s -- print a shift double count, followed by the assembler's argument
17675 delimiter.
17676 b -- print the QImode name of the register for the indicated operand.
17677 %b0 would print %al if operands[0] is reg 0.
17678 w -- likewise, print the HImode name of the register.
17679 k -- likewise, print the SImode name of the register.
17680 q -- likewise, print the DImode name of the register.
17681 x -- likewise, print the V4SFmode name of the register.
17682 t -- likewise, print the V8SFmode name of the register.
17683 g -- likewise, print the V16SFmode name of the register.
17684 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17685 y -- print "st(0)" instead of "st" as a register.
17686 d -- print duplicated register operand for AVX instruction.
17687 D -- print condition for SSE cmp instruction.
17688 P -- if PIC, print an @PLT suffix.
17689 p -- print raw symbol name.
17690 X -- don't print any sort of PIC '@' suffix for a symbol.
17691 & -- print some in-use local-dynamic symbol name.
17692 H -- print a memory address offset by 8; used for sse high-parts
17693 Y -- print condition for XOP pcom* instruction.
17694 + -- print a branch hint as 'cs' or 'ds' prefix
17695 ; -- print a semicolon (after prefixes due to bug in older gas).
17696 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17697 @ -- print a segment register of thread base pointer load
17698 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17699 ! -- print MPX prefix for jxx/call/ret instructions if required.
17700 */
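
/* As a concrete illustration of how these codes appear in output templates:
   a template such as "mov%z0\t{%1, %0|%0, %1}" lets %z0 pick the b/w/l/q
   suffix from operand 0's integer mode, while "%b0" names the QImode
   register (e.g. "%al") and "%k0" the SImode register (e.g. "%eax"); the
   '{att|intel}' braces select between the two assembler dialects.  */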
17701
17702 void
17703 ix86_print_operand (FILE *file, rtx x, int code)
17704 {
17705 if (code)
17706 {
17707 switch (code)
17708 {
17709 case 'A':
17710 switch (ASSEMBLER_DIALECT)
17711 {
17712 case ASM_ATT:
17713 putc ('*', file);
17714 break;
17715
17716 case ASM_INTEL:
17717 /* Intel syntax. For absolute addresses, registers should not
17718 be surrounded by brackets. */
17719 if (!REG_P (x))
17720 {
17721 putc ('[', file);
17722 ix86_print_operand (file, x, 0);
17723 putc (']', file);
17724 return;
17725 }
17726 break;
17727
17728 default:
17729 gcc_unreachable ();
17730 }
17731
17732 ix86_print_operand (file, x, 0);
17733 return;
17734
17735 case 'E':
17736 /* Wrap address in an UNSPEC to declare special handling. */
17737 if (TARGET_64BIT)
17738 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17739
17740 output_address (VOIDmode, x);
17741 return;
17742
17743 case 'L':
17744 if (ASSEMBLER_DIALECT == ASM_ATT)
17745 putc ('l', file);
17746 return;
17747
17748 case 'W':
17749 if (ASSEMBLER_DIALECT == ASM_ATT)
17750 putc ('w', file);
17751 return;
17752
17753 case 'B':
17754 if (ASSEMBLER_DIALECT == ASM_ATT)
17755 putc ('b', file);
17756 return;
17757
17758 case 'Q':
17759 if (ASSEMBLER_DIALECT == ASM_ATT)
17760 putc ('l', file);
17761 return;
17762
17763 case 'S':
17764 if (ASSEMBLER_DIALECT == ASM_ATT)
17765 putc ('s', file);
17766 return;
17767
17768 case 'T':
17769 if (ASSEMBLER_DIALECT == ASM_ATT)
17770 putc ('t', file);
17771 return;
17772
17773 case 'O':
17774 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17775 if (ASSEMBLER_DIALECT != ASM_ATT)
17776 return;
17777
17778 switch (GET_MODE_SIZE (GET_MODE (x)))
17779 {
17780 case 2:
17781 putc ('w', file);
17782 break;
17783
17784 case 4:
17785 putc ('l', file);
17786 break;
17787
17788 case 8:
17789 putc ('q', file);
17790 break;
17791
17792 default:
17793 output_operand_lossage
17794 ("invalid operand size for operand code 'O'");
17795 return;
17796 }
17797
17798 putc ('.', file);
17799 #endif
17800 return;
17801
17802 case 'z':
17803 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17804 {
17805 /* Opcodes don't get size suffixes if using Intel opcodes. */
17806 if (ASSEMBLER_DIALECT == ASM_INTEL)
17807 return;
17808
17809 switch (GET_MODE_SIZE (GET_MODE (x)))
17810 {
17811 case 1:
17812 putc ('b', file);
17813 return;
17814
17815 case 2:
17816 putc ('w', file);
17817 return;
17818
17819 case 4:
17820 putc ('l', file);
17821 return;
17822
17823 case 8:
17824 putc ('q', file);
17825 return;
17826
17827 default:
17828 output_operand_lossage
17829 ("invalid operand size for operand code 'z'");
17830 return;
17831 }
17832 }
17833
17834 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17835 warning
17836 (0, "non-integer operand used with operand code 'z'");
17837 /* FALLTHRU */
17838
17839 case 'Z':
17840 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17841 if (ASSEMBLER_DIALECT == ASM_INTEL)
17842 return;
17843
17844 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17845 {
17846 switch (GET_MODE_SIZE (GET_MODE (x)))
17847 {
17848 case 2:
17849 #ifdef HAVE_AS_IX86_FILDS
17850 putc ('s', file);
17851 #endif
17852 return;
17853
17854 case 4:
17855 putc ('l', file);
17856 return;
17857
17858 case 8:
17859 #ifdef HAVE_AS_IX86_FILDQ
17860 putc ('q', file);
17861 #else
17862 fputs ("ll", file);
17863 #endif
17864 return;
17865
17866 default:
17867 break;
17868 }
17869 }
17870 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17871 {
17872 /* 387 opcodes don't get size suffixes
17873 if the operands are registers. */
17874 if (STACK_REG_P (x))
17875 return;
17876
17877 switch (GET_MODE_SIZE (GET_MODE (x)))
17878 {
17879 case 4:
17880 putc ('s', file);
17881 return;
17882
17883 case 8:
17884 putc ('l', file);
17885 return;
17886
17887 case 12:
17888 case 16:
17889 putc ('t', file);
17890 return;
17891
17892 default:
17893 break;
17894 }
17895 }
17896 else
17897 {
17898 output_operand_lossage
17899 ("invalid operand type used with operand code 'Z'");
17900 return;
17901 }
17902
17903 output_operand_lossage
17904 ("invalid operand size for operand code 'Z'");
17905 return;
17906
17907 case 'd':
17908 case 'b':
17909 case 'w':
17910 case 'k':
17911 case 'q':
17912 case 'h':
17913 case 't':
17914 case 'g':
17915 case 'y':
17916 case 'x':
17917 case 'X':
17918 case 'P':
17919 case 'p':
17920 break;
17921
17922 case 's':
17923 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17924 {
17925 ix86_print_operand (file, x, 0);
17926 fputs (", ", file);
17927 }
17928 return;
17929
17930 case 'Y':
17931 switch (GET_CODE (x))
17932 {
17933 case NE:
17934 fputs ("neq", file);
17935 break;
17936 case EQ:
17937 fputs ("eq", file);
17938 break;
17939 case GE:
17940 case GEU:
17941 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17942 break;
17943 case GT:
17944 case GTU:
17945 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17946 break;
17947 case LE:
17948 case LEU:
17949 fputs ("le", file);
17950 break;
17951 case LT:
17952 case LTU:
17953 fputs ("lt", file);
17954 break;
17955 case UNORDERED:
17956 fputs ("unord", file);
17957 break;
17958 case ORDERED:
17959 fputs ("ord", file);
17960 break;
17961 case UNEQ:
17962 fputs ("ueq", file);
17963 break;
17964 case UNGE:
17965 fputs ("nlt", file);
17966 break;
17967 case UNGT:
17968 fputs ("nle", file);
17969 break;
17970 case UNLE:
17971 fputs ("ule", file);
17972 break;
17973 case UNLT:
17974 fputs ("ult", file);
17975 break;
17976 case LTGT:
17977 fputs ("une", file);
17978 break;
17979 default:
17980 output_operand_lossage ("operand is not a condition code, "
17981 "invalid operand code 'Y'");
17982 return;
17983 }
17984 return;
17985
17986 case 'D':
17987 /* A little bit of brain damage here. The SSE compare instructions
17988 use completely different names for the comparisons than the fp
17989 conditional moves do. */
17990 switch (GET_CODE (x))
17991 {
17992 case UNEQ:
17993 if (TARGET_AVX)
17994 {
17995 fputs ("eq_us", file);
17996 break;
17997 }
17998 /* FALLTHRU */
17999 case EQ:
18000 fputs ("eq", file);
18001 break;
18002 case UNLT:
18003 if (TARGET_AVX)
18004 {
18005 fputs ("nge", file);
18006 break;
18007 }
18008 /* FALLTHRU */
18009 case LT:
18010 fputs ("lt", file);
18011 break;
18012 case UNLE:
18013 if (TARGET_AVX)
18014 {
18015 fputs ("ngt", file);
18016 break;
18017 }
18018 /* FALLTHRU */
18019 case LE:
18020 fputs ("le", file);
18021 break;
18022 case UNORDERED:
18023 fputs ("unord", file);
18024 break;
18025 case LTGT:
18026 if (TARGET_AVX)
18027 {
18028 fputs ("neq_oq", file);
18029 break;
18030 }
18031 /* FALLTHRU */
18032 case NE:
18033 fputs ("neq", file);
18034 break;
18035 case GE:
18036 if (TARGET_AVX)
18037 {
18038 fputs ("ge", file);
18039 break;
18040 }
18041 /* FALLTHRU */
18042 case UNGE:
18043 fputs ("nlt", file);
18044 break;
18045 case GT:
18046 if (TARGET_AVX)
18047 {
18048 fputs ("gt", file);
18049 break;
18050 }
18051 /* FALLTHRU */
18052 case UNGT:
18053 fputs ("nle", file);
18054 break;
18055 case ORDERED:
18056 fputs ("ord", file);
18057 break;
18058 default:
18059 output_operand_lossage ("operand is not a condition code, "
18060 "invalid operand code 'D'");
18061 return;
18062 }
18063 return;
18064
18065 case 'F':
18066 case 'f':
18067 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18068 if (ASSEMBLER_DIALECT == ASM_ATT)
18069 putc ('.', file);
18070 gcc_fallthrough ();
18071 #endif
18072
18073 case 'C':
18074 case 'c':
18075 if (!COMPARISON_P (x))
18076 {
18077 output_operand_lossage ("operand is not a condition code, "
18078 "invalid operand code '%c'", code);
18079 return;
18080 }
18081 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18082 code == 'c' || code == 'f',
18083 code == 'F' || code == 'f',
18084 file);
18085 return;
18086
18087 case 'H':
18088 if (!offsettable_memref_p (x))
18089 {
18090 output_operand_lossage ("operand is not an offsettable memory "
18091 "reference, invalid operand code 'H'");
18092 return;
18093 }
18094 /* It doesn't actually matter what mode we use here, as we're
18095 only going to use this for printing. */
18096 x = adjust_address_nv (x, DImode, 8);
18097 /* Output 'qword ptr' for intel assembler dialect. */
18098 if (ASSEMBLER_DIALECT == ASM_INTEL)
18099 code = 'q';
18100 break;
18101
18102 case 'K':
18103 gcc_assert (CONST_INT_P (x));
18104
18105 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18106 #ifdef HAVE_AS_IX86_HLE
18107 fputs ("xacquire ", file);
18108 #else
18109 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18110 #endif
18111 else if (INTVAL (x) & IX86_HLE_RELEASE)
18112 #ifdef HAVE_AS_IX86_HLE
18113 fputs ("xrelease ", file);
18114 #else
18115 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18116 #endif
18117 /* We do not want to print the value of the operand. */
18118 return;
18119
18120 case 'N':
18121 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18122 fputs ("{z}", file);
18123 return;
18124
18125 case 'r':
18126 gcc_assert (CONST_INT_P (x));
18127 gcc_assert (INTVAL (x) == ROUND_SAE);
18128
18129 if (ASSEMBLER_DIALECT == ASM_INTEL)
18130 fputs (", ", file);
18131
18132 fputs ("{sae}", file);
18133
18134 if (ASSEMBLER_DIALECT == ASM_ATT)
18135 fputs (", ", file);
18136
18137 return;
18138
18139 case 'R':
18140 gcc_assert (CONST_INT_P (x));
18141
18142 if (ASSEMBLER_DIALECT == ASM_INTEL)
18143 fputs (", ", file);
18144
18145 switch (INTVAL (x))
18146 {
18147 case ROUND_NEAREST_INT | ROUND_SAE:
18148 fputs ("{rn-sae}", file);
18149 break;
18150 case ROUND_NEG_INF | ROUND_SAE:
18151 fputs ("{rd-sae}", file);
18152 break;
18153 case ROUND_POS_INF | ROUND_SAE:
18154 fputs ("{ru-sae}", file);
18155 break;
18156 case ROUND_ZERO | ROUND_SAE:
18157 fputs ("{rz-sae}", file);
18158 break;
18159 default:
18160 gcc_unreachable ();
18161 }
18162
18163 if (ASSEMBLER_DIALECT == ASM_ATT)
18164 fputs (", ", file);
18165
18166 return;
18167
18168 case '*':
18169 if (ASSEMBLER_DIALECT == ASM_ATT)
18170 putc ('*', file);
18171 return;
18172
18173 case '&':
18174 {
18175 const char *name = get_some_local_dynamic_name ();
18176 if (name == NULL)
18177 output_operand_lossage ("'%%&' used without any "
18178 "local dynamic TLS references");
18179 else
18180 assemble_name (file, name);
18181 return;
18182 }
18183
18184 case '+':
18185 {
18186 rtx x;
18187
18188 if (!optimize
18189 || optimize_function_for_size_p (cfun)
18190 || !TARGET_BRANCH_PREDICTION_HINTS)
18191 return;
18192
18193 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18194 if (x)
18195 {
18196 int pred_val = XINT (x, 0);
18197
18198 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18199 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18200 {
18201 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18202 bool cputaken
18203 = final_forward_branch_p (current_output_insn) == 0;
18204
18205 /* Emit hints only when the default branch prediction
18206 heuristics would fail. */
18207 if (taken != cputaken)
18208 {
18209 /* We use 3e (DS) prefix for taken branches and
18210 2e (CS) prefix for not taken branches. */
18211 if (taken)
18212 fputs ("ds ; ", file);
18213 else
18214 fputs ("cs ; ", file);
18215 }
18216 }
18217 }
18218 return;
18219 }
18220
18221 case ';':
18222 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18223 putc (';', file);
18224 #endif
18225 return;
18226
18227 case '@':
18228 if (ASSEMBLER_DIALECT == ASM_ATT)
18229 putc ('%', file);
18230
18231 /* The kernel uses a different segment register for performance
18232 reasons; a system call would not have to trash the userspace
18233 segment register, which would be expensive. */
18234 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18235 fputs ("fs", file);
18236 else
18237 fputs ("gs", file);
18238 return;
18239
18240 case '~':
18241 putc (TARGET_AVX2 ? 'i' : 'f', file);
18242 return;
18243
18244 case '^':
18245 if (TARGET_64BIT && Pmode != word_mode)
18246 fputs ("addr32 ", file);
18247 return;
18248
18249 case '!':
18250 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18251 fputs ("bnd ", file);
18252 return;
18253
18254 default:
18255 output_operand_lossage ("invalid operand code '%c'", code);
18256 }
18257 }
18258
18259 if (REG_P (x))
18260 print_reg (x, code, file);
18261
18262 else if (MEM_P (x))
18263 {
18264 rtx addr = XEXP (x, 0);
18265
18266 /* No `byte ptr' prefix for call instructions ... */
18267 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18268 {
18269 machine_mode mode = GET_MODE (x);
18270 const char *size;
18271
18272 /* Check for explicit size override codes. */
18273 if (code == 'b')
18274 size = "BYTE";
18275 else if (code == 'w')
18276 size = "WORD";
18277 else if (code == 'k')
18278 size = "DWORD";
18279 else if (code == 'q')
18280 size = "QWORD";
18281 else if (code == 'x')
18282 size = "XMMWORD";
18283 else if (code == 't')
18284 size = "YMMWORD";
18285 else if (code == 'g')
18286 size = "ZMMWORD";
18287 else if (mode == BLKmode)
18288 /* ... or BLKmode operands, when not overridden. */
18289 size = NULL;
18290 else
18291 switch (GET_MODE_SIZE (mode))
18292 {
18293 case 1: size = "BYTE"; break;
18294 case 2: size = "WORD"; break;
18295 case 4: size = "DWORD"; break;
18296 case 8: size = "QWORD"; break;
18297 case 12: size = "TBYTE"; break;
18298 case 16:
18299 if (mode == XFmode)
18300 size = "TBYTE";
18301 else
18302 size = "XMMWORD";
18303 break;
18304 case 32: size = "YMMWORD"; break;
18305 case 64: size = "ZMMWORD"; break;
18306 default:
18307 gcc_unreachable ();
18308 }
18309 if (size)
18310 {
18311 fputs (size, file);
18312 fputs (" PTR ", file);
18313 }
18314 }
18315
18316 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18317 output_operand_lossage ("invalid constraints for operand");
18318 else
18319 ix86_print_operand_address_as
18320 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18321 }
18322
18323 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18324 {
18325 long l;
18326
18327 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18328
18329 if (ASSEMBLER_DIALECT == ASM_ATT)
18330 putc ('$', file);
18331 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18332 if (code == 'q')
18333 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18334 (unsigned long long) (int) l);
18335 else
18336 fprintf (file, "0x%08x", (unsigned int) l);
18337 }
18338
18339 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18340 {
18341 long l[2];
18342
18343 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18344
18345 if (ASSEMBLER_DIALECT == ASM_ATT)
18346 putc ('$', file);
18347 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18348 }
18349
18350 /* These float cases don't actually occur as immediate operands. */
18351 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18352 {
18353 char dstr[30];
18354
18355 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18356 fputs (dstr, file);
18357 }
18358
18359 else
18360 {
18361 /* We have patterns that allow zero sets of memory, for instance.
18362 In 64-bit mode, we should probably support all 8-byte vectors,
18363 since we can in fact encode that into an immediate. */
18364 if (GET_CODE (x) == CONST_VECTOR)
18365 {
18366 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18367 x = const0_rtx;
18368 }
18369
18370 if (code != 'P' && code != 'p')
18371 {
18372 if (CONST_INT_P (x))
18373 {
18374 if (ASSEMBLER_DIALECT == ASM_ATT)
18375 putc ('$', file);
18376 }
18377 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18378 || GET_CODE (x) == LABEL_REF)
18379 {
18380 if (ASSEMBLER_DIALECT == ASM_ATT)
18381 putc ('$', file);
18382 else
18383 fputs ("OFFSET FLAT:", file);
18384 }
18385 }
18386 if (CONST_INT_P (x))
18387 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18388 else if (flag_pic || MACHOPIC_INDIRECT)
18389 output_pic_addr_const (file, x, code);
18390 else
18391 output_addr_const (file, x);
18392 }
18393 }
18394
18395 static bool
18396 ix86_print_operand_punct_valid_p (unsigned char code)
18397 {
18398 return (code == '@' || code == '*' || code == '+' || code == '&'
18399 || code == ';' || code == '~' || code == '^' || code == '!');
18400 }
18401 \f
18402 /* Print a memory operand whose address is ADDR. */
18403
18404 static void
18405 ix86_print_operand_address_as (FILE *file, rtx addr,
18406 addr_space_t as, bool no_rip)
18407 {
18408 struct ix86_address parts;
18409 rtx base, index, disp;
18410 int scale;
18411 int ok;
18412 bool vsib = false;
18413 int code = 0;
18414
18415 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18416 {
18417 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18418 gcc_assert (parts.index == NULL_RTX);
18419 parts.index = XVECEXP (addr, 0, 1);
18420 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18421 addr = XVECEXP (addr, 0, 0);
18422 vsib = true;
18423 }
18424 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18425 {
18426 gcc_assert (TARGET_64BIT);
18427 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18428 code = 'q';
18429 }
18430 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18431 {
18432 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18433 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18434 if (parts.base != NULL_RTX)
18435 {
18436 parts.index = parts.base;
18437 parts.scale = 1;
18438 }
18439 parts.base = XVECEXP (addr, 0, 0);
18440 addr = XVECEXP (addr, 0, 0);
18441 }
18442 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18443 {
18444 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18445 gcc_assert (parts.index == NULL_RTX);
18446 parts.index = XVECEXP (addr, 0, 1);
18447 addr = XVECEXP (addr, 0, 0);
18448 }
18449 else
18450 ok = ix86_decompose_address (addr, &parts);
18451
18452 gcc_assert (ok);
18453
18454 base = parts.base;
18455 index = parts.index;
18456 disp = parts.disp;
18457 scale = parts.scale;
18458
18459 if (ADDR_SPACE_GENERIC_P (as))
18460 as = parts.seg;
18461 else
18462 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18463
18464 if (!ADDR_SPACE_GENERIC_P (as))
18465 {
18466 const char *string;
18467
18468 if (as == ADDR_SPACE_SEG_FS)
18469 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18470 else if (as == ADDR_SPACE_SEG_GS)
18471 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18472 else
18473 gcc_unreachable ();
18474 fputs (string, file);
18475 }
18476
18477 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18478 if (TARGET_64BIT && !base && !index && !no_rip)
18479 {
18480 rtx symbol = disp;
18481
18482 if (GET_CODE (disp) == CONST
18483 && GET_CODE (XEXP (disp, 0)) == PLUS
18484 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18485 symbol = XEXP (XEXP (disp, 0), 0);
18486
18487 if (GET_CODE (symbol) == LABEL_REF
18488 || (GET_CODE (symbol) == SYMBOL_REF
18489 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18490 base = pc_rtx;
18491 }
18492
18493 if (!base && !index)
18494 {
18495 /* Displacement only requires special attention. */
18496 if (CONST_INT_P (disp))
18497 {
18498 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18499 fputs ("ds:", file);
18500 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18501 }
18502 /* Load the external function address via the GOT slot to avoid PLT. */
18503 else if (GET_CODE (disp) == CONST
18504 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18505 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18506 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18507 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18508 output_pic_addr_const (file, disp, 0);
18509 else if (flag_pic)
18510 output_pic_addr_const (file, disp, 0);
18511 else
18512 output_addr_const (file, disp);
18513 }
18514 else
18515 {
18516 /* Print SImode register names to force addr32 prefix. */
18517 if (SImode_address_operand (addr, VOIDmode))
18518 {
18519 if (flag_checking)
18520 {
18521 gcc_assert (TARGET_64BIT);
18522 switch (GET_CODE (addr))
18523 {
18524 case SUBREG:
18525 gcc_assert (GET_MODE (addr) == SImode);
18526 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18527 break;
18528 case ZERO_EXTEND:
18529 case AND:
18530 gcc_assert (GET_MODE (addr) == DImode);
18531 break;
18532 default:
18533 gcc_unreachable ();
18534 }
18535 }
18536 gcc_assert (!code);
18537 code = 'k';
18538 }
18539 else if (code == 0
18540 && TARGET_X32
18541 && disp
18542 && CONST_INT_P (disp)
18543 && INTVAL (disp) < -16*1024*1024)
18544 {
18545 /* X32 runs in 64-bit mode, where displacement, DISP, in
18546 address DISP(%r64), is encoded as 32-bit immediate sign-
18547 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18548 address is %r64 + 0xffffffffbffffd00. When %r64 <
18549 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18550 which is invalid for x32. The correct address is %r64
18551 - 0x40000300 == 0xf7ffdd64. To properly encode
18552 -0x40000300(%r64) for x32, we zero-extend negative
18553 displacement by forcing addr32 prefix which truncates
18554 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18555 zero-extend all negative displacements, including -1(%rsp).
18556 However, for small negative displacements, sign-extension
18557 won't cause overflow. We only zero-extend negative
18558 displacements if they are < -16*1024*1024, which is also used
18559 to check legitimate address displacements for PIC. */
18560 code = 'k';
18561 }
18562
18563 if (ASSEMBLER_DIALECT == ASM_ATT)
18564 {
18565 if (disp)
18566 {
18567 if (flag_pic)
18568 output_pic_addr_const (file, disp, 0);
18569 else if (GET_CODE (disp) == LABEL_REF)
18570 output_asm_label (disp);
18571 else
18572 output_addr_const (file, disp);
18573 }
18574
18575 putc ('(', file);
18576 if (base)
18577 print_reg (base, code, file);
18578 if (index)
18579 {
18580 putc (',', file);
18581 print_reg (index, vsib ? 0 : code, file);
18582 if (scale != 1 || vsib)
18583 fprintf (file, ",%d", scale);
18584 }
18585 putc (')', file);
18586 }
18587 else
18588 {
18589 rtx offset = NULL_RTX;
18590
18591 if (disp)
18592 {
18593 /* Pull out the offset of a symbol; print any symbol itself. */
18594 if (GET_CODE (disp) == CONST
18595 && GET_CODE (XEXP (disp, 0)) == PLUS
18596 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18597 {
18598 offset = XEXP (XEXP (disp, 0), 1);
18599 disp = gen_rtx_CONST (VOIDmode,
18600 XEXP (XEXP (disp, 0), 0));
18601 }
18602
18603 if (flag_pic)
18604 output_pic_addr_const (file, disp, 0);
18605 else if (GET_CODE (disp) == LABEL_REF)
18606 output_asm_label (disp);
18607 else if (CONST_INT_P (disp))
18608 offset = disp;
18609 else
18610 output_addr_const (file, disp);
18611 }
18612
18613 putc ('[', file);
18614 if (base)
18615 {
18616 print_reg (base, code, file);
18617 if (offset)
18618 {
18619 if (INTVAL (offset) >= 0)
18620 putc ('+', file);
18621 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18622 }
18623 }
18624 else if (offset)
18625 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18626 else
18627 putc ('0', file);
18628
18629 if (index)
18630 {
18631 putc ('+', file);
18632 print_reg (index, vsib ? 0 : code, file);
18633 if (scale != 1 || vsib)
18634 fprintf (file, "*%d", scale);
18635 }
18636 putc (']', file);
18637 }
18638 }
18639 }
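
/* Sketch of the two dialects produced above for the same decomposed address
   base=%ebx, index=%ecx, scale=4, disp=16 (symbolic displacements instead go
   through output_pic_addr_const / output_addr_const):

       AT&T:   16(%ebx,%ecx,4)
       Intel:  [ebx+16+ecx*4]   */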
18640
18641 static void
18642 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18643 {
18644 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18645 }
18646
18647 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18648
18649 static bool
18650 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18651 {
18652 rtx op;
18653
18654 if (GET_CODE (x) != UNSPEC)
18655 return false;
18656
18657 op = XVECEXP (x, 0, 0);
18658 switch (XINT (x, 1))
18659 {
18660 case UNSPEC_GOTTPOFF:
18661 output_addr_const (file, op);
18662 /* FIXME: This might be @TPOFF in Sun ld. */
18663 fputs ("@gottpoff", file);
18664 break;
18665 case UNSPEC_TPOFF:
18666 output_addr_const (file, op);
18667 fputs ("@tpoff", file);
18668 break;
18669 case UNSPEC_NTPOFF:
18670 output_addr_const (file, op);
18671 if (TARGET_64BIT)
18672 fputs ("@tpoff", file);
18673 else
18674 fputs ("@ntpoff", file);
18675 break;
18676 case UNSPEC_DTPOFF:
18677 output_addr_const (file, op);
18678 fputs ("@dtpoff", file);
18679 break;
18680 case UNSPEC_GOTNTPOFF:
18681 output_addr_const (file, op);
18682 if (TARGET_64BIT)
18683 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18684 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18685 else
18686 fputs ("@gotntpoff", file);
18687 break;
18688 case UNSPEC_INDNTPOFF:
18689 output_addr_const (file, op);
18690 fputs ("@indntpoff", file);
18691 break;
18692 #if TARGET_MACHO
18693 case UNSPEC_MACHOPIC_OFFSET:
18694 output_addr_const (file, op);
18695 putc ('-', file);
18696 machopic_output_function_base_name (file);
18697 break;
18698 #endif
18699
18700 case UNSPEC_STACK_CHECK:
18701 {
18702 int offset;
18703
18704 gcc_assert (flag_split_stack);
18705
18706 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18707 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18708 #else
18709 gcc_unreachable ();
18710 #endif
18711
18712 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18713 }
18714 break;
18715
18716 default:
18717 return false;
18718 }
18719
18720 return true;
18721 }
18722 \f
18723 /* Split one or more double-mode RTL references into pairs of half-mode
18724 references. The RTL can be REG, offsettable MEM, integer constant, or
18725 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18726 split and "num" is its length. lo_half and hi_half are output arrays
18727 that parallel "operands". */
18728
18729 void
18730 split_double_mode (machine_mode mode, rtx operands[],
18731 int num, rtx lo_half[], rtx hi_half[])
18732 {
18733 machine_mode half_mode;
18734 unsigned int byte;
18735
18736 switch (mode)
18737 {
18738 case TImode:
18739 half_mode = DImode;
18740 break;
18741 case DImode:
18742 half_mode = SImode;
18743 break;
18744 default:
18745 gcc_unreachable ();
18746 }
18747
18748 byte = GET_MODE_SIZE (half_mode);
18749
18750 while (num--)
18751 {
18752 rtx op = operands[num];
18753
18754 /* simplify_subreg refuses to split volatile memory addresses,
18755 but we still have to handle them. */
18756 if (MEM_P (op))
18757 {
18758 lo_half[num] = adjust_address (op, half_mode, 0);
18759 hi_half[num] = adjust_address (op, half_mode, byte);
18760 }
18761 else
18762 {
18763 lo_half[num] = simplify_gen_subreg (half_mode, op,
18764 GET_MODE (op) == VOIDmode
18765 ? mode : GET_MODE (op), 0);
18766 hi_half[num] = simplify_gen_subreg (half_mode, op,
18767 GET_MODE (op) == VOIDmode
18768 ? mode : GET_MODE (op), byte);
18769 }
18770 }
18771 }
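
/* For example, splitting a DImode pair on a 32-bit target: a (reg:DI 100)
   pseudo yields (subreg:SI (reg:DI 100) 0) and (subreg:SI (reg:DI 100) 4)
   as the low and high halves, while a DImode MEM is split with
   adjust_address into the same MEM at offsets 0 and 4.  */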
18772 \f
18773 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18774 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18775 is the expression of the binary operation. The output may either be
18776 emitted here, or returned to the caller, like all output_* functions.
18777
18778 There is no guarantee that the operands are the same mode, as they
18779 might be within FLOAT or FLOAT_EXTEND expressions. */
18780
18781 #ifndef SYSV386_COMPAT
18782 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18783 wants to fix the assemblers because that causes incompatibility
18784 with gcc. No-one wants to fix gcc because that causes
18785 incompatibility with assemblers... You can use the option of
18786 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18787 #define SYSV386_COMPAT 1
18788 #endif
18789
18790 const char *
18791 output_387_binary_op (rtx_insn *insn, rtx *operands)
18792 {
18793 static char buf[40];
18794 const char *p;
18795 const char *ssep;
18796 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18797
18798 /* Even if we do not want to check the inputs, this documents the
18799 input constraints, which helps in understanding the following code. */
18800 if (flag_checking)
18801 {
18802 if (STACK_REG_P (operands[0])
18803 && ((REG_P (operands[1])
18804 && REGNO (operands[0]) == REGNO (operands[1])
18805 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18806 || (REG_P (operands[2])
18807 && REGNO (operands[0]) == REGNO (operands[2])
18808 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18809 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18810 ; /* ok */
18811 else
18812 gcc_assert (is_sse);
18813 }
18814
18815 switch (GET_CODE (operands[3]))
18816 {
18817 case PLUS:
18818 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18819 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18820 p = "fiadd";
18821 else
18822 p = "fadd";
18823 ssep = "vadd";
18824 break;
18825
18826 case MINUS:
18827 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18828 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18829 p = "fisub";
18830 else
18831 p = "fsub";
18832 ssep = "vsub";
18833 break;
18834
18835 case MULT:
18836 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18837 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18838 p = "fimul";
18839 else
18840 p = "fmul";
18841 ssep = "vmul";
18842 break;
18843
18844 case DIV:
18845 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18846 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18847 p = "fidiv";
18848 else
18849 p = "fdiv";
18850 ssep = "vdiv";
18851 break;
18852
18853 default:
18854 gcc_unreachable ();
18855 }
18856
18857 if (is_sse)
18858 {
18859 if (TARGET_AVX)
18860 {
18861 strcpy (buf, ssep);
18862 if (GET_MODE (operands[0]) == SFmode)
18863 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18864 else
18865 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18866 }
18867 else
18868 {
18869 strcpy (buf, ssep + 1);
18870 if (GET_MODE (operands[0]) == SFmode)
18871 strcat (buf, "ss\t{%2, %0|%0, %2}");
18872 else
18873 strcat (buf, "sd\t{%2, %0|%0, %2}");
18874 }
18875 return buf;
18876 }
18877 strcpy (buf, p);
18878
18879 switch (GET_CODE (operands[3]))
18880 {
18881 case MULT:
18882 case PLUS:
18883 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18884 std::swap (operands[1], operands[2]);
18885
18886 /* We know operands[0] == operands[1]. */
18887
18888 if (MEM_P (operands[2]))
18889 {
18890 p = "%Z2\t%2";
18891 break;
18892 }
18893
18894 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18895 {
18896 if (STACK_TOP_P (operands[0]))
18897 /* How is it that we are storing to a dead operand[2]?
18898 Well, presumably operands[1] is dead too. We can't
18899 store the result to st(0) as st(0) gets popped on this
18900 instruction. Instead store to operands[2] (which I
18901 think has to be st(1)). st(1) will be popped later.
18902 gcc <= 2.8.1 didn't have this check and generated
18903 assembly code that the Unixware assembler rejected. */
18904 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18905 else
18906 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18907 break;
18908 }
18909
18910 if (STACK_TOP_P (operands[0]))
18911 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18912 else
18913 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18914 break;
18915
18916 case MINUS:
18917 case DIV:
18918 if (MEM_P (operands[1]))
18919 {
18920 p = "r%Z1\t%1";
18921 break;
18922 }
18923
18924 if (MEM_P (operands[2]))
18925 {
18926 p = "%Z2\t%2";
18927 break;
18928 }
18929
18930 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18931 {
18932 #if SYSV386_COMPAT
18933 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18934 derived assemblers, confusingly reverse the direction of
18935 the operation for fsub{r} and fdiv{r} when the
18936 destination register is not st(0). The Intel assembler
18937 doesn't have this brain damage. Read !SYSV386_COMPAT to
18938 figure out what the hardware really does. */
18939 if (STACK_TOP_P (operands[0]))
18940 p = "{p\t%0, %2|rp\t%2, %0}";
18941 else
18942 p = "{rp\t%2, %0|p\t%0, %2}";
18943 #else
18944 if (STACK_TOP_P (operands[0]))
18945 /* As above for fmul/fadd, we can't store to st(0). */
18946 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18947 else
18948 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18949 #endif
18950 break;
18951 }
18952
18953 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18954 {
18955 #if SYSV386_COMPAT
18956 if (STACK_TOP_P (operands[0]))
18957 p = "{rp\t%0, %1|p\t%1, %0}";
18958 else
18959 p = "{p\t%1, %0|rp\t%0, %1}";
18960 #else
18961 if (STACK_TOP_P (operands[0]))
18962 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18963 else
18964 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18965 #endif
18966 break;
18967 }
18968
18969 if (STACK_TOP_P (operands[0]))
18970 {
18971 if (STACK_TOP_P (operands[1]))
18972 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18973 else
18974 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18975 break;
18976 }
18977 else if (STACK_TOP_P (operands[1]))
18978 {
18979 #if SYSV386_COMPAT
18980 p = "{\t%1, %0|r\t%0, %1}";
18981 #else
18982 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18983 #endif
18984 }
18985 else
18986 {
18987 #if SYSV386_COMPAT
18988 p = "{r\t%2, %0|\t%0, %2}";
18989 #else
18990 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18991 #endif
18992 }
18993 break;
18994
18995 default:
18996 gcc_unreachable ();
18997 }
18998
18999 strcat (buf, p);
19000 return buf;
19001 }
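
/* Two examples of the strings assembled above: an SFmode SSE add without
   AVX produces "addss\t{%2, %0|%0, %2}" (ssep + 1 skips the 'v'), whereas
   the AVX form is "vaddss\t{%2, %1, %0|%0, %1, %2}" with a separate
   destination.  The x87 paths instead append one of the p/rp/%Z suffix
   templates to the "fadd"/"fsub"/"fmul"/"fdiv" mnemonic chosen above.  */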
19002
19003 /* Return needed mode for entity in optimize_mode_switching pass. */
19004
19005 static int
19006 ix86_dirflag_mode_needed (rtx_insn *insn)
19007 {
19008 if (CALL_P (insn))
19009 {
19010 if (cfun->machine->func_type == TYPE_NORMAL)
19011 return X86_DIRFLAG_ANY;
19012 else
19013 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19014 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19015 }
19016
19017 if (recog_memoized (insn) < 0)
19018 return X86_DIRFLAG_ANY;
19019
19020 if (get_attr_type (insn) == TYPE_STR)
19021 {
19022 /* Emit cld instruction if stringops are used in the function. */
19023 if (cfun->machine->func_type == TYPE_NORMAL)
19024 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19025 else
19026 return X86_DIRFLAG_RESET;
19027 }
19028
19029 return X86_DIRFLAG_ANY;
19030 }
19031
19032 /* Check if a 256bit AVX register is referenced inside of EXP. */
19033
19034 static bool
19035 ix86_check_avx256_register (const_rtx exp)
19036 {
19037 if (SUBREG_P (exp))
19038 exp = SUBREG_REG (exp);
19039
19040 return (REG_P (exp)
19041 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19042 }
19043
19044 /* Return needed mode for entity in optimize_mode_switching pass. */
19045
19046 static int
19047 ix86_avx_u128_mode_needed (rtx_insn *insn)
19048 {
19049 if (CALL_P (insn))
19050 {
19051 rtx link;
19052
19053 /* Needed mode is set to AVX_U128_CLEAN if there are
19054 no 256bit modes used in function arguments. */
19055 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19056 link;
19057 link = XEXP (link, 1))
19058 {
19059 if (GET_CODE (XEXP (link, 0)) == USE)
19060 {
19061 rtx arg = XEXP (XEXP (link, 0), 0);
19062
19063 if (ix86_check_avx256_register (arg))
19064 return AVX_U128_DIRTY;
19065 }
19066 }
19067
19068 return AVX_U128_CLEAN;
19069 }
19070
19071 /* Require DIRTY mode if a 256bit AVX register is referenced. The
19072 hardware changes state only when a 256bit register is written to,
19073 but we need to prevent the compiler from moving the optimal
19074 insertion point above an eventual read from a 256bit register. */
19075 subrtx_iterator::array_type array;
19076 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19077 if (ix86_check_avx256_register (*iter))
19078 return AVX_U128_DIRTY;
19079
19080 return AVX_U128_ANY;
19081 }
19082
19083 /* Return mode that i387 must be switched into
19084 prior to the execution of insn. */
19085
19086 static int
19087 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19088 {
19089 enum attr_i387_cw mode;
19090
19091 /* The mode UNINITIALIZED is used to store the control word after a
19092 function call or ASM pattern. The mode ANY specifies that the
19093 function has no requirements on the control word and makes no
19094 changes in the bits we are interested in. */
19095
19096 if (CALL_P (insn)
19097 || (NONJUMP_INSN_P (insn)
19098 && (asm_noperands (PATTERN (insn)) >= 0
19099 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19100 return I387_CW_UNINITIALIZED;
19101
19102 if (recog_memoized (insn) < 0)
19103 return I387_CW_ANY;
19104
19105 mode = get_attr_i387_cw (insn);
19106
19107 switch (entity)
19108 {
19109 case I387_TRUNC:
19110 if (mode == I387_CW_TRUNC)
19111 return mode;
19112 break;
19113
19114 case I387_FLOOR:
19115 if (mode == I387_CW_FLOOR)
19116 return mode;
19117 break;
19118
19119 case I387_CEIL:
19120 if (mode == I387_CW_CEIL)
19121 return mode;
19122 break;
19123
19124 case I387_MASK_PM:
19125 if (mode == I387_CW_MASK_PM)
19126 return mode;
19127 break;
19128
19129 default:
19130 gcc_unreachable ();
19131 }
19132
19133 return I387_CW_ANY;
19134 }
19135
19136 /* Return mode that entity must be switched into
19137 prior to the execution of insn. */
19138
19139 static int
19140 ix86_mode_needed (int entity, rtx_insn *insn)
19141 {
19142 switch (entity)
19143 {
19144 case X86_DIRFLAG:
19145 return ix86_dirflag_mode_needed (insn);
19146 case AVX_U128:
19147 return ix86_avx_u128_mode_needed (insn);
19148 case I387_TRUNC:
19149 case I387_FLOOR:
19150 case I387_CEIL:
19151 case I387_MASK_PM:
19152 return ix86_i387_mode_needed (entity, insn);
19153 default:
19154 gcc_unreachable ();
19155 }
19156 return 0;
19157 }
19158
19159 /* Check if a 256bit AVX register is referenced in stores. */
19160
19161 static void
19162 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19163 {
19164 if (ix86_check_avx256_register (dest))
19165 {
19166 bool *used = (bool *) data;
19167 *used = true;
19168 }
19169 }
19170
19171 /* Calculate mode of upper 128bit AVX registers after the insn. */
19172
19173 static int
19174 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19175 {
19176 rtx pat = PATTERN (insn);
19177
19178 if (vzeroupper_operation (pat, VOIDmode)
19179 || vzeroall_operation (pat, VOIDmode))
19180 return AVX_U128_CLEAN;
19181
19182 /* We know that the state is clean after a CALL insn if no 256bit
19183 registers are used in the function return register. */
19184 if (CALL_P (insn))
19185 {
19186 bool avx_reg256_found = false;
19187 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19188
19189 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19190 }
19191
19192 /* Otherwise, return current mode. Remember that if insn
19193 references AVX 256bit registers, the mode was already changed
19194 to DIRTY from MODE_NEEDED. */
19195 return mode;
19196 }
19197
19198 /* Return the mode that an insn results in. */
19199
19200 static int
19201 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19202 {
19203 switch (entity)
19204 {
19205 case X86_DIRFLAG:
19206 return mode;
19207 case AVX_U128:
19208 return ix86_avx_u128_mode_after (mode, insn);
19209 case I387_TRUNC:
19210 case I387_FLOOR:
19211 case I387_CEIL:
19212 case I387_MASK_PM:
19213 return mode;
19214 default:
19215 gcc_unreachable ();
19216 }
19217 }
19218
19219 static int
19220 ix86_dirflag_mode_entry (void)
19221 {
19222 /* For TARGET_CLD or in the interrupt handler we can't assume
19223 direction flag state at function entry. */
19224 if (TARGET_CLD
19225 || cfun->machine->func_type != TYPE_NORMAL)
19226 return X86_DIRFLAG_ANY;
19227
19228 return X86_DIRFLAG_RESET;
19229 }
19230
19231 static int
19232 ix86_avx_u128_mode_entry (void)
19233 {
19234 tree arg;
19235
19236 /* Entry mode is set to AVX_U128_DIRTY if there are
19237 256bit modes used in function arguments. */
19238 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19239 arg = TREE_CHAIN (arg))
19240 {
19241 rtx incoming = DECL_INCOMING_RTL (arg);
19242
19243 if (incoming && ix86_check_avx256_register (incoming))
19244 return AVX_U128_DIRTY;
19245 }
19246
19247 return AVX_U128_CLEAN;
19248 }
19249
19250 /* Return a mode that ENTITY is assumed to be
19251 switched to at function entry. */
19252
19253 static int
19254 ix86_mode_entry (int entity)
19255 {
19256 switch (entity)
19257 {
19258 case X86_DIRFLAG:
19259 return ix86_dirflag_mode_entry ();
19260 case AVX_U128:
19261 return ix86_avx_u128_mode_entry ();
19262 case I387_TRUNC:
19263 case I387_FLOOR:
19264 case I387_CEIL:
19265 case I387_MASK_PM:
19266 return I387_CW_ANY;
19267 default:
19268 gcc_unreachable ();
19269 }
19270 }
19271
19272 static int
19273 ix86_avx_u128_mode_exit (void)
19274 {
19275 rtx reg = crtl->return_rtx;
19276
19277 /* Exit mode is set to AVX_U128_DIRTY if there are
19278 256bit modes used in the function return register. */
19279 if (reg && ix86_check_avx256_register (reg))
19280 return AVX_U128_DIRTY;
19281
19282 return AVX_U128_CLEAN;
19283 }
19284
19285 /* Return a mode that ENTITY is assumed to be
19286 switched to at function exit. */
19287
19288 static int
19289 ix86_mode_exit (int entity)
19290 {
19291 switch (entity)
19292 {
19293 case X86_DIRFLAG:
19294 return X86_DIRFLAG_ANY;
19295 case AVX_U128:
19296 return ix86_avx_u128_mode_exit ();
19297 case I387_TRUNC:
19298 case I387_FLOOR:
19299 case I387_CEIL:
19300 case I387_MASK_PM:
19301 return I387_CW_ANY;
19302 default:
19303 gcc_unreachable ();
19304 }
19305 }
19306
19307 static int
19308 ix86_mode_priority (int, int n)
19309 {
19310 return n;
19311 }
19312
19313 /* Output code to initialize control word copies used by trunc?f?i and
19314 rounding patterns. The current control word is read into a stack
19315 slot, modified according to MODE, and stored as the new control word. */
19316
19317 static void
19318 emit_i387_cw_initialization (int mode)
19319 {
19320 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19321 rtx new_mode;
19322
19323 enum ix86_stack_slot slot;
19324
19325 rtx reg = gen_reg_rtx (HImode);
19326
19327 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19328 emit_move_insn (reg, copy_rtx (stored_mode));
19329
19330 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19331 || optimize_insn_for_size_p ())
19332 {
19333 switch (mode)
19334 {
19335 case I387_CW_TRUNC:
19336 /* round toward zero (truncate) */
19337 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19338 slot = SLOT_CW_TRUNC;
19339 break;
19340
19341 case I387_CW_FLOOR:
19342 /* round down toward -oo */
19343 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19344 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19345 slot = SLOT_CW_FLOOR;
19346 break;
19347
19348 case I387_CW_CEIL:
19349 /* round up toward +oo */
19350 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19351 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19352 slot = SLOT_CW_CEIL;
19353 break;
19354
19355 case I387_CW_MASK_PM:
19356 /* mask precision exception for nearbyint() */
19357 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19358 slot = SLOT_CW_MASK_PM;
19359 break;
19360
19361 default:
19362 gcc_unreachable ();
19363 }
19364 }
19365 else
19366 {
19367 switch (mode)
19368 {
19369 case I387_CW_TRUNC:
19370 /* round toward zero (truncate) */
19371 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19372 slot = SLOT_CW_TRUNC;
19373 break;
19374
19375 case I387_CW_FLOOR:
19376 /* round down toward -oo */
19377 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19378 slot = SLOT_CW_FLOOR;
19379 break;
19380
19381 case I387_CW_CEIL:
19382 /* round up toward +oo */
19383 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19384 slot = SLOT_CW_CEIL;
19385 break;
19386
19387 case I387_CW_MASK_PM:
19388 /* mask precision exception for nearbyint() */
19389 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19390 slot = SLOT_CW_MASK_PM;
19391 break;
19392
19393 default:
19394 gcc_unreachable ();
19395 }
19396 }
19397
19398 gcc_assert (slot < MAX_386_STACK_LOCALS);
19399
19400 new_mode = assign_386_stack_local (HImode, slot);
19401 emit_move_insn (new_mode, reg);
19402 }
19403
19404 /* Emit vzeroupper. */
19405
19406 void
19407 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19408 {
19409 int i;
19410
19411 /* Cancel automatic vzeroupper insertion if there are
19412 live call-saved SSE registers at the insertion point. */
19413
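  /* vzeroupper zeroes bits 255:128 of every vector register, so it must not
     be emitted while a call-saved SSE register is still live at this point.  */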
19414 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19415 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19416 return;
19417
19418 if (TARGET_64BIT)
19419 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19420 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19421 return;
19422
19423 emit_insn (gen_avx_vzeroupper ());
19424 }
19425
19428 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
19429 is the set of hard registers live at the point where the insn(s)
19430 are to be inserted.  */
19431
19432 static void
19433 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19434 HARD_REG_SET regs_live)
19435 {
19436 switch (entity)
19437 {
19438 case X86_DIRFLAG:
19439 if (mode == X86_DIRFLAG_RESET)
19440 emit_insn (gen_cld ());
19441 break;
19442 case AVX_U128:
19443 if (mode == AVX_U128_CLEAN)
19444 ix86_avx_emit_vzeroupper (regs_live);
19445 break;
19446 case I387_TRUNC:
19447 case I387_FLOOR:
19448 case I387_CEIL:
19449 case I387_MASK_PM:
19450 if (mode != I387_CW_ANY
19451 && mode != I387_CW_UNINITIALIZED)
19452 emit_i387_cw_initialization (mode);
19453 break;
19454 default:
19455 gcc_unreachable ();
19456 }
19457 }
19458
19459 /* Output code for INSN to convert a float to a signed int. OPERANDS
19460 are the insn operands. The output may be [HSD]Imode and the input
19461 operand may be [SDX]Fmode. */
19462
19463 const char *
19464 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19465 {
19466 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19467 int dimode_p = GET_MODE (operands[0]) == DImode;
19468 int round_mode = get_attr_i387_cw (insn);
19469
19470 /* Jump through a hoop or two for DImode, since the hardware has no
19471 non-popping instruction. We used to do this a different way, but
19472 that was somewhat fragile and broke with post-reload splitters. */
19473 if ((dimode_p || fisttp) && !stack_top_dies)
19474 output_asm_insn ("fld\t%y1", operands);
19475
19476 gcc_assert (STACK_TOP_P (operands[1]));
19477 gcc_assert (MEM_P (operands[0]));
19478 gcc_assert (GET_MODE (operands[1]) != TFmode);
19479
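  /* Operand 2 holds the saved control word and operand 3 the copy with the
     new rounding mode; when a rounding-mode change is needed, the non-fisttp
     path below brackets the store with fldcw %3 ... fldcw %2.  fisttp (SSE3)
     always truncates regardless of the control word, so it needs neither.  */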
19480 if (fisttp)
19481 output_asm_insn ("fisttp%Z0\t%0", operands);
19482 else
19483 {
19484 if (round_mode != I387_CW_ANY)
19485 output_asm_insn ("fldcw\t%3", operands);
19486 if (stack_top_dies || dimode_p)
19487 output_asm_insn ("fistp%Z0\t%0", operands);
19488 else
19489 output_asm_insn ("fist%Z0\t%0", operands);
19490 if (round_mode != I387_CW_ANY)
19491 output_asm_insn ("fldcw\t%2", operands);
19492 }
19493
19494 return "";
19495 }
19496
19497 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19498 have the values zero or one, indicates the ffreep insn's operand
19499 from the OPERANDS array. */
19500
19501 static const char *
19502 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19503 {
19504 if (TARGET_USE_FFREEP)
19505 #ifdef HAVE_AS_IX86_FFREEP
19506 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19507 #else
19508 {
19509 static char retval[32];
19510 int regno = REGNO (operands[opno]);
19511
19512 gcc_assert (STACK_REGNO_P (regno));
19513
19514 regno -= FIRST_STACK_REG;
19515
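  /* ffreep %st(N) is encoded as the two bytes DF C0+N; emit it as a raw
     16-bit word (0xcNdf, low byte stored first) for assemblers that do not
     know the mnemonic.  */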
19516 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19517 return retval;
19518 }
19519 #endif
19520
19521 return opno ? "fstp\t%y1" : "fstp\t%y0";
19522 }
19523
19524
19525 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is true when fcomi
19526 should be used.  UNORDERED_P is true when fucom should be used.  */
19527
19528 const char *
19529 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
19530 {
19531 int stack_top_dies;
19532 rtx cmp_op0, cmp_op1;
19533 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19534
19535 if (eflags_p)
19536 {
19537 cmp_op0 = operands[0];
19538 cmp_op1 = operands[1];
19539 }
19540 else
19541 {
19542 cmp_op0 = operands[1];
19543 cmp_op1 = operands[2];
19544 }
19545
19546 if (is_sse)
19547 {
19548 if (GET_MODE (operands[0]) == SFmode)
19549 if (unordered_p)
19550 return "%vucomiss\t{%1, %0|%0, %1}";
19551 else
19552 return "%vcomiss\t{%1, %0|%0, %1}";
19553 else
19554 if (unordered_p)
19555 return "%vucomisd\t{%1, %0|%0, %1}";
19556 else
19557 return "%vcomisd\t{%1, %0|%0, %1}";
19558 }
19559
19560 gcc_assert (STACK_TOP_P (cmp_op0));
19561
19562 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19563
19564 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19565 {
19566 if (stack_top_dies)
19567 {
19568 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19569 return output_387_ffreep (operands, 1);
19570 }
19571 else
19572 return "ftst\n\tfnstsw\t%0";
19573 }
19574
19575 if (STACK_REG_P (cmp_op1)
19576 && stack_top_dies
19577 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19578 && REGNO (cmp_op1) != FIRST_STACK_REG)
19579 {
19580 /* If the top of the 387 stack dies, and the other operand
19581 is also a stack register that dies, then this must be a
19582 `fcompp' float compare.  */
19583
19584 if (eflags_p)
19585 {
19586 /* There is no double popping fcomi variant. Fortunately,
19587 eflags is immune from the fstp's cc clobbering. */
19588 if (unordered_p)
19589 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19590 else
19591 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19592 return output_387_ffreep (operands, 0);
19593 }
19594 else
19595 {
19596 if (unordered_p)
19597 return "fucompp\n\tfnstsw\t%0";
19598 else
19599 return "fcompp\n\tfnstsw\t%0";
19600 }
19601 }
19602 else
19603 {
19604 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19605
19606 static const char * const alt[16] =
19607 {
19608 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19609 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19610 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19611 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19612
19613 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19614 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19615 NULL,
19616 NULL,
19617
19618 "fcomi\t{%y1, %0|%0, %y1}",
19619 "fcomip\t{%y1, %0|%0, %y1}",
19620 "fucomi\t{%y1, %0|%0, %y1}",
19621 "fucomip\t{%y1, %0|%0, %y1}",
19622
19623 NULL,
19624 NULL,
19625 NULL,
19626 NULL
19627 };
19628
19629 int mask;
19630 const char *ret;
19631
19632 mask = eflags_p << 3;
19633 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19634 mask |= unordered_p << 1;
19635 mask |= stack_top_dies;
19636
19637 gcc_assert (mask < 16);
19638 ret = alt[mask];
19639 gcc_assert (ret);
19640
19641 return ret;
19642 }
19643 }
19644
19645 void
19646 ix86_output_addr_vec_elt (FILE *file, int value)
19647 {
19648 const char *directive = ASM_LONG;
19649
19650 #ifdef ASM_QUAD
19651 if (TARGET_LP64)
19652 directive = ASM_QUAD;
19653 #else
19654 gcc_assert (!TARGET_64BIT);
19655 #endif
19656
19657 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19658 }
19659
19660 void
19661 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19662 {
19663 const char *directive = ASM_LONG;
19664
19665 #ifdef ASM_QUAD
19666 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19667 directive = ASM_QUAD;
19668 #else
19669 gcc_assert (!TARGET_64BIT);
19670 #endif
19671 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19672 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19673 fprintf (file, "%s%s%d-%s%d\n",
19674 directive, LPREFIX, value, LPREFIX, rel);
19675 else if (HAVE_AS_GOTOFF_IN_DATA)
19676 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19677 #if TARGET_MACHO
19678 else if (TARGET_MACHO)
19679 {
19680 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19681 machopic_output_function_base_name (file);
19682 putc ('\n', file);
19683 }
19684 #endif
19685 else
19686 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19687 GOT_SYMBOL_NAME, LPREFIX, value);
19688 }
19689 \f
19690 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19691 for the target. */
19692
19693 void
19694 ix86_expand_clear (rtx dest)
19695 {
19696 rtx tmp;
19697
19698 /* We play register width games, which are only valid after reload. */
19699 gcc_assert (reload_completed);
19700
19701 /* Avoid HImode and its attendant prefix byte. */
19702 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19703 dest = gen_rtx_REG (SImode, REGNO (dest));
19704 tmp = gen_rtx_SET (dest, const0_rtx);
19705
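  /* Clearing with xor clobbers the flags, so the zeroing SET is wrapped in a
     PARALLEL with a clobber of FLAGS_REG; a plain mov $0 leaves them alone.  */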
19706 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19707 {
19708 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19709 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19710 }
19711
19712 emit_insn (tmp);
19713 }
19714
19715 /* X is an unchanging MEM. If it is a constant pool reference, return
19716 the constant pool rtx, else NULL. */
19717
19718 rtx
19719 maybe_get_pool_constant (rtx x)
19720 {
19721 x = ix86_delegitimize_address (XEXP (x, 0));
19722
19723 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19724 return get_pool_constant (x);
19725
19726 return NULL_RTX;
19727 }
19728
19729 void
19730 ix86_expand_move (machine_mode mode, rtx operands[])
19731 {
19732 rtx op0, op1;
19733 rtx tmp, addend = NULL_RTX;
19734 enum tls_model model;
19735
19736 op0 = operands[0];
19737 op1 = operands[1];
19738
19739 switch (GET_CODE (op1))
19740 {
19741 case CONST:
19742 tmp = XEXP (op1, 0);
19743
19744 if (GET_CODE (tmp) != PLUS
19745 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19746 break;
19747
19748 op1 = XEXP (tmp, 0);
19749 addend = XEXP (tmp, 1);
19750 /* FALLTHRU */
19751
19752 case SYMBOL_REF:
19753 model = SYMBOL_REF_TLS_MODEL (op1);
19754
19755 if (model)
19756 op1 = legitimize_tls_address (op1, model, true);
19757 else if (ix86_force_load_from_GOT_p (op1))
19758 {
19759 /* Load the external function address via GOT slot to avoid PLT. */
19760 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19761 (TARGET_64BIT
19762 ? UNSPEC_GOTPCREL
19763 : UNSPEC_GOT));
19764 op1 = gen_rtx_CONST (Pmode, op1);
19765 op1 = gen_const_mem (Pmode, op1);
19766 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19767 }
19768 else
19769 {
19770 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19771 if (tmp)
19772 {
19773 op1 = tmp;
19774 if (!addend)
19775 break;
19776 }
19777 else
19778 {
19779 op1 = operands[1];
19780 break;
19781 }
19782 }
19783
19784 if (addend)
19785 {
19786 op1 = force_operand (op1, NULL_RTX);
19787 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19788 op0, 1, OPTAB_DIRECT);
19789 }
19790 else
19791 op1 = force_operand (op1, op0);
19792
19793 if (op1 == op0)
19794 return;
19795
19796 op1 = convert_to_mode (mode, op1, 1);
19797
19798 default:
19799 break;
19800 }
19801
19802 if ((flag_pic || MACHOPIC_INDIRECT)
19803 && symbolic_operand (op1, mode))
19804 {
19805 if (TARGET_MACHO && !TARGET_64BIT)
19806 {
19807 #if TARGET_MACHO
19808 /* dynamic-no-pic */
19809 if (MACHOPIC_INDIRECT)
19810 {
19811 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19812 ? op0 : gen_reg_rtx (Pmode);
19813 op1 = machopic_indirect_data_reference (op1, temp);
19814 if (MACHOPIC_PURE)
19815 op1 = machopic_legitimize_pic_address (op1, mode,
19816 temp == op1 ? 0 : temp);
19817 }
19818 if (op0 != op1 && GET_CODE (op0) != MEM)
19819 {
19820 rtx insn = gen_rtx_SET (op0, op1);
19821 emit_insn (insn);
19822 return;
19823 }
19824 if (GET_CODE (op0) == MEM)
19825 op1 = force_reg (Pmode, op1);
19826 else
19827 {
19828 rtx temp = op0;
19829 if (GET_CODE (temp) != REG)
19830 temp = gen_reg_rtx (Pmode);
19831 temp = legitimize_pic_address (op1, temp);
19832 if (temp == op0)
19833 return;
19834 op1 = temp;
19835 }
19836 /* dynamic-no-pic */
19837 #endif
19838 }
19839 else
19840 {
19841 if (MEM_P (op0))
19842 op1 = force_reg (mode, op1);
19843 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19844 {
19845 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19846 op1 = legitimize_pic_address (op1, reg);
19847 if (op0 == op1)
19848 return;
19849 op1 = convert_to_mode (mode, op1, 1);
19850 }
19851 }
19852 }
19853 else
19854 {
19855 if (MEM_P (op0)
19856 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19857 || !push_operand (op0, mode))
19858 && MEM_P (op1))
19859 op1 = force_reg (mode, op1);
19860
19861 if (push_operand (op0, mode)
19862 && ! general_no_elim_operand (op1, mode))
19863 op1 = copy_to_mode_reg (mode, op1);
19864
19865 /* Force large constants in 64-bit compilation into a register
19866 to get them CSEed.  */
19867 if (can_create_pseudo_p ()
19868 && (mode == DImode) && TARGET_64BIT
19869 && immediate_operand (op1, mode)
19870 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19871 && !register_operand (op0, mode)
19872 && optimize)
19873 op1 = copy_to_mode_reg (mode, op1);
19874
19875 if (can_create_pseudo_p ()
19876 && CONST_DOUBLE_P (op1))
19877 {
19878 /* If we are loading a floating point constant to a register,
19879 force the value to memory now, since we'll get better code
19880 out of the back end.  */
19881
19882 op1 = validize_mem (force_const_mem (mode, op1));
19883 if (!register_operand (op0, mode))
19884 {
19885 rtx temp = gen_reg_rtx (mode);
19886 emit_insn (gen_rtx_SET (temp, op1));
19887 emit_move_insn (op0, temp);
19888 return;
19889 }
19890 }
19891 }
19892
19893 emit_insn (gen_rtx_SET (op0, op1));
19894 }
19895
19896 void
19897 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19898 {
19899 rtx op0 = operands[0], op1 = operands[1];
19900 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19901 psABI since the biggest alignment is 4 bytes for the IA MCU psABI.  */
19902 unsigned int align = (TARGET_IAMCU
19903 ? GET_MODE_BITSIZE (mode)
19904 : GET_MODE_ALIGNMENT (mode));
19905
19906 if (push_operand (op0, VOIDmode))
19907 op0 = emit_move_resolve_push (mode, op0);
19908
19909 /* Force constants other than zero into memory.  We do not know how
19910 the instructions used to build constants modify the upper 64 bits
19911 of the register; once we have that information we may be able
19912 to handle some of them more efficiently.  */
19913 if (can_create_pseudo_p ()
19914 && (CONSTANT_P (op1)
19915 || (SUBREG_P (op1)
19916 && CONSTANT_P (SUBREG_REG (op1))))
19917 && ((register_operand (op0, mode)
19918 && !standard_sse_constant_p (op1, mode))
19919 /* ix86_expand_vector_move_misalign() does not like constants. */
19920 || (SSE_REG_MODE_P (mode)
19921 && MEM_P (op0)
19922 && MEM_ALIGN (op0) < align)))
19923 {
19924 if (SUBREG_P (op1))
19925 {
19926 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19927 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19928 if (r)
19929 r = validize_mem (r);
19930 else
19931 r = force_reg (imode, SUBREG_REG (op1));
19932 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19933 }
19934 else
19935 op1 = validize_mem (force_const_mem (mode, op1));
19936 }
19937
19938 /* We need to check memory alignment for SSE modes since an attribute
19939 can make the operands unaligned.  */
19940 if (can_create_pseudo_p ()
19941 && SSE_REG_MODE_P (mode)
19942 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19943 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19944 {
19945 rtx tmp[2];
19946
19947 /* ix86_expand_vector_move_misalign() does not like both
19948 arguments in memory. */
19949 if (!register_operand (op0, mode)
19950 && !register_operand (op1, mode))
19951 op1 = force_reg (mode, op1);
19952
19953 tmp[0] = op0; tmp[1] = op1;
19954 ix86_expand_vector_move_misalign (mode, tmp);
19955 return;
19956 }
19957
19958 /* Make operand1 a register if it isn't already. */
19959 if (can_create_pseudo_p ()
19960 && !register_operand (op0, mode)
19961 && !register_operand (op1, mode))
19962 {
19963 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19964 return;
19965 }
19966
19967 emit_insn (gen_rtx_SET (op0, op1));
19968 }
19969
19970 /* Split 32-byte AVX unaligned load and store if needed. */
19971
19972 static void
19973 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19974 {
19975 rtx m;
19976 rtx (*extract) (rtx, rtx, rtx);
19977 machine_mode mode;
19978
19979 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19980 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19981 {
19982 emit_insn (gen_rtx_SET (op0, op1));
19983 return;
19984 }
19985
19986 rtx orig_op0 = NULL_RTX;
19987 mode = GET_MODE (op0);
19988 switch (GET_MODE_CLASS (mode))
19989 {
19990 case MODE_VECTOR_INT:
19991 case MODE_INT:
19992 if (mode != V32QImode)
19993 {
19994 if (!MEM_P (op0))
19995 {
19996 orig_op0 = op0;
19997 op0 = gen_reg_rtx (V32QImode);
19998 }
19999 else
20000 op0 = gen_lowpart (V32QImode, op0);
20001 op1 = gen_lowpart (V32QImode, op1);
20002 mode = V32QImode;
20003 }
20004 break;
20005 case MODE_VECTOR_FLOAT:
20006 break;
20007 default:
20008 gcc_unreachable ();
20009 }
20010
20011 switch (mode)
20012 {
20013 default:
20014 gcc_unreachable ();
20015 case V32QImode:
20016 extract = gen_avx_vextractf128v32qi;
20017 mode = V16QImode;
20018 break;
20019 case V8SFmode:
20020 extract = gen_avx_vextractf128v8sf;
20021 mode = V4SFmode;
20022 break;
20023 case V4DFmode:
20024 extract = gen_avx_vextractf128v4df;
20025 mode = V2DFmode;
20026 break;
20027 }
20028
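  /* An unaligned 256-bit load is rebuilt as a 128-bit load of the low half
     followed by a VEC_CONCAT with the high half still in memory (normally
     emitted as vinsertf128); an unaligned store becomes two vextractf128
     stores of the two 128-bit halves.  */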
20029 if (MEM_P (op1))
20030 {
20031 rtx r = gen_reg_rtx (mode);
20032 m = adjust_address (op1, mode, 0);
20033 emit_move_insn (r, m);
20034 m = adjust_address (op1, mode, 16);
20035 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20036 emit_move_insn (op0, r);
20037 }
20038 else if (MEM_P (op0))
20039 {
20040 m = adjust_address (op0, mode, 0);
20041 emit_insn (extract (m, op1, const0_rtx));
20042 m = adjust_address (op0, mode, 16);
20043 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20044 }
20045 else
20046 gcc_unreachable ();
20047
20048 if (orig_op0)
20049 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20050 }
20051
20052 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20053 straight to ix86_expand_vector_move. */
20054 /* Code generation for scalar reg-reg moves of single and double precision data:
20055 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20056 movaps reg, reg
20057 else
20058 movss reg, reg
20059 if (x86_sse_partial_reg_dependency == true)
20060 movapd reg, reg
20061 else
20062 movsd reg, reg
20063
20064 Code generation for scalar loads of double precision data:
20065 if (x86_sse_split_regs == true)
20066 movlpd mem, reg (gas syntax)
20067 else
20068 movsd mem, reg
20069
20070 Code generation for unaligned packed loads of single precision data
20071 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20072 if (x86_sse_unaligned_move_optimal)
20073 movups mem, reg
20074
20075 if (x86_sse_partial_reg_dependency == true)
20076 {
20077 xorps reg, reg
20078 movlps mem, reg
20079 movhps mem+8, reg
20080 }
20081 else
20082 {
20083 movlps mem, reg
20084 movhps mem+8, reg
20085 }
20086
20087 Code generation for unaligned packed loads of double precision data
20088 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20089 if (x86_sse_unaligned_move_optimal)
20090 movupd mem, reg
20091
20092 if (x86_sse_split_regs == true)
20093 {
20094 movlpd mem, reg
20095 movhpd mem+8, reg
20096 }
20097 else
20098 {
20099 movsd mem, reg
20100 movhpd mem+8, reg
20101 }
20102 */
20103
20104 void
20105 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20106 {
20107 rtx op0, op1, m;
20108
20109 op0 = operands[0];
20110 op1 = operands[1];
20111
20112 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20113 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20114 {
20115 emit_insn (gen_rtx_SET (op0, op1));
20116 return;
20117 }
20118
20119 if (TARGET_AVX)
20120 {
20121 if (GET_MODE_SIZE (mode) == 32)
20122 ix86_avx256_split_vector_move_misalign (op0, op1);
20123 else
20124 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20125 emit_insn (gen_rtx_SET (op0, op1));
20126 return;
20127 }
20128
20129 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20130 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20131 {
20132 emit_insn (gen_rtx_SET (op0, op1));
20133 return;
20134 }
20135
20136 /* ??? If we have typed data, then it would appear that using
20137 movdqu is the only way to get unaligned data loaded with
20138 integer type. */
20139 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20140 {
20141 emit_insn (gen_rtx_SET (op0, op1));
20142 return;
20143 }
20144
20145 if (MEM_P (op1))
20146 {
20147 if (TARGET_SSE2 && mode == V2DFmode)
20148 {
20149 rtx zero;
20150
20151 /* When SSE registers are split into halves, we can avoid
20152 writing to the top half twice. */
20153 if (TARGET_SSE_SPLIT_REGS)
20154 {
20155 emit_clobber (op0);
20156 zero = op0;
20157 }
20158 else
20159 {
20160 /* ??? Not sure about the best option for the Intel chips.
20161 The following would seem to satisfy; the register is
20162 entirely cleared, breaking the dependency chain. We
20163 then store to the upper half, with a dependency depth
20164 of one. A rumor has it that Intel recommends two movsd
20165 followed by an unpacklpd, but this is unconfirmed. And
20166 given that the dependency depth of the unpacklpd would
20167 still be one, I'm not sure why this would be better. */
20168 zero = CONST0_RTX (V2DFmode);
20169 }
20170
20171 m = adjust_address (op1, DFmode, 0);
20172 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20173 m = adjust_address (op1, DFmode, 8);
20174 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20175 }
20176 else
20177 {
20178 rtx t;
20179
20180 if (mode != V4SFmode)
20181 t = gen_reg_rtx (V4SFmode);
20182 else
20183 t = op0;
20184
20185 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20186 emit_move_insn (t, CONST0_RTX (V4SFmode));
20187 else
20188 emit_clobber (t);
20189
20190 m = adjust_address (op1, V2SFmode, 0);
20191 emit_insn (gen_sse_loadlps (t, t, m));
20192 m = adjust_address (op1, V2SFmode, 8);
20193 emit_insn (gen_sse_loadhps (t, t, m));
20194 if (mode != V4SFmode)
20195 emit_move_insn (op0, gen_lowpart (mode, t));
20196 }
20197 }
20198 else if (MEM_P (op0))
20199 {
20200 if (TARGET_SSE2 && mode == V2DFmode)
20201 {
20202 m = adjust_address (op0, DFmode, 0);
20203 emit_insn (gen_sse2_storelpd (m, op1));
20204 m = adjust_address (op0, DFmode, 8);
20205 emit_insn (gen_sse2_storehpd (m, op1));
20206 }
20207 else
20208 {
20209 if (mode != V4SFmode)
20210 op1 = gen_lowpart (V4SFmode, op1);
20211
20212 m = adjust_address (op0, V2SFmode, 0);
20213 emit_insn (gen_sse_storelps (m, op1));
20214 m = adjust_address (op0, V2SFmode, 8);
20215 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20216 }
20217 }
20218 else
20219 gcc_unreachable ();
20220 }
20221
20222 /* Helper function of ix86_fixup_binary_operands to canonicalize
20223 operand order. Returns true if the operands should be swapped. */
20224
20225 static bool
20226 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20227 rtx operands[])
20228 {
20229 rtx dst = operands[0];
20230 rtx src1 = operands[1];
20231 rtx src2 = operands[2];
20232
20233 /* If the operation is not commutative, we can't do anything. */
20234 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20235 return false;
20236
20237 /* Highest priority is that src1 should match dst. */
20238 if (rtx_equal_p (dst, src1))
20239 return false;
20240 if (rtx_equal_p (dst, src2))
20241 return true;
20242
20243 /* Next highest priority is that immediate constants come second. */
20244 if (immediate_operand (src2, mode))
20245 return false;
20246 if (immediate_operand (src1, mode))
20247 return true;
20248
20249 /* Lowest priority is that memory references should come second. */
20250 if (MEM_P (src2))
20251 return false;
20252 if (MEM_P (src1))
20253 return true;
20254
20255 return false;
20256 }
20257
20258
20259 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20260 destination to use for the operation. If different from the true
20261 destination in operands[0], a copy operation will be required. */
20262
20263 rtx
20264 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20265 rtx operands[])
20266 {
20267 rtx dst = operands[0];
20268 rtx src1 = operands[1];
20269 rtx src2 = operands[2];
20270
20271 /* Canonicalize operand order. */
20272 if (ix86_swap_binary_operands_p (code, mode, operands))
20273 {
20274 /* It is invalid to swap operands of different modes. */
20275 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20276
20277 std::swap (src1, src2);
20278 }
20279
20280 /* Both source operands cannot be in memory. */
20281 if (MEM_P (src1) && MEM_P (src2))
20282 {
20283 /* Optimization: Only read from memory once. */
20284 if (rtx_equal_p (src1, src2))
20285 {
20286 src2 = force_reg (mode, src2);
20287 src1 = src2;
20288 }
20289 else if (rtx_equal_p (dst, src1))
20290 src2 = force_reg (mode, src2);
20291 else
20292 src1 = force_reg (mode, src1);
20293 }
20294
20295 /* If the destination is memory, and we do not have matching source
20296 operands, do things in registers. */
20297 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20298 dst = gen_reg_rtx (mode);
20299
20300 /* Source 1 cannot be a constant. */
20301 if (CONSTANT_P (src1))
20302 src1 = force_reg (mode, src1);
20303
20304 /* Source 1 cannot be a non-matching memory. */
20305 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20306 src1 = force_reg (mode, src1);
20307
20308 /* Improve address combine. */
20309 if (code == PLUS
20310 && GET_MODE_CLASS (mode) == MODE_INT
20311 && MEM_P (src2))
20312 src2 = force_reg (mode, src2);
20313
20314 operands[1] = src1;
20315 operands[2] = src2;
20316 return dst;
20317 }
20318
20319 /* Similarly, but assume that the destination has already been
20320 set up properly. */
20321
20322 void
20323 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20324 machine_mode mode, rtx operands[])
20325 {
20326 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20327 gcc_assert (dst == operands[0]);
20328 }
20329
20330 /* Attempt to expand a binary operator.  Make the expansion closer to the
20331 actual machine than just general_operand, which would allow 3 separate
20332 memory references (one output, two input) in a single insn.  */
20333
20334 void
20335 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20336 rtx operands[])
20337 {
20338 rtx src1, src2, dst, op, clob;
20339
20340 dst = ix86_fixup_binary_operands (code, mode, operands);
20341 src1 = operands[1];
20342 src2 = operands[2];
20343
20344 /* Emit the instruction. */
20345
20346 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20347
20348 if (reload_completed
20349 && code == PLUS
20350 && !rtx_equal_p (dst, src1))
20351 {
20352 /* This is going to be an LEA; avoid splitting it later. */
20353 emit_insn (op);
20354 }
20355 else
20356 {
20357 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20358 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20359 }
20360
20361 /* Fix up the destination if needed. */
20362 if (dst != operands[0])
20363 emit_move_insn (operands[0], dst);
20364 }
20365
20366 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20367 the given OPERANDS. */
20368
20369 void
20370 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20371 rtx operands[])
20372 {
20373 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20374 if (SUBREG_P (operands[1]))
20375 {
20376 op1 = operands[1];
20377 op2 = operands[2];
20378 }
20379 else if (SUBREG_P (operands[2]))
20380 {
20381 op1 = operands[2];
20382 op2 = operands[1];
20383 }
20384 /* Optimize (__m128i) d | (__m128i) e and similar code
20385 when d and e are float vectors into float vector logical
20386 insn. In C/C++ without using intrinsics there is no other way
20387 to express vector logical operation on float vectors than
20388 to cast them temporarily to integer vectors. */
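  /* For example, (__m128i) x | (__m128i) y with x and y of type __m128 is
     emitted here as an IOR in the float vector mode (typically orps rather
     than por), keeping the value in the FP domain.  */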
20389 if (op1
20390 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20391 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20392 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20393 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20394 && SUBREG_BYTE (op1) == 0
20395 && (GET_CODE (op2) == CONST_VECTOR
20396 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20397 && SUBREG_BYTE (op2) == 0))
20398 && can_create_pseudo_p ())
20399 {
20400 rtx dst;
20401 switch (GET_MODE (SUBREG_REG (op1)))
20402 {
20403 case V4SFmode:
20404 case V8SFmode:
20405 case V16SFmode:
20406 case V2DFmode:
20407 case V4DFmode:
20408 case V8DFmode:
20409 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20410 if (GET_CODE (op2) == CONST_VECTOR)
20411 {
20412 op2 = gen_lowpart (GET_MODE (dst), op2);
20413 op2 = force_reg (GET_MODE (dst), op2);
20414 }
20415 else
20416 {
20417 op1 = operands[1];
20418 op2 = SUBREG_REG (operands[2]);
20419 if (!vector_operand (op2, GET_MODE (dst)))
20420 op2 = force_reg (GET_MODE (dst), op2);
20421 }
20422 op1 = SUBREG_REG (op1);
20423 if (!vector_operand (op1, GET_MODE (dst)))
20424 op1 = force_reg (GET_MODE (dst), op1);
20425 emit_insn (gen_rtx_SET (dst,
20426 gen_rtx_fmt_ee (code, GET_MODE (dst),
20427 op1, op2)));
20428 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20429 return;
20430 default:
20431 break;
20432 }
20433 }
20434 if (!vector_operand (operands[1], mode))
20435 operands[1] = force_reg (mode, operands[1]);
20436 if (!vector_operand (operands[2], mode))
20437 operands[2] = force_reg (mode, operands[2]);
20438 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20439 emit_insn (gen_rtx_SET (operands[0],
20440 gen_rtx_fmt_ee (code, mode, operands[1],
20441 operands[2])));
20442 }
20443
20444 /* Return TRUE or FALSE depending on whether the binary operator meets the
20445 appropriate constraints. */
20446
20447 bool
20448 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20449 rtx operands[3])
20450 {
20451 rtx dst = operands[0];
20452 rtx src1 = operands[1];
20453 rtx src2 = operands[2];
20454
20455 /* Both source operands cannot be in memory. */
20456 if (MEM_P (src1) && MEM_P (src2))
20457 return false;
20458
20459 /* Canonicalize operand order for commutative operators. */
20460 if (ix86_swap_binary_operands_p (code, mode, operands))
20461 std::swap (src1, src2);
20462
20463 /* If the destination is memory, we must have a matching source operand. */
20464 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20465 return false;
20466
20467 /* Source 1 cannot be a constant. */
20468 if (CONSTANT_P (src1))
20469 return false;
20470
20471 /* Source 1 cannot be a non-matching memory. */
20472 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20473 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20474 return (code == AND
20475 && (mode == HImode
20476 || mode == SImode
20477 || (TARGET_64BIT && mode == DImode))
20478 && satisfies_constraint_L (src2));
20479
20480 return true;
20481 }
20482
20483 /* Attempt to expand a unary operator.  Make the expansion closer to the
20484 actual machine than just general_operand, which would allow 2 separate
20485 memory references (one output, one input) in a single insn.  */
20486
20487 void
20488 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20489 rtx operands[])
20490 {
20491 bool matching_memory = false;
20492 rtx src, dst, op, clob;
20493
20494 dst = operands[0];
20495 src = operands[1];
20496
20497 /* If the destination is memory, and we do not have matching source
20498 operands, do things in registers. */
20499 if (MEM_P (dst))
20500 {
20501 if (rtx_equal_p (dst, src))
20502 matching_memory = true;
20503 else
20504 dst = gen_reg_rtx (mode);
20505 }
20506
20507 /* When source operand is memory, destination must match. */
20508 if (MEM_P (src) && !matching_memory)
20509 src = force_reg (mode, src);
20510
20511 /* Emit the instruction. */
20512
20513 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20514
20515 if (code == NOT)
20516 emit_insn (op);
20517 else
20518 {
20519 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20520 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20521 }
20522
20523 /* Fix up the destination if needed. */
20524 if (dst != operands[0])
20525 emit_move_insn (operands[0], dst);
20526 }
20527
20528 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20529 divisor are within the range [0-255]. */
20530
20531 void
20532 ix86_split_idivmod (machine_mode mode, rtx operands[],
20533 bool signed_p)
20534 {
20535 rtx_code_label *end_label, *qimode_label;
20536 rtx div, mod;
20537 rtx_insn *insn;
20538 rtx scratch, tmp0, tmp1, tmp2;
20539 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20540 rtx (*gen_zero_extend) (rtx, rtx);
20541 rtx (*gen_test_ccno_1) (rtx, rtx);
20542
20543 switch (mode)
20544 {
20545 case SImode:
20546 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20547 gen_test_ccno_1 = gen_testsi_ccno_1;
20548 gen_zero_extend = gen_zero_extendqisi2;
20549 break;
20550 case DImode:
20551 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20552 gen_test_ccno_1 = gen_testdi_ccno_1;
20553 gen_zero_extend = gen_zero_extendqidi2;
20554 break;
20555 default:
20556 gcc_unreachable ();
20557 }
20558
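  /* The generated code has roughly this shape (quotient ends up in
     operands[0], remainder in operands[1]):

	 if ((dividend | divisor) & ~0xff)
	   full-width [u]divmod;
	 else
	   8-bit unsigned divide (AL = quotient, AH = remainder);  */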
20559 end_label = gen_label_rtx ();
20560 qimode_label = gen_label_rtx ();
20561
20562 scratch = gen_reg_rtx (mode);
20563
20564 /* Use 8bit unsigned divmod if dividend and divisor are within
20565 the range [0-255].  */
20566 emit_move_insn (scratch, operands[2]);
20567 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20568 scratch, 1, OPTAB_DIRECT);
20569 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20570 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20571 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20572 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20573 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20574 pc_rtx);
20575 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20576 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20577 JUMP_LABEL (insn) = qimode_label;
20578
20579 /* Generate original signed/unsigned divmod.  */
20580 div = gen_divmod4_1 (operands[0], operands[1],
20581 operands[2], operands[3]);
20582 emit_insn (div);
20583
20584 /* Branch to the end. */
20585 emit_jump_insn (gen_jump (end_label));
20586 emit_barrier ();
20587
20588 /* Generate 8bit unsigned divide. */
20589 emit_label (qimode_label);
20590 /* Don't use operands[0] for result of 8bit divide since not all
20591 registers support QImode ZERO_EXTRACT. */
20592 tmp0 = lowpart_subreg (HImode, scratch, mode);
20593 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20594 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20595 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20596
20597 if (signed_p)
20598 {
20599 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20600 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20601 }
20602 else
20603 {
20604 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20605 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20606 }
20607
20608 /* Extract remainder from AH. */
20609 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20610 if (REG_P (operands[1]))
20611 insn = emit_move_insn (operands[1], tmp1);
20612 else
20613 {
20614 /* Need a new scratch register since the old one has result
20615 of 8bit divide. */
20616 scratch = gen_reg_rtx (mode);
20617 emit_move_insn (scratch, tmp1);
20618 insn = emit_move_insn (operands[1], scratch);
20619 }
20620 set_unique_reg_note (insn, REG_EQUAL, mod);
20621
20622 /* Zero extend quotient from AL. */
20623 tmp1 = gen_lowpart (QImode, tmp0);
20624 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20625 set_unique_reg_note (insn, REG_EQUAL, div);
20626
20627 emit_label (end_label);
20628 }
20629
20630 #define LEA_MAX_STALL (3)
20631 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20632
20633 /* Increase given DISTANCE in half-cycles according to
20634 dependencies between PREV and NEXT instructions.
20635 Add 1 half-cycle if there is no dependency and
20636 go to the next cycle if there is some dependency.  */
20637
20638 static unsigned int
20639 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20640 {
20641 df_ref def, use;
20642
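  /* "distance + (distance & 1) + 2" rounds DISTANCE up to the next cycle
     boundary and charges one more full cycle; it is used when one of the
     insns is unknown or when NEXT depends on a register defined by PREV.
     Otherwise only a single half-cycle is added.  */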
20643 if (!prev || !next)
20644 return distance + (distance & 1) + 2;
20645
20646 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20647 return distance + 1;
20648
20649 FOR_EACH_INSN_USE (use, next)
20650 FOR_EACH_INSN_DEF (def, prev)
20651 if (!DF_REF_IS_ARTIFICIAL (def)
20652 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20653 return distance + (distance & 1) + 2;
20654
20655 return distance + 1;
20656 }
20657
20658 /* Function checks if instruction INSN defines register number
20659 REGNO1 or REGNO2. */
20660
20661 static bool
20662 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20663 rtx_insn *insn)
20664 {
20665 df_ref def;
20666
20667 FOR_EACH_INSN_DEF (def, insn)
20668 if (DF_REF_REG_DEF_P (def)
20669 && !DF_REF_IS_ARTIFICIAL (def)
20670 && (regno1 == DF_REF_REGNO (def)
20671 || regno2 == DF_REF_REGNO (def)))
20672 return true;
20673
20674 return false;
20675 }
20676
20677 /* Function checks if instruction INSN uses register number
20678 REGNO as a part of address expression. */
20679
20680 static bool
20681 insn_uses_reg_mem (unsigned int regno, rtx insn)
20682 {
20683 df_ref use;
20684
20685 FOR_EACH_INSN_USE (use, insn)
20686 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20687 return true;
20688
20689 return false;
20690 }
20691
20692 /* Search backward for non-agu definition of register number REGNO1
20693 or register number REGNO2 in basic block starting from instruction
20694 START up to head of basic block or instruction INSN.
20695
20696 Function puts true value into *FOUND var if definition was found
20697 and false otherwise.
20698
20699 Distance in half-cycles between START and found instruction or head
20700 of BB is added to DISTANCE and returned. */
20701
20702 static int
20703 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20704 rtx_insn *insn, int distance,
20705 rtx_insn *start, bool *found)
20706 {
20707 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20708 rtx_insn *prev = start;
20709 rtx_insn *next = NULL;
20710
20711 *found = false;
20712
20713 while (prev
20714 && prev != insn
20715 && distance < LEA_SEARCH_THRESHOLD)
20716 {
20717 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20718 {
20719 distance = increase_distance (prev, next, distance);
20720 if (insn_defines_reg (regno1, regno2, prev))
20721 {
20722 if (recog_memoized (prev) < 0
20723 || get_attr_type (prev) != TYPE_LEA)
20724 {
20725 *found = true;
20726 return distance;
20727 }
20728 }
20729
20730 next = prev;
20731 }
20732 if (prev == BB_HEAD (bb))
20733 break;
20734
20735 prev = PREV_INSN (prev);
20736 }
20737
20738 return distance;
20739 }
20740
20741 /* Search backward for non-agu definition of register number REGNO1
20742 or register number REGNO2 in INSN's basic block until
20743 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20744 2. Reach neighbor BBs boundary, or
20745 3. Reach agu definition.
20746 Returns the distance between the non-agu definition point and INSN.
20747 If no definition point, returns -1. */
20748
20749 static int
20750 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20751 rtx_insn *insn)
20752 {
20753 basic_block bb = BLOCK_FOR_INSN (insn);
20754 int distance = 0;
20755 bool found = false;
20756
20757 if (insn != BB_HEAD (bb))
20758 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20759 distance, PREV_INSN (insn),
20760 &found);
20761
20762 if (!found && distance < LEA_SEARCH_THRESHOLD)
20763 {
20764 edge e;
20765 edge_iterator ei;
20766 bool simple_loop = false;
20767
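      /* If the block is its own predecessor (a one-block loop), continue the
	 backward search from the end of the block; otherwise scan every
	 predecessor and keep the shortest distance found.  */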
20768 FOR_EACH_EDGE (e, ei, bb->preds)
20769 if (e->src == bb)
20770 {
20771 simple_loop = true;
20772 break;
20773 }
20774
20775 if (simple_loop)
20776 distance = distance_non_agu_define_in_bb (regno1, regno2,
20777 insn, distance,
20778 BB_END (bb), &found);
20779 else
20780 {
20781 int shortest_dist = -1;
20782 bool found_in_bb = false;
20783
20784 FOR_EACH_EDGE (e, ei, bb->preds)
20785 {
20786 int bb_dist
20787 = distance_non_agu_define_in_bb (regno1, regno2,
20788 insn, distance,
20789 BB_END (e->src),
20790 &found_in_bb);
20791 if (found_in_bb)
20792 {
20793 if (shortest_dist < 0)
20794 shortest_dist = bb_dist;
20795 else if (bb_dist > 0)
20796 shortest_dist = MIN (bb_dist, shortest_dist);
20797
20798 found = true;
20799 }
20800 }
20801
20802 distance = shortest_dist;
20803 }
20804 }
20805
20806 /* get_attr_type may modify recog data. We want to make sure
20807 that recog data is valid for instruction INSN, on which
20808 distance_non_agu_define is called. INSN is unchanged here. */
20809 extract_insn_cached (insn);
20810
20811 if (!found)
20812 return -1;
20813
20814 return distance >> 1;
20815 }
20816
20817 /* Return the distance in half-cycles between INSN and the next
20818 insn that uses register number REGNO in a memory address, added
20819 to DISTANCE.  Return -1 if REGNO is set first.
20820
20821 Put true value into *FOUND if register usage was found and
20822 false otherwise.
20823 Put true value into *REDEFINED if register redefinition was
20824 found and false otherwise. */
20825
20826 static int
20827 distance_agu_use_in_bb (unsigned int regno,
20828 rtx_insn *insn, int distance, rtx_insn *start,
20829 bool *found, bool *redefined)
20830 {
20831 basic_block bb = NULL;
20832 rtx_insn *next = start;
20833 rtx_insn *prev = NULL;
20834
20835 *found = false;
20836 *redefined = false;
20837
20838 if (start != NULL_RTX)
20839 {
20840 bb = BLOCK_FOR_INSN (start);
20841 if (start != BB_HEAD (bb))
20842 /* If insn and start belong to the same bb, set prev to insn,
20843 so the call to increase_distance will increase the distance
20844 between insns by 1. */
20845 prev = insn;
20846 }
20847
20848 while (next
20849 && next != insn
20850 && distance < LEA_SEARCH_THRESHOLD)
20851 {
20852 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20853 {
20854 distance = increase_distance(prev, next, distance);
20855 if (insn_uses_reg_mem (regno, next))
20856 {
20857 /* Return DISTANCE if REGNO is used in a memory
20858 address in NEXT.  */
20859 *found = true;
20860 return distance;
20861 }
20862
20863 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20864 {
20865 /* Return -1 if REGNO is set in NEXT.  */
20866 *redefined = true;
20867 return -1;
20868 }
20869
20870 prev = next;
20871 }
20872
20873 if (next == BB_END (bb))
20874 break;
20875
20876 next = NEXT_INSN (next);
20877 }
20878
20879 return distance;
20880 }
20881
20882 /* Return the distance between INSN and the next insn that uses
20883 register number REGNO0 in a memory address.  Return -1 if no such
20884 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
20885
20886 static int
20887 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20888 {
20889 basic_block bb = BLOCK_FOR_INSN (insn);
20890 int distance = 0;
20891 bool found = false;
20892 bool redefined = false;
20893
20894 if (insn != BB_END (bb))
20895 distance = distance_agu_use_in_bb (regno0, insn, distance,
20896 NEXT_INSN (insn),
20897 &found, &redefined);
20898
20899 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20900 {
20901 edge e;
20902 edge_iterator ei;
20903 bool simple_loop = false;
20904
20905 FOR_EACH_EDGE (e, ei, bb->succs)
20906 if (e->dest == bb)
20907 {
20908 simple_loop = true;
20909 break;
20910 }
20911
20912 if (simple_loop)
20913 distance = distance_agu_use_in_bb (regno0, insn,
20914 distance, BB_HEAD (bb),
20915 &found, &redefined);
20916 else
20917 {
20918 int shortest_dist = -1;
20919 bool found_in_bb = false;
20920 bool redefined_in_bb = false;
20921
20922 FOR_EACH_EDGE (e, ei, bb->succs)
20923 {
20924 int bb_dist
20925 = distance_agu_use_in_bb (regno0, insn,
20926 distance, BB_HEAD (e->dest),
20927 &found_in_bb, &redefined_in_bb);
20928 if (found_in_bb)
20929 {
20930 if (shortest_dist < 0)
20931 shortest_dist = bb_dist;
20932 else if (bb_dist > 0)
20933 shortest_dist = MIN (bb_dist, shortest_dist);
20934
20935 found = true;
20936 }
20937 }
20938
20939 distance = shortest_dist;
20940 }
20941 }
20942
20943 if (!found || redefined)
20944 return -1;
20945
20946 return distance >> 1;
20947 }
20948
20949 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20950 there is a choice between LEA and ADD:
20951 Negative value: ADD is preferred over LEA
20952 Zero: Neutral
20953 Positive value: LEA is preferred over ADD.  */
20954 #define IX86_LEA_PRIORITY 0
20955
20956 /* Return true if using lea INSN has a performance advantage
20957 over a sequence of instructions.  The instruction sequence has
20958 SPLIT_COST cycles higher latency than the lea latency.  */
20959
20960 static bool
20961 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20962 unsigned int regno2, int split_cost, bool has_scale)
20963 {
20964 int dist_define, dist_use;
20965
20966 /* For Silvermont, the use of LEA is justified if it is a 2-source or
20967 3-source LEA used for a non-destructive destination, or if the
20968 scaled index is needed.  */
20969 if (TARGET_SILVERMONT || TARGET_INTEL)
20970 {
20971 if (has_scale)
20972 return true;
20973 if (split_cost < 1)
20974 return false;
20975 if (regno0 == regno1 || regno0 == regno2)
20976 return false;
20977 return true;
20978 }
20979
20980 dist_define = distance_non_agu_define (regno1, regno2, insn);
20981 dist_use = distance_agu_use (regno0, insn);
20982
20983 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20984 {
20985 /* If there is no non-AGU operand definition, no AGU
20986 operand usage and the split cost is 0, then both the lea
20987 and non-lea variants have the same priority.  Currently
20988 we prefer lea for 64-bit code and non-lea for 32-bit
20989 code.  */
20990 if (dist_use < 0 && split_cost == 0)
20991 return TARGET_64BIT || IX86_LEA_PRIORITY;
20992 else
20993 return true;
20994 }
20995
20996 /* The longer the distance to the definition, the more preferable lea is.
20997 Here we adjust it to take into account the splitting cost and the
20998 lea priority.  */
20999 dist_define += split_cost + IX86_LEA_PRIORITY;
21000
21001 /* If there is no use in a memory address then we just check
21002 that the split cost exceeds the AGU stall.  */
21003 if (dist_use < 0)
21004 return dist_define > LEA_MAX_STALL;
21005
21006 /* If this insn has both backward non-agu dependence and forward
21007 agu dependence, the one with short distance takes effect. */
21008 return dist_define >= dist_use;
21009 }
21010
21011 /* Return true if it is legal to clobber flags by INSN and
21012 false otherwise. */
21013
21014 static bool
21015 ix86_ok_to_clobber_flags (rtx_insn *insn)
21016 {
21017 basic_block bb = BLOCK_FOR_INSN (insn);
21018 df_ref use;
21019 bitmap live;
21020
21021 while (insn)
21022 {
21023 if (NONDEBUG_INSN_P (insn))
21024 {
21025 FOR_EACH_INSN_USE (use, insn)
21026 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21027 return false;
21028
21029 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21030 return true;
21031 }
21032
21033 if (insn == BB_END (bb))
21034 break;
21035
21036 insn = NEXT_INSN (insn);
21037 }
21038
21039 live = df_get_live_out(bb);
21040 return !REGNO_REG_SET_P (live, FLAGS_REG);
21041 }
21042
21043 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21044 move and add to avoid AGU stalls. */
21045
21046 bool
21047 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21048 {
21049 unsigned int regno0, regno1, regno2;
21050
21051 /* Check if we need to optimize. */
21052 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21053 return false;
21054
21055 /* Check it is correct to split here. */
21056 if (!ix86_ok_to_clobber_flags(insn))
21057 return false;
21058
21059 regno0 = true_regnum (operands[0]);
21060 regno1 = true_regnum (operands[1]);
21061 regno2 = true_regnum (operands[2]);
21062
21063 /* We need to split only adds with a non-destructive
21064 destination operand.  */
21065 if (regno0 == regno1 || regno0 == regno2)
21066 return false;
21067 else
21068 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21069 }
21070
21071 /* Return true if we should emit lea instruction instead of mov
21072 instruction. */
21073
21074 bool
21075 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21076 {
21077 unsigned int regno0, regno1;
21078
21079 /* Check if we need to optimize. */
21080 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21081 return false;
21082
21083 /* Use lea for reg to reg moves only. */
21084 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21085 return false;
21086
21087 regno0 = true_regnum (operands[0]);
21088 regno1 = true_regnum (operands[1]);
21089
21090 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21091 }
21092
21093 /* Return true if we need to split lea into a sequence of
21094 instructions to avoid AGU stalls. */
21095
21096 bool
21097 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21098 {
21099 unsigned int regno0, regno1, regno2;
21100 int split_cost;
21101 struct ix86_address parts;
21102 int ok;
21103
21104 /* Check we need to optimize. */
21105 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21106 return false;
21107
21108 /* The "at least two components" test below might not catch simple
21109 move or zero extension insns if parts.base is non-NULL and parts.disp
21110 is const0_rtx as the only components in the address, e.g. if the
21111 register is %rbp or %r13. As this test is much cheaper and moves or
21112 zero extensions are the common case, do this check first. */
21113 if (REG_P (operands[1])
21114 || (SImode_address_operand (operands[1], VOIDmode)
21115 && REG_P (XEXP (operands[1], 0))))
21116 return false;
21117
21118 /* Check if it is OK to split here. */
21119 if (!ix86_ok_to_clobber_flags (insn))
21120 return false;
21121
21122 ok = ix86_decompose_address (operands[1], &parts);
21123 gcc_assert (ok);
21124
21125 /* There should be at least two components in the address. */
21126 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21127 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21128 return false;
21129
21130 /* We should not split into add if a non-legitimate pic
21131 operand is used as the displacement.  */
21132 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21133 return false;
21134
21135 regno0 = true_regnum (operands[0]);
21136 regno1 = INVALID_REGNUM;
21137 regno2 = INVALID_REGNUM;
21138
21139 if (parts.base)
21140 regno1 = true_regnum (parts.base);
21141 if (parts.index)
21142 regno2 = true_regnum (parts.index);
21143
21144 split_cost = 0;
21145
21146 /* Compute how many cycles we will add to the execution time
21147 if we split the lea into a sequence of instructions.  */
21148 if (parts.base || parts.index)
21149 {
21150 /* Have to use a mov instruction if the non-destructive
21151 destination form is used.  */
21152 if (regno1 != regno0 && regno2 != regno0)
21153 split_cost += 1;
21154
21155 /* Have to add index to base if both exist. */
21156 if (parts.base && parts.index)
21157 split_cost += 1;
21158
21159 /* Have to use shift and adds if scale is 2 or greater. */
21160 if (parts.scale > 1)
21161 {
21162 if (regno0 != regno1)
21163 split_cost += 1;
21164 else if (regno2 == regno0)
21165 split_cost += 4;
21166 else
21167 split_cost += parts.scale;
21168 }
21169
21170 /* Have to use add instruction with immediate if
21171 disp is non zero. */
21172 if (parts.disp && parts.disp != const0_rtx)
21173 split_cost += 1;
21174
21175 /* Subtract the price of lea. */
21176 split_cost -= 1;
21177 }
21178
21179 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21180 parts.scale > 1);
21181 }
21182
21183 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21184 matches destination. RTX includes clobber of FLAGS_REG. */
21185
21186 static void
21187 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21188 rtx dst, rtx src)
21189 {
21190 rtx op, clob;
21191
21192 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21193 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21194
21195 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21196 }
21197
21198 /* Return true if the definition of REGNO1 is closer to INSN than that of REGNO2.  */
21199
21200 static bool
21201 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21202 {
21203 rtx_insn *prev = insn;
21204 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21205
21206 if (insn == start)
21207 return false;
21208 while (prev && prev != start)
21209 {
21210 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21211 {
21212 prev = PREV_INSN (prev);
21213 continue;
21214 }
21215 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21216 return true;
21217 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21218 return false;
21219 prev = PREV_INSN (prev);
21220 }
21221
21222 /* None of the regs is defined in the bb. */
21223 return false;
21224 }
21225
21226 /* Split lea instructions into a sequence of instructions
21227 which are executed on the ALU to avoid AGU stalls.
21228 It is assumed that it is allowed to clobber the flags register
21229 at the lea position.  */
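/* For example, when neither the base nor the index matches the destination,
   lea 0x4(%ebx,%ecx,2), %eax is split into roughly

	mov  %ecx, %eax
	shl  $1, %eax
	add  %ebx, %eax
	add  $4, %eax  */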
21230
21231 void
21232 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21233 {
21234 unsigned int regno0, regno1, regno2;
21235 struct ix86_address parts;
21236 rtx target, tmp;
21237 int ok, adds;
21238
21239 ok = ix86_decompose_address (operands[1], &parts);
21240 gcc_assert (ok);
21241
21242 target = gen_lowpart (mode, operands[0]);
21243
21244 regno0 = true_regnum (target);
21245 regno1 = INVALID_REGNUM;
21246 regno2 = INVALID_REGNUM;
21247
21248 if (parts.base)
21249 {
21250 parts.base = gen_lowpart (mode, parts.base);
21251 regno1 = true_regnum (parts.base);
21252 }
21253
21254 if (parts.index)
21255 {
21256 parts.index = gen_lowpart (mode, parts.index);
21257 regno2 = true_regnum (parts.index);
21258 }
21259
21260 if (parts.disp)
21261 parts.disp = gen_lowpart (mode, parts.disp);
21262
21263 if (parts.scale > 1)
21264 {
21265 /* Case r1 = r1 + ... */
21266 if (regno1 == regno0)
21267 {
21268 /* If we have a case r1 = r1 + C * r2 then we
21269 would have to use multiplication, which is very
21270 expensive.  Assume the cost model is wrong if we
21271 have such a case here.  */
21272 gcc_assert (regno2 != regno0);
21273
21274 for (adds = parts.scale; adds > 0; adds--)
21275 ix86_emit_binop (PLUS, mode, target, parts.index);
21276 }
21277 else
21278 {
21279 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21280 if (regno0 != regno2)
21281 emit_insn (gen_rtx_SET (target, parts.index));
21282
21283 /* Use shift for scaling. */
21284 ix86_emit_binop (ASHIFT, mode, target,
21285 GEN_INT (exact_log2 (parts.scale)));
21286
21287 if (parts.base)
21288 ix86_emit_binop (PLUS, mode, target, parts.base);
21289
21290 if (parts.disp && parts.disp != const0_rtx)
21291 ix86_emit_binop (PLUS, mode, target, parts.disp);
21292 }
21293 }
21294 else if (!parts.base && !parts.index)
21295 {
21296 gcc_assert(parts.disp);
21297 emit_insn (gen_rtx_SET (target, parts.disp));
21298 }
21299 else
21300 {
21301 if (!parts.base)
21302 {
21303 if (regno0 != regno2)
21304 emit_insn (gen_rtx_SET (target, parts.index));
21305 }
21306 else if (!parts.index)
21307 {
21308 if (regno0 != regno1)
21309 emit_insn (gen_rtx_SET (target, parts.base));
21310 }
21311 else
21312 {
21313 if (regno0 == regno1)
21314 tmp = parts.index;
21315 else if (regno0 == regno2)
21316 tmp = parts.base;
21317 else
21318 {
21319 rtx tmp1;
21320
21321 /* Find better operand for SET instruction, depending
21322 on which definition is farther from the insn. */
21323 if (find_nearest_reg_def (insn, regno1, regno2))
21324 tmp = parts.index, tmp1 = parts.base;
21325 else
21326 tmp = parts.base, tmp1 = parts.index;
21327
21328 emit_insn (gen_rtx_SET (target, tmp));
21329
21330 if (parts.disp && parts.disp != const0_rtx)
21331 ix86_emit_binop (PLUS, mode, target, parts.disp);
21332
21333 ix86_emit_binop (PLUS, mode, target, tmp1);
21334 return;
21335 }
21336
21337 ix86_emit_binop (PLUS, mode, target, tmp);
21338 }
21339
21340 if (parts.disp && parts.disp != const0_rtx)
21341 ix86_emit_binop (PLUS, mode, target, parts.disp);
21342 }
21343 }
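
/* For illustration only (the concrete registers below are hypothetical,
   not generated code): an insn such as

	leal	4(%ebx,%ecx,8), %eax

   with %eax, %ebx and %ecx all distinct is split by the code above into
   roughly

	movl	%ecx, %eax
	sall	$3, %eax
	addl	%ebx, %eax
	addl	$4, %eax

   i.e. a move of the index, a shift by log2 of the scale, and additions
   of the base and the displacement.  When the destination already equals
   the base register, the scaled index is instead added SCALE times.  */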
21344
21345 /* Return true if it is ok to optimize an ADD operation to an LEA
21346    operation in order to avoid consuming the flags register.  For most
21347    processors, ADD is faster than LEA.  For processors like BONNELL, if
21348    the destination register of the LEA holds an actual address which will
21349    be used soon, LEA is better; otherwise ADD is better.  */
21350
21351 bool
21352 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21353 {
21354 unsigned int regno0 = true_regnum (operands[0]);
21355 unsigned int regno1 = true_regnum (operands[1]);
21356 unsigned int regno2 = true_regnum (operands[2]);
21357
21358   /* If a = b + c and a != b and a != c, we must use the lea form.  */
21359 if (regno0 != regno1 && regno0 != regno2)
21360 return true;
21361
21362 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21363 return false;
21364
21365 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21366 }
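
/* For example (hypothetical registers): "a = b + c" with a, b and c in
   three distinct registers cannot be expressed by a single two-operand
   addl, so

	leal	(%ebx,%ecx), %eax

   must be kept.  When the destination overlaps one of the sources,
   e.g. a = a + c, the choice between "addl %ecx, %eax" and the lea form
   falls to the TARGET_OPT_AGU check and ix86_lea_outperforms above.  */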
21367
21368 /* Return true if destination reg of SET_BODY is shift count of
21369 USE_BODY. */
21370
21371 static bool
21372 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21373 {
21374 rtx set_dest;
21375 rtx shift_rtx;
21376 int i;
21377
21378 /* Retrieve destination of SET_BODY. */
21379 switch (GET_CODE (set_body))
21380 {
21381 case SET:
21382 set_dest = SET_DEST (set_body);
21383 if (!set_dest || !REG_P (set_dest))
21384 return false;
21385 break;
21386 case PARALLEL:
21387 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21388 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21389 use_body))
21390 return true;
21391 /* FALLTHROUGH */
21392 default:
21393 return false;
21394 }
21395
21396 /* Retrieve shift count of USE_BODY. */
21397 switch (GET_CODE (use_body))
21398 {
21399 case SET:
21400 shift_rtx = XEXP (use_body, 1);
21401 break;
21402 case PARALLEL:
21403 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21404 if (ix86_dep_by_shift_count_body (set_body,
21405 XVECEXP (use_body, 0, i)))
21406 return true;
21407 /* FALLTHROUGH */
21408 default:
21409 return false;
21410 }
21411
21412 if (shift_rtx
21413 && (GET_CODE (shift_rtx) == ASHIFT
21414 || GET_CODE (shift_rtx) == LSHIFTRT
21415 || GET_CODE (shift_rtx) == ASHIFTRT
21416 || GET_CODE (shift_rtx) == ROTATE
21417 || GET_CODE (shift_rtx) == ROTATERT))
21418 {
21419 rtx shift_count = XEXP (shift_rtx, 1);
21420
21421 /* Return true if shift count is dest of SET_BODY. */
21422 if (REG_P (shift_count))
21423 {
21424 	      /* Check reload_completed, since this can be invoked before
21425 	         register allocation by the pre-reload scheduler.  */
21426 if (reload_completed
21427 && true_regnum (set_dest) == true_regnum (shift_count))
21428 return true;
21429 	  else if (REGNO (set_dest) == REGNO (shift_count))
21430 return true;
21431 }
21432 }
21433
21434 return false;
21435 }
21436
21437 /* Return true if destination reg of SET_INSN is shift count of
21438 USE_INSN. */
21439
21440 bool
21441 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21442 {
21443 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21444 PATTERN (use_insn));
21445 }
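
/* Illustrative example (pseudo RTL, the register numbers are made up):
   given

	SET_INSN:  (set (reg:QI 100) (...))
	USE_INSN:  (set (reg:SI 101)
			(ashift:SI (reg:SI 102) (reg:QI 100)))

   the destination of SET_INSN is the shift count of USE_INSN, so
   ix86_dep_by_shift_count returns true.  */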
21446
21447 /* Return TRUE or FALSE depending on whether the unary operator meets the
21448 appropriate constraints. */
21449
21450 bool
21451 ix86_unary_operator_ok (enum rtx_code,
21452 machine_mode,
21453 rtx operands[2])
21454 {
21455 /* If one of operands is memory, source and destination must match. */
21456 if ((MEM_P (operands[0])
21457 || MEM_P (operands[1]))
21458 && ! rtx_equal_p (operands[0], operands[1]))
21459 return false;
21460 return true;
21461 }
21462
21463 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21464 are ok, keeping in mind the possible movddup alternative. */
21465
21466 bool
21467 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21468 {
21469 if (MEM_P (operands[0]))
21470 return rtx_equal_p (operands[0], operands[1 + high]);
21471 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21472 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21473 return true;
21474 }
21475
21476 /* Post-reload splitter for converting an SFmode or DFmode value in an
21477    SSE register into an unsigned SImode value.  */
21478
21479 void
21480 ix86_split_convert_uns_si_sse (rtx operands[])
21481 {
21482 machine_mode vecmode;
21483 rtx value, large, zero_or_two31, input, two31, x;
21484
21485 large = operands[1];
21486 zero_or_two31 = operands[2];
21487 input = operands[3];
21488 two31 = operands[4];
21489 vecmode = GET_MODE (large);
21490 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21491
21492 /* Load up the value into the low element. We must ensure that the other
21493 elements are valid floats -- zero is the easiest such value. */
21494 if (MEM_P (input))
21495 {
21496 if (vecmode == V4SFmode)
21497 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21498 else
21499 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21500 }
21501 else
21502 {
21503 input = gen_rtx_REG (vecmode, REGNO (input));
21504 emit_move_insn (value, CONST0_RTX (vecmode));
21505 if (vecmode == V4SFmode)
21506 emit_insn (gen_sse_movss (value, value, input));
21507 else
21508 emit_insn (gen_sse2_movsd (value, value, input));
21509 }
21510
21511 emit_move_insn (large, two31);
21512 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21513
21514 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21515 emit_insn (gen_rtx_SET (large, x));
21516
21517 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21518 emit_insn (gen_rtx_SET (zero_or_two31, x));
21519
21520 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21521 emit_insn (gen_rtx_SET (value, x));
21522
21523 large = gen_rtx_REG (V4SImode, REGNO (large));
21524 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21525
21526 x = gen_rtx_REG (V4SImode, REGNO (value));
21527 if (vecmode == V4SFmode)
21528 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21529 else
21530 emit_insn (gen_sse2_cvttpd2dq (x, value));
21531 value = x;
21532
21533 emit_insn (gen_xorv4si3 (value, value, large));
21534 }
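
/* Worked example (values only, for illustration): converting 3.5e9,
   which does not fit in a signed SImode, from DFmode:

     3.5e9 >= 0x1p31, so the compare mask selects two31;
     3.5e9 - 0x1p31 = 1352516352.0, which the signed truncating
     conversion handles exactly;
     the all-ones mask shifted left by 31 is 0x80000000, and the final
     xor restores 1352516352 ^ 0x80000000 = 3500000000.

   For inputs below 0x1p31 the mask is zero, so the subtraction and the
   xor are no-ops.  */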
21535
21536 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21537 Expects the 64-bit DImode to be supplied in a pair of integral
21538 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21539 -mfpmath=sse, !optimize_size only. */
21540
21541 void
21542 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21543 {
21544 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21545 rtx int_xmm, fp_xmm;
21546 rtx biases, exponents;
21547 rtx x;
21548
21549 int_xmm = gen_reg_rtx (V4SImode);
21550 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21551 emit_insn (gen_movdi_to_sse (int_xmm, input));
21552 else if (TARGET_SSE_SPLIT_REGS)
21553 {
21554 emit_clobber (int_xmm);
21555 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21556 }
21557 else
21558 {
21559 x = gen_reg_rtx (V2DImode);
21560 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21561 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21562 }
21563
21564 x = gen_rtx_CONST_VECTOR (V4SImode,
21565 gen_rtvec (4, GEN_INT (0x43300000UL),
21566 GEN_INT (0x45300000UL),
21567 const0_rtx, const0_rtx));
21568 exponents = validize_mem (force_const_mem (V4SImode, x));
21569
21570 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21571 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21572
21573   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
21574      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21575      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21576      (0x1.0p84 + double(fp_value_hi_xmm)).
21577      Note these exponents differ by 32.  */
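
  /* Purely illustrative example: for the input 0x0000000280000001 the
     low word 0x80000001 combined with the exponent word 0x43300000 is
     the double 0x1.0p52 + 2147483649, and the high word 0x00000002
     combined with 0x45300000 is 0x1.0p84 + 2 * 0x1.0p32.  After the
     bias subtraction below, the two lanes hold 2147483649.0 and
     8589934592.0, whose sum 10737418241.0 is exactly the original
     unsigned value.  */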
21578
21579 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21580
21581 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21582 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21583 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21584 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21585 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21586 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21587 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21588 biases = validize_mem (force_const_mem (V2DFmode, biases));
21589 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21590
21591 /* Add the upper and lower DFmode values together. */
21592 if (TARGET_SSE3)
21593 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21594 else
21595 {
21596 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21597 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21598 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21599 }
21600
21601 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21602 }
21603
21604 /* Not used, but eases macroization of patterns. */
21605 void
21606 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21607 {
21608 gcc_unreachable ();
21609 }
21610
21611 /* Convert an unsigned SImode value into a DFmode. Only currently used
21612 for SSE, but applicable anywhere. */
21613
21614 void
21615 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21616 {
21617 REAL_VALUE_TYPE TWO31r;
21618 rtx x, fp;
21619
21620 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21621 NULL, 1, OPTAB_DIRECT);
21622
21623 fp = gen_reg_rtx (DFmode);
21624 emit_insn (gen_floatsidf2 (fp, x));
21625
21626 real_ldexp (&TWO31r, &dconst1, 31);
21627 x = const_double_from_real_value (TWO31r, DFmode);
21628
21629 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21630 if (x != target)
21631 emit_move_insn (target, x);
21632 }
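
/* Worked example (illustrative values): for the input 0xfffffffe the
   function above computes 0xfffffffe + (-2147483648) = 0x7ffffffe as a
   signed SImode value, converts it to 2147483646.0, and then adds
   0x1p31 to obtain 4294967294.0, the original unsigned value.  Inputs
   below 0x1p31 wrap to negative values and are likewise corrected by
   the final addition.  */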
21633
21634 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21635 32-bit mode; otherwise we have a direct convert instruction. */
21636
21637 void
21638 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21639 {
21640 REAL_VALUE_TYPE TWO32r;
21641 rtx fp_lo, fp_hi, x;
21642
21643 fp_lo = gen_reg_rtx (DFmode);
21644 fp_hi = gen_reg_rtx (DFmode);
21645
21646 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21647
21648 real_ldexp (&TWO32r, &dconst1, 32);
21649 x = const_double_from_real_value (TWO32r, DFmode);
21650 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21651
21652 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21653
21654 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21655 0, OPTAB_DIRECT);
21656 if (x != target)
21657 emit_move_insn (target, x);
21658 }
21659
21660 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21661 For x86_32, -mfpmath=sse, !optimize_size only. */
21662 void
21663 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21664 {
21665 REAL_VALUE_TYPE ONE16r;
21666 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21667
21668 real_ldexp (&ONE16r, &dconst1, 16);
21669 x = const_double_from_real_value (ONE16r, SFmode);
21670   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21671 				NULL, 0, OPTAB_DIRECT);
21672   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21673 NULL, 0, OPTAB_DIRECT);
21674 fp_hi = gen_reg_rtx (SFmode);
21675 fp_lo = gen_reg_rtx (SFmode);
21676 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21677 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21678 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21679 0, OPTAB_DIRECT);
21680 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21681 0, OPTAB_DIRECT);
21682 if (!rtx_equal_p (target, fp_hi))
21683 emit_move_insn (target, fp_hi);
21684 }
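
/* Worked example (illustrative): for the input 0x12345678 the code
   above forms int_lo = 0x5678 = 22136 and int_hi = 0x1234 = 4660,
   converts both exactly to SFmode, and computes
   4660.0 * 65536.0 + 22136.0 = 305419896.0.  Each 16-bit half converts
   exactly and the multiply by 0x1p16 is exact, so the only rounding
   happens in the final addition, just as in a direct conversion.  */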
21685
21686 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21687 a vector of unsigned ints VAL to vector of floats TARGET. */
21688
21689 void
21690 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21691 {
21692 rtx tmp[8];
21693 REAL_VALUE_TYPE TWO16r;
21694 machine_mode intmode = GET_MODE (val);
21695 machine_mode fltmode = GET_MODE (target);
21696 rtx (*cvt) (rtx, rtx);
21697
21698 if (intmode == V4SImode)
21699 cvt = gen_floatv4siv4sf2;
21700 else
21701 cvt = gen_floatv8siv8sf2;
21702 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21703 tmp[0] = force_reg (intmode, tmp[0]);
21704 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21705 OPTAB_DIRECT);
21706 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21707 NULL_RTX, 1, OPTAB_DIRECT);
21708 tmp[3] = gen_reg_rtx (fltmode);
21709 emit_insn (cvt (tmp[3], tmp[1]));
21710 tmp[4] = gen_reg_rtx (fltmode);
21711 emit_insn (cvt (tmp[4], tmp[2]));
21712 real_ldexp (&TWO16r, &dconst1, 16);
21713 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21714 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21715 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21716 OPTAB_DIRECT);
21717 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21718 OPTAB_DIRECT);
21719 if (tmp[7] != target)
21720 emit_move_insn (target, tmp[7]);
21721 }
21722
21723 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21724 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21725 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21726 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21727
21728 rtx
21729 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21730 {
21731 REAL_VALUE_TYPE TWO31r;
21732 rtx two31r, tmp[4];
21733 machine_mode mode = GET_MODE (val);
21734 machine_mode scalarmode = GET_MODE_INNER (mode);
21735 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21736 rtx (*cmp) (rtx, rtx, rtx, rtx);
21737 int i;
21738
21739 for (i = 0; i < 3; i++)
21740 tmp[i] = gen_reg_rtx (mode);
21741 real_ldexp (&TWO31r, &dconst1, 31);
21742 two31r = const_double_from_real_value (TWO31r, scalarmode);
21743 two31r = ix86_build_const_vector (mode, 1, two31r);
21744 two31r = force_reg (mode, two31r);
21745 switch (mode)
21746 {
21747 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21748 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21749 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21750 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21751 default: gcc_unreachable ();
21752 }
21753 tmp[3] = gen_rtx_LE (mode, two31r, val);
21754 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21755 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21756 0, OPTAB_DIRECT);
21757 if (intmode == V4SImode || TARGET_AVX2)
21758 *xorp = expand_simple_binop (intmode, ASHIFT,
21759 gen_lowpart (intmode, tmp[0]),
21760 GEN_INT (31), NULL_RTX, 0,
21761 OPTAB_DIRECT);
21762 else
21763 {
21764 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21765 two31 = ix86_build_const_vector (intmode, 1, two31);
21766 *xorp = expand_simple_binop (intmode, AND,
21767 gen_lowpart (intmode, tmp[0]),
21768 two31, NULL_RTX, 0,
21769 OPTAB_DIRECT);
21770 }
21771 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21772 0, OPTAB_DIRECT);
21773 }
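
/* Illustrative scalar view of one lane of the adjustment above: for
   val = 3.0e9 the mask compare selects 0x1p31, so the routine returns
   3.0e9 - 0x1p31 = 852516352.0 and sets that lane of *XORP to
   0x80000000; the caller's signed fix_trunc then yields 852516352, and
   xoring in 0x80000000 restores 3000000000.  Lanes below 0x1p31 get a
   zero adjustment and a zero xor mask.  */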
21774
21775 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21776 then replicate the value for all elements of the vector
21777 register. */
21778
21779 rtx
21780 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21781 {
21782 int i, n_elt;
21783 rtvec v;
21784 machine_mode scalar_mode;
21785
21786 switch (mode)
21787 {
21788 case V64QImode:
21789 case V32QImode:
21790 case V16QImode:
21791 case V32HImode:
21792 case V16HImode:
21793 case V8HImode:
21794 case V16SImode:
21795 case V8SImode:
21796 case V4SImode:
21797 case V8DImode:
21798 case V4DImode:
21799 case V2DImode:
21800 gcc_assert (vect);
21801 /* FALLTHRU */
21802 case V16SFmode:
21803 case V8SFmode:
21804 case V4SFmode:
21805 case V8DFmode:
21806 case V4DFmode:
21807 case V2DFmode:
21808 n_elt = GET_MODE_NUNITS (mode);
21809 v = rtvec_alloc (n_elt);
21810 scalar_mode = GET_MODE_INNER (mode);
21811
21812 RTVEC_ELT (v, 0) = value;
21813
21814 for (i = 1; i < n_elt; ++i)
21815 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21816
21817 return gen_rtx_CONST_VECTOR (mode, v);
21818
21819 default:
21820 gcc_unreachable ();
21821 }
21822 }
21823
21824 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21825 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21826 for an SSE register. If VECT is true, then replicate the mask for
21827 all elements of the vector register. If INVERT is true, then create
21828 a mask excluding the sign bit. */
21829
21830 rtx
21831 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21832 {
21833 machine_mode vec_mode, imode;
21834 wide_int w;
21835 rtx mask, v;
21836
21837 switch (mode)
21838 {
21839 case V16SImode:
21840 case V16SFmode:
21841 case V8SImode:
21842 case V4SImode:
21843 case V8SFmode:
21844 case V4SFmode:
21845 vec_mode = mode;
21846 imode = SImode;
21847 break;
21848
21849 case V8DImode:
21850 case V4DImode:
21851 case V2DImode:
21852 case V8DFmode:
21853 case V4DFmode:
21854 case V2DFmode:
21855 vec_mode = mode;
21856 imode = DImode;
21857 break;
21858
21859 case TImode:
21860 case TFmode:
21861 vec_mode = VOIDmode;
21862 imode = TImode;
21863 break;
21864
21865 default:
21866 gcc_unreachable ();
21867 }
21868
21869 machine_mode inner_mode = GET_MODE_INNER (mode);
21870 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21871 GET_MODE_BITSIZE (inner_mode));
21872 if (invert)
21873 w = wi::bit_not (w);
21874
21875 /* Force this value into the low part of a fp vector constant. */
21876 mask = immed_wide_int_const (w, imode);
21877 mask = gen_lowpart (inner_mode, mask);
21878
21879 if (vec_mode == VOIDmode)
21880 return force_reg (inner_mode, mask);
21881
21882 v = ix86_build_const_vector (vec_mode, vect, mask);
21883 return force_reg (vec_mode, v);
21884 }
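
/* For example, for V4SFmode with VECT set this produces the constant
   vector whose four elements all have the bit pattern 0x80000000
   (i.e. four copies of -0.0); with INVERT set each element is
   0x7fffffff instead, masking everything but the sign bit.  With VECT
   clear, only the first element carries the mask and the rest are
   zero.  */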
21885
21886 /* Generate code for floating point ABS or NEG. */
21887
21888 void
21889 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21890 rtx operands[])
21891 {
21892 rtx mask, set, dst, src;
21893 bool use_sse = false;
21894 bool vector_mode = VECTOR_MODE_P (mode);
21895 machine_mode vmode = mode;
21896
21897 if (vector_mode)
21898 use_sse = true;
21899 else if (mode == TFmode)
21900 use_sse = true;
21901 else if (TARGET_SSE_MATH)
21902 {
21903 use_sse = SSE_FLOAT_MODE_P (mode);
21904 if (mode == SFmode)
21905 vmode = V4SFmode;
21906 else if (mode == DFmode)
21907 vmode = V2DFmode;
21908 }
21909
21910 /* NEG and ABS performed with SSE use bitwise mask operations.
21911 Create the appropriate mask now. */
21912 if (use_sse)
21913 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21914 else
21915 mask = NULL_RTX;
21916
21917 dst = operands[0];
21918 src = operands[1];
21919
21920 set = gen_rtx_fmt_e (code, mode, src);
21921 set = gen_rtx_SET (dst, set);
21922
21923 if (mask)
21924 {
21925 rtx use, clob;
21926 rtvec par;
21927
21928 use = gen_rtx_USE (VOIDmode, mask);
21929 if (vector_mode)
21930 par = gen_rtvec (2, set, use);
21931 else
21932 {
21933 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21934 par = gen_rtvec (3, set, use, clob);
21935 }
21936 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21937 }
21938 else
21939 emit_insn (set);
21940 }
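
/* In other words, with SSE the operation becomes a bitwise one on the
   sign bit: NEG xors the value with the sign-bit mask built above,
   while ABS ands it with the inverted mask (hence the "code == ABS"
   argument to ix86_build_signbit_mask).  As an illustration only,
   fabs (-2.5f) clears the sign bit of 0xc0200000 to give 0x40200000,
   i.e. 2.5f.  */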
21941
21942 /* Expand a copysign operation. Special case operand 0 being a constant. */
21943
21944 void
21945 ix86_expand_copysign (rtx operands[])
21946 {
21947 machine_mode mode, vmode;
21948 rtx dest, op0, op1, mask, nmask;
21949
21950 dest = operands[0];
21951 op0 = operands[1];
21952 op1 = operands[2];
21953
21954 mode = GET_MODE (dest);
21955
21956 if (mode == SFmode)
21957 vmode = V4SFmode;
21958 else if (mode == DFmode)
21959 vmode = V2DFmode;
21960 else
21961 vmode = mode;
21962
21963 if (CONST_DOUBLE_P (op0))
21964 {
21965 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21966
21967 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21968 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21969
21970 if (mode == SFmode || mode == DFmode)
21971 {
21972 if (op0 == CONST0_RTX (mode))
21973 op0 = CONST0_RTX (vmode);
21974 else
21975 {
21976 rtx v = ix86_build_const_vector (vmode, false, op0);
21977
21978 op0 = force_reg (vmode, v);
21979 }
21980 }
21981 else if (op0 != CONST0_RTX (mode))
21982 op0 = force_reg (mode, op0);
21983
21984 mask = ix86_build_signbit_mask (vmode, 0, 0);
21985
21986 if (mode == SFmode)
21987 copysign_insn = gen_copysignsf3_const;
21988 else if (mode == DFmode)
21989 copysign_insn = gen_copysigndf3_const;
21990 else
21991 copysign_insn = gen_copysigntf3_const;
21992
21993 emit_insn (copysign_insn (dest, op0, op1, mask));
21994 }
21995 else
21996 {
21997 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21998
21999 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22000 mask = ix86_build_signbit_mask (vmode, 0, 0);
22001
22002 if (mode == SFmode)
22003 copysign_insn = gen_copysignsf3_var;
22004 else if (mode == DFmode)
22005 copysign_insn = gen_copysigndf3_var;
22006 else
22007 copysign_insn = gen_copysigntf3_var;
22008
22009 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22010 }
22011 }
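
/* The split routines below reduce copysign to bit operations; roughly,
   in an illustrative SFmode view,

     copysign (x, y) = (x & 0x7fffffff) | (y & 0x80000000)

   where 0x7fffffff corresponds to NMASK (the inverted sign-bit mask)
   and 0x80000000 to MASK as built above.  */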
22012
22013 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22014 be a constant, and so has already been expanded into a vector constant. */
22015
22016 void
22017 ix86_split_copysign_const (rtx operands[])
22018 {
22019 machine_mode mode, vmode;
22020 rtx dest, op0, mask, x;
22021
22022 dest = operands[0];
22023 op0 = operands[1];
22024 mask = operands[3];
22025
22026 mode = GET_MODE (dest);
22027 vmode = GET_MODE (mask);
22028
22029 dest = lowpart_subreg (vmode, dest, mode);
22030 x = gen_rtx_AND (vmode, dest, mask);
22031 emit_insn (gen_rtx_SET (dest, x));
22032
22033 if (op0 != CONST0_RTX (vmode))
22034 {
22035 x = gen_rtx_IOR (vmode, dest, op0);
22036 emit_insn (gen_rtx_SET (dest, x));
22037 }
22038 }
22039
22040 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22041 so we have to do two masks. */
22042
22043 void
22044 ix86_split_copysign_var (rtx operands[])
22045 {
22046 machine_mode mode, vmode;
22047 rtx dest, scratch, op0, op1, mask, nmask, x;
22048
22049 dest = operands[0];
22050 scratch = operands[1];
22051 op0 = operands[2];
22052 op1 = operands[3];
22053 nmask = operands[4];
22054 mask = operands[5];
22055
22056 mode = GET_MODE (dest);
22057 vmode = GET_MODE (mask);
22058
22059 if (rtx_equal_p (op0, op1))
22060 {
22061 /* Shouldn't happen often (it's useless, obviously), but when it does
22062 we'd generate incorrect code if we continue below. */
22063 emit_move_insn (dest, op0);
22064 return;
22065 }
22066
22067 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22068 {
22069 gcc_assert (REGNO (op1) == REGNO (scratch));
22070
22071 x = gen_rtx_AND (vmode, scratch, mask);
22072 emit_insn (gen_rtx_SET (scratch, x));
22073
22074 dest = mask;
22075 op0 = lowpart_subreg (vmode, op0, mode);
22076 x = gen_rtx_NOT (vmode, dest);
22077 x = gen_rtx_AND (vmode, x, op0);
22078 emit_insn (gen_rtx_SET (dest, x));
22079 }
22080 else
22081 {
22082 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22083 {
22084 x = gen_rtx_AND (vmode, scratch, mask);
22085 }
22086 else /* alternative 2,4 */
22087 {
22088 gcc_assert (REGNO (mask) == REGNO (scratch));
22089 op1 = lowpart_subreg (vmode, op1, mode);
22090 x = gen_rtx_AND (vmode, scratch, op1);
22091 }
22092 emit_insn (gen_rtx_SET (scratch, x));
22093
22094 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22095 {
22096 dest = lowpart_subreg (vmode, op0, mode);
22097 x = gen_rtx_AND (vmode, dest, nmask);
22098 }
22099 else /* alternative 3,4 */
22100 {
22101 gcc_assert (REGNO (nmask) == REGNO (dest));
22102 dest = nmask;
22103 op0 = lowpart_subreg (vmode, op0, mode);
22104 x = gen_rtx_AND (vmode, dest, op0);
22105 }
22106 emit_insn (gen_rtx_SET (dest, x));
22107 }
22108
22109 x = gen_rtx_IOR (vmode, dest, scratch);
22110 emit_insn (gen_rtx_SET (dest, x));
22111 }
22112
22113 /* Return TRUE or FALSE depending on whether the first SET in INSN
22114 has source and destination with matching CC modes, and that the
22115 CC mode is at least as constrained as REQ_MODE. */
22116
22117 bool
22118 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22119 {
22120 rtx set;
22121 machine_mode set_mode;
22122
22123 set = PATTERN (insn);
22124 if (GET_CODE (set) == PARALLEL)
22125 set = XVECEXP (set, 0, 0);
22126 gcc_assert (GET_CODE (set) == SET);
22127 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22128
22129 set_mode = GET_MODE (SET_DEST (set));
22130 switch (set_mode)
22131 {
22132 case CCNOmode:
22133 if (req_mode != CCNOmode
22134 && (req_mode != CCmode
22135 || XEXP (SET_SRC (set), 1) != const0_rtx))
22136 return false;
22137 break;
22138 case CCmode:
22139 if (req_mode == CCGCmode)
22140 return false;
22141 /* FALLTHRU */
22142 case CCGCmode:
22143 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22144 return false;
22145 /* FALLTHRU */
22146 case CCGOCmode:
22147 if (req_mode == CCZmode)
22148 return false;
22149 /* FALLTHRU */
22150 case CCZmode:
22151 break;
22152
22153 case CCAmode:
22154 case CCCmode:
22155 case CCOmode:
22156 case CCPmode:
22157 case CCSmode:
22158 if (set_mode != req_mode)
22159 return false;
22160 break;
22161
22162 default:
22163 gcc_unreachable ();
22164 }
22165
22166 return GET_MODE (SET_SRC (set)) == set_mode;
22167 }
22168
22169 /* Generate insn patterns to do an integer compare of OPERANDS. */
22170
22171 static rtx
22172 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22173 {
22174 machine_mode cmpmode;
22175 rtx tmp, flags;
22176
22177 cmpmode = SELECT_CC_MODE (code, op0, op1);
22178 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22179
22180 /* This is very simple, but making the interface the same as in the
22181 FP case makes the rest of the code easier. */
22182 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22183 emit_insn (gen_rtx_SET (flags, tmp));
22184
22185 /* Return the test that should be put into the flags user, i.e.
22186 the bcc, scc, or cmov instruction. */
22187 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22188 }
22189
22190 /* Figure out whether to use ordered or unordered fp comparisons.
22191 Return the appropriate mode to use. */
22192
22193 machine_mode
22194 ix86_fp_compare_mode (enum rtx_code)
22195 {
22196   /* ??? In order to make all comparisons reversible, we do all comparisons
22197      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
22198      between trapping and nontrapping forms of comparisons, we can make
22199      inequality comparisons trapping again, since that results in better code
22200      when using FCOM based compares.  */
22201 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22202 }
22203
22204 machine_mode
22205 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22206 {
22207 machine_mode mode = GET_MODE (op0);
22208
22209 if (SCALAR_FLOAT_MODE_P (mode))
22210 {
22211 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22212 return ix86_fp_compare_mode (code);
22213 }
22214
22215 switch (code)
22216 {
22217 /* Only zero flag is needed. */
22218 case EQ: /* ZF=0 */
22219 case NE: /* ZF!=0 */
22220 return CCZmode;
22221 /* Codes needing carry flag. */
22222 case GEU: /* CF=0 */
22223 case LTU: /* CF=1 */
22224 /* Detect overflow checks. They need just the carry flag. */
22225 if (GET_CODE (op0) == PLUS
22226 && (rtx_equal_p (op1, XEXP (op0, 0))
22227 || rtx_equal_p (op1, XEXP (op0, 1))))
22228 return CCCmode;
22229 else
22230 return CCmode;
22231 case GTU: /* CF=0 & ZF=0 */
22232 case LEU: /* CF=1 | ZF=1 */
22233 return CCmode;
22234 /* Codes possibly doable only with sign flag when
22235 comparing against zero. */
22236 case GE: /* SF=OF or SF=0 */
22237 case LT: /* SF<>OF or SF=1 */
22238 if (op1 == const0_rtx)
22239 return CCGOCmode;
22240 else
22241 /* For other cases Carry flag is not required. */
22242 return CCGCmode;
22243     /* Codes doable only with the sign flag when comparing
22244        against zero, but for which we lack a jump instruction,
22245        so we need to use relational tests against the overflow
22246        flag, which thus needs to be zero.  */
22247 case GT: /* ZF=0 & SF=OF */
22248 case LE: /* ZF=1 | SF<>OF */
22249 if (op1 == const0_rtx)
22250 return CCNOmode;
22251 else
22252 return CCGCmode;
22253     /* The strcmp pattern does (use flags), and combine may ask us for the
22254        proper mode.  */
22255 case USE:
22256 return CCmode;
22257 default:
22258 gcc_unreachable ();
22259 }
22260 }
22261
22262 /* Return the fixed registers used for condition codes. */
22263
22264 static bool
22265 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22266 {
22267 *p1 = FLAGS_REG;
22268 *p2 = FPSR_REG;
22269 return true;
22270 }
22271
22272 /* If two condition code modes are compatible, return a condition code
22273 mode which is compatible with both. Otherwise, return
22274 VOIDmode. */
22275
22276 static machine_mode
22277 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22278 {
22279 if (m1 == m2)
22280 return m1;
22281
22282 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22283 return VOIDmode;
22284
22285 if ((m1 == CCGCmode && m2 == CCGOCmode)
22286 || (m1 == CCGOCmode && m2 == CCGCmode))
22287 return CCGCmode;
22288
22289 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22290 return m2;
22291 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22292 return m1;
22293
22294 switch (m1)
22295 {
22296 default:
22297 gcc_unreachable ();
22298
22299 case CCmode:
22300 case CCGCmode:
22301 case CCGOCmode:
22302 case CCNOmode:
22303 case CCAmode:
22304 case CCCmode:
22305 case CCOmode:
22306 case CCPmode:
22307 case CCSmode:
22308 case CCZmode:
22309 switch (m2)
22310 {
22311 default:
22312 return VOIDmode;
22313
22314 case CCmode:
22315 case CCGCmode:
22316 case CCGOCmode:
22317 case CCNOmode:
22318 case CCAmode:
22319 case CCCmode:
22320 case CCOmode:
22321 case CCPmode:
22322 case CCSmode:
22323 case CCZmode:
22324 return CCmode;
22325 }
22326
22327 case CCFPmode:
22328 case CCFPUmode:
22329 /* These are only compatible with themselves, which we already
22330 checked above. */
22331 return VOIDmode;
22332 }
22333 }
22334
22335
22336 /* Return a comparison we can do and that it is equivalent to
22337 swap_condition (code) apart possibly from orderedness.
22338 But, never change orderedness if TARGET_IEEE_FP, returning
22339 UNKNOWN in that case if necessary. */
22340
22341 static enum rtx_code
22342 ix86_fp_swap_condition (enum rtx_code code)
22343 {
22344 switch (code)
22345 {
22346 case GT: /* GTU - CF=0 & ZF=0 */
22347 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22348 case GE: /* GEU - CF=0 */
22349 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22350 case UNLT: /* LTU - CF=1 */
22351 return TARGET_IEEE_FP ? UNKNOWN : GT;
22352 case UNLE: /* LEU - CF=1 | ZF=1 */
22353 return TARGET_IEEE_FP ? UNKNOWN : GE;
22354 default:
22355 return swap_condition (code);
22356 }
22357 }
22358
22359 /* Return the cost of comparison CODE using the best strategy for performance.
22360    All of the following functions use the number of instructions as the cost
22361    metric.  In the future this should be tweaked to compute bytes for
22362    optimize_size and to account for the performance of instructions on various CPUs.  */
22363
22364 static int
22365 ix86_fp_comparison_cost (enum rtx_code code)
22366 {
22367 int arith_cost;
22368
22369 /* The cost of code using bit-twiddling on %ah. */
22370 switch (code)
22371 {
22372 case UNLE:
22373 case UNLT:
22374 case LTGT:
22375 case GT:
22376 case GE:
22377 case UNORDERED:
22378 case ORDERED:
22379 case UNEQ:
22380 arith_cost = 4;
22381 break;
22382 case LT:
22383 case NE:
22384 case EQ:
22385 case UNGE:
22386 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22387 break;
22388 case LE:
22389 case UNGT:
22390 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22391 break;
22392 default:
22393 gcc_unreachable ();
22394 }
22395
22396 switch (ix86_fp_comparison_strategy (code))
22397 {
22398 case IX86_FPCMP_COMI:
22399 return arith_cost > 4 ? 3 : 2;
22400 case IX86_FPCMP_SAHF:
22401 return arith_cost > 4 ? 4 : 3;
22402 default:
22403 return arith_cost;
22404 }
22405 }
22406
22407 /* Return the strategy to use for floating-point comparisons.  We assume that
22408    fcomi is always preferable where available, since that is also true when
22409    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
22410
22411 enum ix86_fpcmp_strategy
22412 ix86_fp_comparison_strategy (enum rtx_code)
22413 {
22414 /* Do fcomi/sahf based test when profitable. */
22415
22416 if (TARGET_CMOVE)
22417 return IX86_FPCMP_COMI;
22418
22419 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22420 return IX86_FPCMP_SAHF;
22421
22422 return IX86_FPCMP_ARITH;
22423 }
22424
22425 /* Swap, force into registers, or otherwise massage the two operands
22426 to a fp comparison. The operands are updated in place; the new
22427 comparison code is returned. */
22428
22429 static enum rtx_code
22430 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22431 {
22432 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22433 rtx op0 = *pop0, op1 = *pop1;
22434 machine_mode op_mode = GET_MODE (op0);
22435 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22436
22437 /* All of the unordered compare instructions only work on registers.
22438 The same is true of the fcomi compare instructions. The XFmode
22439 compare instructions require registers except when comparing
22440 against zero or when converting operand 1 from fixed point to
22441 floating point. */
22442
22443 if (!is_sse
22444 && (fpcmp_mode == CCFPUmode
22445 || (op_mode == XFmode
22446 && ! (standard_80387_constant_p (op0) == 1
22447 || standard_80387_constant_p (op1) == 1)
22448 && GET_CODE (op1) != FLOAT)
22449 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22450 {
22451 op0 = force_reg (op_mode, op0);
22452 op1 = force_reg (op_mode, op1);
22453 }
22454 else
22455 {
22456 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22457 things around if they appear profitable, otherwise force op0
22458 into a register. */
22459
22460 if (standard_80387_constant_p (op0) == 0
22461 || (MEM_P (op0)
22462 && ! (standard_80387_constant_p (op1) == 0
22463 || MEM_P (op1))))
22464 {
22465 enum rtx_code new_code = ix86_fp_swap_condition (code);
22466 if (new_code != UNKNOWN)
22467 {
22468 std::swap (op0, op1);
22469 code = new_code;
22470 }
22471 }
22472
22473 if (!REG_P (op0))
22474 op0 = force_reg (op_mode, op0);
22475
22476 if (CONSTANT_P (op1))
22477 {
22478 int tmp = standard_80387_constant_p (op1);
22479 if (tmp == 0)
22480 op1 = validize_mem (force_const_mem (op_mode, op1));
22481 else if (tmp == 1)
22482 {
22483 if (TARGET_CMOVE)
22484 op1 = force_reg (op_mode, op1);
22485 }
22486 else
22487 op1 = force_reg (op_mode, op1);
22488 }
22489 }
22490
22491 /* Try to rearrange the comparison to make it cheaper. */
22492 if (ix86_fp_comparison_cost (code)
22493 > ix86_fp_comparison_cost (swap_condition (code))
22494 && (REG_P (op1) || can_create_pseudo_p ()))
22495 {
22496 std::swap (op0, op1);
22497 code = swap_condition (code);
22498 if (!REG_P (op0))
22499 op0 = force_reg (op_mode, op0);
22500 }
22501
22502 *pop0 = op0;
22503 *pop1 = op1;
22504 return code;
22505 }
22506
22507 /* Convert comparison codes we use to represent FP comparison to integer
22508 code that will result in proper branch. Return UNKNOWN if no such code
22509 is available. */
22510
22511 enum rtx_code
22512 ix86_fp_compare_code_to_integer (enum rtx_code code)
22513 {
22514 switch (code)
22515 {
22516 case GT:
22517 return GTU;
22518 case GE:
22519 return GEU;
22520 case ORDERED:
22521 case UNORDERED:
22522 return code;
22523 case UNEQ:
22524 return EQ;
22525 case UNLT:
22526 return LTU;
22527 case UNLE:
22528 return LEU;
22529 case LTGT:
22530 return NE;
22531 default:
22532 return UNKNOWN;
22533 }
22534 }
22535
22536 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22537
22538 static rtx
22539 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22540 {
22541 machine_mode fpcmp_mode, intcmp_mode;
22542 rtx tmp, tmp2;
22543
22544 fpcmp_mode = ix86_fp_compare_mode (code);
22545 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22546
22547 /* Do fcomi/sahf based test when profitable. */
22548 switch (ix86_fp_comparison_strategy (code))
22549 {
22550 case IX86_FPCMP_COMI:
22551 intcmp_mode = fpcmp_mode;
22552 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22553 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22554 emit_insn (tmp);
22555 break;
22556
22557 case IX86_FPCMP_SAHF:
22558 intcmp_mode = fpcmp_mode;
22559 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22560 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22561
22562 if (!scratch)
22563 scratch = gen_reg_rtx (HImode);
22564 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22565 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22566 break;
22567
22568 case IX86_FPCMP_ARITH:
22569 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22570 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22571 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22572 if (!scratch)
22573 scratch = gen_reg_rtx (HImode);
22574 emit_insn (gen_rtx_SET (scratch, tmp2));
22575
22576       /* In the unordered case, we have to check C2 for NaNs, which
22577 doesn't happen to work out to anything nice combination-wise.
22578 So do some bit twiddling on the value we've got in AH to come
22579 up with an appropriate set of condition codes. */
22580
22581 intcmp_mode = CCNOmode;
22582 switch (code)
22583 {
22584 case GT:
22585 case UNGT:
22586 if (code == GT || !TARGET_IEEE_FP)
22587 {
22588 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22589 code = EQ;
22590 }
22591 else
22592 {
22593 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22594 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22595 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22596 intcmp_mode = CCmode;
22597 code = GEU;
22598 }
22599 break;
22600 case LT:
22601 case UNLT:
22602 if (code == LT && TARGET_IEEE_FP)
22603 {
22604 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22605 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22606 intcmp_mode = CCmode;
22607 code = EQ;
22608 }
22609 else
22610 {
22611 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22612 code = NE;
22613 }
22614 break;
22615 case GE:
22616 case UNGE:
22617 if (code == GE || !TARGET_IEEE_FP)
22618 {
22619 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22620 code = EQ;
22621 }
22622 else
22623 {
22624 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22625 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22626 code = NE;
22627 }
22628 break;
22629 case LE:
22630 case UNLE:
22631 if (code == LE && TARGET_IEEE_FP)
22632 {
22633 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22634 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22635 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22636 intcmp_mode = CCmode;
22637 code = LTU;
22638 }
22639 else
22640 {
22641 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22642 code = NE;
22643 }
22644 break;
22645 case EQ:
22646 case UNEQ:
22647 if (code == EQ && TARGET_IEEE_FP)
22648 {
22649 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22650 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22651 intcmp_mode = CCmode;
22652 code = EQ;
22653 }
22654 else
22655 {
22656 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22657 code = NE;
22658 }
22659 break;
22660 case NE:
22661 case LTGT:
22662 if (code == NE && TARGET_IEEE_FP)
22663 {
22664 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22665 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22666 GEN_INT (0x40)));
22667 code = NE;
22668 }
22669 else
22670 {
22671 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22672 code = EQ;
22673 }
22674 break;
22675
22676 case UNORDERED:
22677 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22678 code = NE;
22679 break;
22680 case ORDERED:
22681 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22682 code = EQ;
22683 break;
22684
22685 default:
22686 gcc_unreachable ();
22687 }
22688 break;
22689
22690 default:
22691       gcc_unreachable ();
22692 }
22693
22694 /* Return the test that should be put into the flags user, i.e.
22695 the bcc, scc, or cmov instruction. */
22696 return gen_rtx_fmt_ee (code, VOIDmode,
22697 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22698 const0_rtx);
22699 }
22700
22701 static rtx
22702 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22703 {
22704 rtx ret;
22705
22706 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22707 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22708
22709 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22710 {
22711 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22712 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22713 }
22714 else
22715 ret = ix86_expand_int_compare (code, op0, op1);
22716
22717 return ret;
22718 }
22719
22720 void
22721 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22722 {
22723 machine_mode mode = GET_MODE (op0);
22724 rtx tmp;
22725
22726   /* Handle the special case of a vector comparison with a boolean result;
22727      transform it using the ptest instruction.  */
22728 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22729 {
22730 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22731 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22732
22733 gcc_assert (code == EQ || code == NE);
22734       /* Generate an XOR, since we can't check that one operand is the zero vector.  */
22735 tmp = gen_reg_rtx (mode);
22736 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22737 tmp = gen_lowpart (p_mode, tmp);
22738 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22739 gen_rtx_UNSPEC (CCmode,
22740 gen_rtvec (2, tmp, tmp),
22741 UNSPEC_PTEST)));
22742 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22743 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22744 gen_rtx_LABEL_REF (VOIDmode, label),
22745 pc_rtx);
22746 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22747 return;
22748 }
22749
22750 switch (mode)
22751 {
22752 case SFmode:
22753 case DFmode:
22754 case XFmode:
22755 case QImode:
22756 case HImode:
22757 case SImode:
22758 simple:
22759 tmp = ix86_expand_compare (code, op0, op1);
22760 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22761 gen_rtx_LABEL_REF (VOIDmode, label),
22762 pc_rtx);
22763 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22764 return;
22765
22766 case DImode:
22767 if (TARGET_64BIT)
22768 goto simple;
22769       /* For 32-bit targets a DImode comparison may be performed in
22770 	 SSE registers.  To allow this we should avoid splitting into
22771 	 SImode, which is achieved by doing the xor in DImode and then
22772 	 comparing against zero (a form which is recognized by the
22773 	 STV pass).  We don't compare using xor when optimizing
22774 	 for size.  */
22775 if (!optimize_insn_for_size_p ()
22776 && TARGET_STV
22777 && (code == EQ || code == NE))
22778 {
22779 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22780 op1 = const0_rtx;
22781 }
22782 /* FALLTHRU */
22783 case TImode:
22784 /* Expand DImode branch into multiple compare+branch. */
22785 {
22786 rtx lo[2], hi[2];
22787 rtx_code_label *label2;
22788 enum rtx_code code1, code2, code3;
22789 machine_mode submode;
22790
22791 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22792 {
22793 std::swap (op0, op1);
22794 code = swap_condition (code);
22795 }
22796
22797 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22798 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22799
22800 submode = mode == DImode ? SImode : DImode;
22801
22802 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22803 avoid two branches. This costs one extra insn, so disable when
22804 optimizing for size. */
22805
22806 if ((code == EQ || code == NE)
22807 && (!optimize_insn_for_size_p ()
22808 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22809 {
22810 rtx xor0, xor1;
22811
22812 xor1 = hi[0];
22813 if (hi[1] != const0_rtx)
22814 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22815 NULL_RTX, 0, OPTAB_WIDEN);
22816
22817 xor0 = lo[0];
22818 if (lo[1] != const0_rtx)
22819 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22820 NULL_RTX, 0, OPTAB_WIDEN);
22821
22822 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22823 NULL_RTX, 0, OPTAB_WIDEN);
22824
22825 ix86_expand_branch (code, tmp, const0_rtx, label);
22826 return;
22827 }
22828
22829 	/* Otherwise, if we are doing a less-than or greater-or-equal
22830 	   comparison, op1 is a constant, and its low word is zero, then we
22831 	   can just examine the high word.  Similarly for a low word of -1
22832 	   and less-or-equal or greater-than comparisons.  */
22833
22834 if (CONST_INT_P (hi[1]))
22835 switch (code)
22836 {
22837 case LT: case LTU: case GE: case GEU:
22838 if (lo[1] == const0_rtx)
22839 {
22840 ix86_expand_branch (code, hi[0], hi[1], label);
22841 return;
22842 }
22843 break;
22844 case LE: case LEU: case GT: case GTU:
22845 if (lo[1] == constm1_rtx)
22846 {
22847 ix86_expand_branch (code, hi[0], hi[1], label);
22848 return;
22849 }
22850 break;
22851 default:
22852 break;
22853 }
22854
22855 /* Otherwise, we need two or three jumps. */
22856
22857 label2 = gen_label_rtx ();
22858
22859 code1 = code;
22860 code2 = swap_condition (code);
22861 code3 = unsigned_condition (code);
22862
22863 switch (code)
22864 {
22865 case LT: case GT: case LTU: case GTU:
22866 break;
22867
22868 case LE: code1 = LT; code2 = GT; break;
22869 case GE: code1 = GT; code2 = LT; break;
22870 case LEU: code1 = LTU; code2 = GTU; break;
22871 case GEU: code1 = GTU; code2 = LTU; break;
22872
22873 case EQ: code1 = UNKNOWN; code2 = NE; break;
22874 case NE: code2 = UNKNOWN; break;
22875
22876 default:
22877 gcc_unreachable ();
22878 }
22879
22880 /*
22881 * a < b =>
22882 * if (hi(a) < hi(b)) goto true;
22883 * if (hi(a) > hi(b)) goto false;
22884 * if (lo(a) < lo(b)) goto true;
22885 * false:
22886 */
22887
22888 if (code1 != UNKNOWN)
22889 ix86_expand_branch (code1, hi[0], hi[1], label);
22890 if (code2 != UNKNOWN)
22891 ix86_expand_branch (code2, hi[0], hi[1], label2);
22892
22893 ix86_expand_branch (code3, lo[0], lo[1], label);
22894
22895 if (code2 != UNKNOWN)
22896 emit_label (label2);
22897 return;
22898 }
22899
22900 default:
22901 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22902 goto simple;
22903 }
22904 }
22905
22906 /* Split branch based on floating point condition. */
22907 void
22908 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22909 rtx target1, rtx target2, rtx tmp)
22910 {
22911 rtx condition;
22912 rtx_insn *i;
22913
22914 if (target2 != pc_rtx)
22915 {
22916 std::swap (target1, target2);
22917 code = reverse_condition_maybe_unordered (code);
22918 }
22919
22920 condition = ix86_expand_fp_compare (code, op1, op2,
22921 tmp);
22922
22923 i = emit_jump_insn (gen_rtx_SET
22924 (pc_rtx,
22925 gen_rtx_IF_THEN_ELSE (VOIDmode,
22926 condition, target1, target2)));
22927 if (split_branch_probability >= 0)
22928 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
22929 }
22930
22931 void
22932 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22933 {
22934 rtx ret;
22935
22936 gcc_assert (GET_MODE (dest) == QImode);
22937
22938 ret = ix86_expand_compare (code, op0, op1);
22939 PUT_MODE (ret, QImode);
22940 emit_insn (gen_rtx_SET (dest, ret));
22941 }
22942
22943 /* Expand a comparison setting or clearing the carry flag.  Return true
22944    when successful, and set *POP to the comparison operation.  */
22945 static bool
22946 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22947 {
22948 machine_mode mode =
22949 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22950
22951   /* Do not handle double-mode compares, which go through a special path.  */
22952 if (mode == (TARGET_64BIT ? TImode : DImode))
22953 return false;
22954
22955 if (SCALAR_FLOAT_MODE_P (mode))
22956 {
22957 rtx compare_op;
22958 rtx_insn *compare_seq;
22959
22960 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22961
22962       /* Shortcut: the following common codes never translate
22963 	 into carry-flag compares.  */
22964 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22965 || code == ORDERED || code == UNORDERED)
22966 return false;
22967
22968       /* These comparisons require the zero flag; swap operands so they won't.  */
22969 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22970 && !TARGET_IEEE_FP)
22971 {
22972 std::swap (op0, op1);
22973 code = swap_condition (code);
22974 }
22975
22976       /* Try to expand the comparison and verify that we end up with a
22977 	 carry-flag-based comparison.  This fails to be true only when we
22978 	 decide to expand the comparison using arithmetic, which is not
22979 	 a common scenario.  */
22980 start_sequence ();
22981 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22982 compare_seq = get_insns ();
22983 end_sequence ();
22984
22985 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22986 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22987 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22988 else
22989 code = GET_CODE (compare_op);
22990
22991 if (code != LTU && code != GEU)
22992 return false;
22993
22994 emit_insn (compare_seq);
22995 *pop = compare_op;
22996 return true;
22997 }
22998
22999 if (!INTEGRAL_MODE_P (mode))
23000 return false;
23001
23002 switch (code)
23003 {
23004 case LTU:
23005 case GEU:
23006 break;
23007
23008 /* Convert a==0 into (unsigned)a<1. */
23009 case EQ:
23010 case NE:
23011 if (op1 != const0_rtx)
23012 return false;
23013 op1 = const1_rtx;
23014 code = (code == EQ ? LTU : GEU);
23015 break;
23016
23017     /* Convert a>b into b<a or a>=b+1.  */
23018 case GTU:
23019 case LEU:
23020 if (CONST_INT_P (op1))
23021 {
23022 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23023 	  /* Bail out on overflow.  We can still swap the operands, but that
23024 	     would force loading of the constant into a register.  */
23025 if (op1 == const0_rtx
23026 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23027 return false;
23028 code = (code == GTU ? GEU : LTU);
23029 }
23030 else
23031 {
23032 std::swap (op0, op1);
23033 code = (code == GTU ? LTU : GEU);
23034 }
23035 break;
23036
23037 /* Convert a>=0 into (unsigned)a<0x80000000. */
23038 case LT:
23039 case GE:
23040 if (mode == DImode || op1 != const0_rtx)
23041 return false;
23042 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23043 code = (code == LT ? GEU : LTU);
23044 break;
23045 case LE:
23046 case GT:
23047 if (mode == DImode || op1 != constm1_rtx)
23048 return false;
23049 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23050 code = (code == LE ? GEU : LTU);
23051 break;
23052
23053 default:
23054 return false;
23055 }
23056   /* Swapping the operands may cause a constant to appear as the first operand.  */
23057 if (!nonimmediate_operand (op0, VOIDmode))
23058 {
23059 if (!can_create_pseudo_p ())
23060 return false;
23061 op0 = force_reg (mode, op0);
23062 }
23063 *pop = ix86_expand_compare (code, op0, op1);
23064 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23065 return true;
23066 }
23067
23068 bool
23069 ix86_expand_int_movcc (rtx operands[])
23070 {
23071 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23072 rtx_insn *compare_seq;
23073 rtx compare_op;
23074 machine_mode mode = GET_MODE (operands[0]);
23075 bool sign_bit_compare_p = false;
23076 rtx op0 = XEXP (operands[1], 0);
23077 rtx op1 = XEXP (operands[1], 1);
23078
23079 if (GET_MODE (op0) == TImode
23080 || (GET_MODE (op0) == DImode
23081 && !TARGET_64BIT))
23082 return false;
23083
23084 start_sequence ();
23085 compare_op = ix86_expand_compare (code, op0, op1);
23086 compare_seq = get_insns ();
23087 end_sequence ();
23088
23089 compare_code = GET_CODE (compare_op);
23090
23091 if ((op1 == const0_rtx && (code == GE || code == LT))
23092 || (op1 == constm1_rtx && (code == GT || code == LE)))
23093 sign_bit_compare_p = true;
23094
23095 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23096 HImode insns, we'd be swallowed in word prefix ops. */
23097
23098 if ((mode != HImode || TARGET_FAST_PREFIX)
23099 && (mode != (TARGET_64BIT ? TImode : DImode))
23100 && CONST_INT_P (operands[2])
23101 && CONST_INT_P (operands[3]))
23102 {
23103 rtx out = operands[0];
23104 HOST_WIDE_INT ct = INTVAL (operands[2]);
23105 HOST_WIDE_INT cf = INTVAL (operands[3]);
23106 HOST_WIDE_INT diff;
23107
23108 diff = ct - cf;
23109       /* Sign-bit compares are better done using shifts than by using
23110 	 sbb.  */
23111 if (sign_bit_compare_p
23112 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23113 {
23114 /* Detect overlap between destination and compare sources. */
23115 rtx tmp = out;
23116
23117 if (!sign_bit_compare_p)
23118 {
23119 rtx flags;
23120 bool fpcmp = false;
23121
23122 compare_code = GET_CODE (compare_op);
23123
23124 flags = XEXP (compare_op, 0);
23125
23126 if (GET_MODE (flags) == CCFPmode
23127 || GET_MODE (flags) == CCFPUmode)
23128 {
23129 fpcmp = true;
23130 compare_code
23131 = ix86_fp_compare_code_to_integer (compare_code);
23132 }
23133
23134 	      /* To simplify the rest of the code, restrict to the GEU case.  */
23135 if (compare_code == LTU)
23136 {
23137 std::swap (ct, cf);
23138 compare_code = reverse_condition (compare_code);
23139 code = reverse_condition (code);
23140 }
23141 else
23142 {
23143 if (fpcmp)
23144 PUT_CODE (compare_op,
23145 reverse_condition_maybe_unordered
23146 (GET_CODE (compare_op)));
23147 else
23148 PUT_CODE (compare_op,
23149 reverse_condition (GET_CODE (compare_op)));
23150 }
23151 diff = ct - cf;
23152
23153 if (reg_overlap_mentioned_p (out, op0)
23154 || reg_overlap_mentioned_p (out, op1))
23155 tmp = gen_reg_rtx (mode);
23156
23157 if (mode == DImode)
23158 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23159 else
23160 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23161 flags, compare_op));
23162 }
23163 else
23164 {
23165 if (code == GT || code == GE)
23166 code = reverse_condition (code);
23167 else
23168 {
23169 std::swap (ct, cf);
23170 diff = ct - cf;
23171 }
23172 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23173 }
23174
23175 if (diff == 1)
23176 {
23177 /*
23178 * cmpl op0,op1
23179 * sbbl dest,dest
23180 * [addl dest, ct]
23181 *
23182 * Size 5 - 8.
23183 */
23184 if (ct)
23185 tmp = expand_simple_binop (mode, PLUS,
23186 tmp, GEN_INT (ct),
23187 copy_rtx (tmp), 1, OPTAB_DIRECT);
23188 }
23189 else if (cf == -1)
23190 {
23191 /*
23192 * cmpl op0,op1
23193 * sbbl dest,dest
23194 * orl $ct, dest
23195 *
23196 * Size 8.
23197 */
23198 tmp = expand_simple_binop (mode, IOR,
23199 tmp, GEN_INT (ct),
23200 copy_rtx (tmp), 1, OPTAB_DIRECT);
23201 }
23202 else if (diff == -1 && ct)
23203 {
23204 /*
23205 * cmpl op0,op1
23206 * sbbl dest,dest
23207 * notl dest
23208 * [addl dest, cf]
23209 *
23210 * Size 8 - 11.
23211 */
23212 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23213 if (cf)
23214 tmp = expand_simple_binop (mode, PLUS,
23215 copy_rtx (tmp), GEN_INT (cf),
23216 copy_rtx (tmp), 1, OPTAB_DIRECT);
23217 }
23218 else
23219 {
23220 /*
23221 * cmpl op0,op1
23222 * sbbl dest,dest
23223 * [notl dest]
23224 * andl cf - ct, dest
23225 * [addl dest, ct]
23226 *
23227 * Size 8 - 11.
23228 */
23229
23230 if (cf == 0)
23231 {
23232 cf = ct;
23233 ct = 0;
23234 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23235 }
23236
23237 tmp = expand_simple_binop (mode, AND,
23238 copy_rtx (tmp),
23239 gen_int_mode (cf - ct, mode),
23240 copy_rtx (tmp), 1, OPTAB_DIRECT);
23241 if (ct)
23242 tmp = expand_simple_binop (mode, PLUS,
23243 copy_rtx (tmp), GEN_INT (ct),
23244 copy_rtx (tmp), 1, OPTAB_DIRECT);
23245 }
23246
23247 if (!rtx_equal_p (tmp, out))
23248 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23249
23250 return true;
23251 }
23252
23253 if (diff < 0)
23254 {
23255 machine_mode cmp_mode = GET_MODE (op0);
23256 enum rtx_code new_code;
23257
23258 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23259 {
23260 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23261
23262 /* We may be reversing an unordered compare to a normal compare, which
23263 is not valid in general (we may convert a non-trapping condition
23264 into a trapping one); however, on i386 we currently emit all
23265 comparisons unordered. */
23266 new_code = reverse_condition_maybe_unordered (code);
23267 }
23268 else
23269 new_code = ix86_reverse_condition (code, cmp_mode);
23270 if (new_code != UNKNOWN)
23271 {
23272 std::swap (ct, cf);
23273 diff = -diff;
23274 code = new_code;
23275 }
23276 }
23277
23278 compare_code = UNKNOWN;
23279 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23280 && CONST_INT_P (op1))
23281 {
23282 if (op1 == const0_rtx
23283 && (code == LT || code == GE))
23284 compare_code = code;
23285 else if (op1 == constm1_rtx)
23286 {
23287 if (code == LE)
23288 compare_code = LT;
23289 else if (code == GT)
23290 compare_code = GE;
23291 }
23292 }
23293
23294 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23295 if (compare_code != UNKNOWN
23296 && GET_MODE (op0) == GET_MODE (out)
23297 && (cf == -1 || ct == -1))
23298 {
23299 /* If the lea code below could be used, only optimize
23300 if it results in a 2-insn sequence. */
23301
23302 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23303 || diff == 3 || diff == 5 || diff == 9)
23304 || (compare_code == LT && ct == -1)
23305 || (compare_code == GE && cf == -1))
23306 {
23307 /*
23308 * notl op1 (if necessary)
23309 * sarl $31, op1
23310 * orl cf, op1
23311 */
23312 if (ct != -1)
23313 {
23314 cf = ct;
23315 ct = -1;
23316 code = reverse_condition (code);
23317 }
23318
23319 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23320
23321 out = expand_simple_binop (mode, IOR,
23322 out, GEN_INT (cf),
23323 out, 1, OPTAB_DIRECT);
23324 if (out != operands[0])
23325 emit_move_insn (operands[0], out);
23326
23327 return true;
23328 }
23329 }
23330
23331
23332 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23333 || diff == 3 || diff == 5 || diff == 9)
23334 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23335 && (mode != DImode
23336 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23337 {
23338 /*
23339 * xorl dest,dest
23340 * cmpl op1,op2
23341 * setcc dest
23342 * lea cf(dest*(ct-cf)),dest
23343 *
23344 * Size 14.
23345 *
23346 * This also catches the degenerate setcc-only case.
23347 */
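/* For instance, with ct = 9 and cf = 1 the setcc leaves 0 or 1 in dest and
   diff = 8, so the select collapses into a single lea 1(dest*8),dest:
   dest becomes 1 when the condition is false and 9 when it is true.  Odd
   diffs such as 3, 5 and 9 use the base+index*scale form, e.g.
   lea cf(dest+dest*2),dest for diff == 3.  */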
23348
23349 rtx tmp;
23350 int nops;
23351
23352 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23353
23354 nops = 0;
23355 /* On x86_64 the lea instruction operates on Pmode, so we need
23356 to get the arithmetic done in the proper mode to match. */
23357 if (diff == 1)
23358 tmp = copy_rtx (out);
23359 else
23360 {
23361 rtx out1;
23362 out1 = copy_rtx (out);
23363 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23364 nops++;
23365 if (diff & 1)
23366 {
23367 tmp = gen_rtx_PLUS (mode, tmp, out1);
23368 nops++;
23369 }
23370 }
23371 if (cf != 0)
23372 {
23373 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23374 nops++;
23375 }
23376 if (!rtx_equal_p (tmp, out))
23377 {
23378 if (nops == 1)
23379 out = force_operand (tmp, copy_rtx (out));
23380 else
23381 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23382 }
23383 if (!rtx_equal_p (out, operands[0]))
23384 emit_move_insn (operands[0], copy_rtx (out));
23385
23386 return true;
23387 }
23388
23389 /*
23390 * General case: Jumpful:
23391 * xorl dest,dest cmpl op1, op2
23392 * cmpl op1, op2 movl ct, dest
23393 * setcc dest jcc 1f
23394 * decl dest movl cf, dest
23395 * andl (cf-ct),dest 1:
23396 * addl ct,dest
23397 *
23398 * Size 20. Size 14.
23399 *
23400 * This is reasonably steep, but branch mispredict costs are
23401 * high on modern cpus, so consider failing only if optimizing
23402 * for space.
23403 */
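/* In the left-hand (branchless) sequence the values flow as follows:
   setcc leaves 1 when the condition holds and 0 otherwise, decl turns
   that into 0 / -1, the andl keeps 0 or cf - ct, and the final addl
   produces ct respectively cf, all without a conditional jump.  */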
23404
23405 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23406 && BRANCH_COST (optimize_insn_for_speed_p (),
23407 false) >= 2)
23408 {
23409 if (cf == 0)
23410 {
23411 machine_mode cmp_mode = GET_MODE (op0);
23412 enum rtx_code new_code;
23413
23414 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23415 {
23416 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23417
23418 /* We may be reversing an unordered compare to a normal compare,
23419 which is not valid in general (we may convert a non-trapping
23420 condition into a trapping one); however, on i386 we currently
23421 emit all comparisons unordered. */
23422 new_code = reverse_condition_maybe_unordered (code);
23423 }
23424 else
23425 {
23426 new_code = ix86_reverse_condition (code, cmp_mode);
23427 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23428 compare_code = reverse_condition (compare_code);
23429 }
23430
23431 if (new_code != UNKNOWN)
23432 {
23433 cf = ct;
23434 ct = 0;
23435 code = new_code;
23436 }
23437 }
23438
23439 if (compare_code != UNKNOWN)
23440 {
23441 /* notl op1 (if needed)
23442 sarl $31, op1
23443 andl (cf-ct), op1
23444 addl ct, op1
23445
23446 For x < 0 (resp. x <= -1) there will be no notl,
23447 so if possible swap the constants to get rid of the
23448 complement.
23449 True/false will be -1/0 while code below (store flag
23450 followed by decrement) is 0/-1, so the constants need
23451 to be exchanged once more. */
23452
23453 if (compare_code == GE || !cf)
23454 {
23455 code = reverse_condition (code);
23456 compare_code = LT;
23457 }
23458 else
23459 std::swap (ct, cf);
23460
23461 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23462 }
23463 else
23464 {
23465 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23466
23467 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23468 constm1_rtx,
23469 copy_rtx (out), 1, OPTAB_DIRECT);
23470 }
23471
23472 out = expand_simple_binop (mode, AND, copy_rtx (out),
23473 gen_int_mode (cf - ct, mode),
23474 copy_rtx (out), 1, OPTAB_DIRECT);
23475 if (ct)
23476 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23477 copy_rtx (out), 1, OPTAB_DIRECT);
23478 if (!rtx_equal_p (out, operands[0]))
23479 emit_move_insn (operands[0], copy_rtx (out));
23480
23481 return true;
23482 }
23483 }
23484
23485 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23486 {
23487 /* Try a few things more with specific constants and a variable. */
23488
23489 optab op;
23490 rtx var, orig_out, out, tmp;
23491
23492 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23493 return false;
23494
23495 /* If one of the two operands is an interesting constant, recurse to load
23496 that constant and then mask the variable in with a logical operation. */
23497
23498 if (CONST_INT_P (operands[2]))
23499 {
23500 var = operands[3];
23501 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23502 operands[3] = constm1_rtx, op = and_optab;
23503 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23504 operands[3] = const0_rtx, op = ior_optab;
23505 else
23506 return false;
23507 }
23508 else if (CONST_INT_P (operands[3]))
23509 {
23510 var = operands[2];
23511 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23512 operands[2] = constm1_rtx, op = and_optab;
23513 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23514 operands[2] = const0_rtx, op = ior_optab;
23515 else
23516 return false;
23517 }
23518 else
23519 return false;
23520
23521 orig_out = operands[0];
23522 tmp = gen_reg_rtx (mode);
23523 operands[0] = tmp;
23524
23525 /* Recurse to get the constant loaded. */
23526 if (!ix86_expand_int_movcc (operands))
23527 return false;
23528
23529 /* Mask in the interesting variable. */
23530 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23531 OPTAB_WIDEN);
23532 if (!rtx_equal_p (out, orig_out))
23533 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23534
23535 return true;
23536 }
23537
23538 /*
23539 * For comparison with above,
23540 *
23541 * movl cf,dest
23542 * movl ct,tmp
23543 * cmpl op1,op2
23544 * cmovcc tmp,dest
23545 *
23546 * Size 15.
23547 */
23548
23549 if (! nonimmediate_operand (operands[2], mode))
23550 operands[2] = force_reg (mode, operands[2]);
23551 if (! nonimmediate_operand (operands[3], mode))
23552 operands[3] = force_reg (mode, operands[3]);
23553
23554 if (! register_operand (operands[2], VOIDmode)
23555 && (mode == QImode
23556 || ! register_operand (operands[3], VOIDmode)))
23557 operands[2] = force_reg (mode, operands[2]);
23558
23559 if (mode == QImode
23560 && ! register_operand (operands[3], VOIDmode))
23561 operands[3] = force_reg (mode, operands[3]);
23562
23563 emit_insn (compare_seq);
23564 emit_insn (gen_rtx_SET (operands[0],
23565 gen_rtx_IF_THEN_ELSE (mode,
23566 compare_op, operands[2],
23567 operands[3])));
23568 return true;
23569 }
23570
23571 /* Swap, force into registers, or otherwise massage the two operands
23572 to an sse comparison with a mask result. Thus we differ a bit from
23573 ix86_prepare_fp_compare_args which expects to produce a flags result.
23574
23575 The DEST operand exists to help determine whether to commute commutative
23576 operators. The POP0/POP1 operands are updated in place. The new
23577 comparison code is returned, or UNKNOWN if not implementable. */
23578
23579 static enum rtx_code
23580 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23581 rtx *pop0, rtx *pop1)
23582 {
23583 switch (code)
23584 {
23585 case LTGT:
23586 case UNEQ:
23587 /* AVX supports all the needed comparisons. */
23588 if (TARGET_AVX)
23589 break;
23590 /* We have no LTGT as an operator. We could implement it with
23591 NE & ORDERED, but this requires an extra temporary. It's
23592 not clear that it's worth it. */
23593 return UNKNOWN;
23594
23595 case LT:
23596 case LE:
23597 case UNGT:
23598 case UNGE:
23599 /* These are supported directly. */
23600 break;
23601
23602 case EQ:
23603 case NE:
23604 case UNORDERED:
23605 case ORDERED:
23606 /* AVX has 3 operand comparisons, no need to swap anything. */
23607 if (TARGET_AVX)
23608 break;
23609 /* For commutative operators, try to canonicalize the destination
23610 operand to be first in the comparison - this helps reload to
23611 avoid extra moves. */
23612 if (!dest || !rtx_equal_p (dest, *pop1))
23613 break;
23614 /* FALLTHRU */
23615
23616 case GE:
23617 case GT:
23618 case UNLE:
23619 case UNLT:
23620 /* These are not supported directly before AVX, and furthermore
23621 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23622 comparison operands to transform into something that is
23623 supported. */
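/* For example, a GT (a, b) comparison is rewritten here as LT (b, a),
   which maps directly onto the cmpltps/cmpltpd encodings available
   since SSE/SSE2.  */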
23624 std::swap (*pop0, *pop1);
23625 code = swap_condition (code);
23626 break;
23627
23628 default:
23629 gcc_unreachable ();
23630 }
23631
23632 return code;
23633 }
23634
23635 /* Detect conditional moves that exactly match min/max operational
23636 semantics. Note that this is IEEE safe, as long as we don't
23637 interchange the operands.
23638
23639 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23640 and TRUE if the operation is successful and instructions are emitted. */
23641
23642 static bool
23643 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23644 rtx cmp_op1, rtx if_true, rtx if_false)
23645 {
23646 machine_mode mode;
23647 bool is_min;
23648 rtx tmp;
23649
23650 if (code == LT)
23651 ;
23652 else if (code == UNGE)
23653 std::swap (if_true, if_false);
23654 else
23655 return false;
23656
23657 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23658 is_min = true;
23659 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23660 is_min = false;
23661 else
23662 return false;
23663
23664 mode = GET_MODE (dest);
23665
23666 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23667 but MODE may be a vector mode and thus not appropriate. */
23668 if (!flag_finite_math_only || flag_signed_zeros)
23669 {
23670 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23671 rtvec v;
23672
23673 if_true = force_reg (mode, if_true);
23674 v = gen_rtvec (2, if_true, if_false);
23675 tmp = gen_rtx_UNSPEC (mode, v, u);
23676 }
23677 else
23678 {
23679 code = is_min ? SMIN : SMAX;
23680 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23681 }
23682
23683 emit_insn (gen_rtx_SET (dest, tmp));
23684 return true;
23685 }
23686
23687 /* Expand an sse vector comparison. Return the register with the result. */
23688
23689 static rtx
23690 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23691 rtx op_true, rtx op_false)
23692 {
23693 machine_mode mode = GET_MODE (dest);
23694 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23695
23696 /* In the general case the result of the comparison can differ from the operands' mode. */
23697 machine_mode cmp_mode;
23698
23699 /* In AVX512F the result of comparison is an integer mask. */
23700 bool maskcmp = false;
23701 rtx x;
23702
23703 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23704 {
23705 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23706 gcc_assert (cmp_mode != BLKmode);
23707
23708 maskcmp = true;
23709 }
23710 else
23711 cmp_mode = cmp_ops_mode;
23712
23713
23714 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23715 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23716 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23717
23718 if (optimize
23719 || (maskcmp && cmp_mode != mode)
23720 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23721 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23722 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23723
23724 /* Compare patterns for int modes are unspec in AVX512F only. */
23725 if (maskcmp && (code == GT || code == EQ))
23726 {
23727 rtx (*gen)(rtx, rtx, rtx);
23728
23729 switch (cmp_ops_mode)
23730 {
23731 case V64QImode:
23732 gcc_assert (TARGET_AVX512BW);
23733 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23734 break;
23735 case V32HImode:
23736 gcc_assert (TARGET_AVX512BW);
23737 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23738 break;
23739 case V16SImode:
23740 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23741 break;
23742 case V8DImode:
23743 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23744 break;
23745 default:
23746 gen = NULL;
23747 }
23748
23749 if (gen)
23750 {
23751 emit_insn (gen (dest, cmp_op0, cmp_op1));
23752 return dest;
23753 }
23754 }
23755 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23756
23757 if (cmp_mode != mode && !maskcmp)
23758 {
23759 x = force_reg (cmp_ops_mode, x);
23760 convert_move (dest, x, false);
23761 }
23762 else
23763 emit_insn (gen_rtx_SET (dest, x));
23764
23765 return dest;
23766 }
23767
23768 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23769 operations. This is used for both scalar and vector conditional moves. */
23770
23771 void
23772 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23773 {
23774 machine_mode mode = GET_MODE (dest);
23775 machine_mode cmpmode = GET_MODE (cmp);
23776
23777 /* In AVX512F the result of comparison is an integer mask. */
23778 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23779
23780 rtx t2, t3, x;
23781
23782 /* If we have an integer mask and an FP value then we need
23783 to cast the mask to FP mode. */
23784 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23785 {
23786 cmp = force_reg (cmpmode, cmp);
23787 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23788 }
23789
23790 if (vector_all_ones_operand (op_true, mode)
23791 && rtx_equal_p (op_false, CONST0_RTX (mode))
23792 && !maskcmp)
23793 {
23794 emit_insn (gen_rtx_SET (dest, cmp));
23795 }
23796 else if (op_false == CONST0_RTX (mode)
23797 && !maskcmp)
23798 {
23799 op_true = force_reg (mode, op_true);
23800 x = gen_rtx_AND (mode, cmp, op_true);
23801 emit_insn (gen_rtx_SET (dest, x));
23802 }
23803 else if (op_true == CONST0_RTX (mode)
23804 && !maskcmp)
23805 {
23806 op_false = force_reg (mode, op_false);
23807 x = gen_rtx_NOT (mode, cmp);
23808 x = gen_rtx_AND (mode, x, op_false);
23809 emit_insn (gen_rtx_SET (dest, x));
23810 }
23811 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23812 && !maskcmp)
23813 {
23814 op_false = force_reg (mode, op_false);
23815 x = gen_rtx_IOR (mode, cmp, op_false);
23816 emit_insn (gen_rtx_SET (dest, x));
23817 }
23818 else if (TARGET_XOP
23819 && !maskcmp)
23820 {
23821 op_true = force_reg (mode, op_true);
23822
23823 if (!nonimmediate_operand (op_false, mode))
23824 op_false = force_reg (mode, op_false);
23825
23826 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23827 op_true,
23828 op_false)));
23829 }
23830 else
23831 {
23832 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23833 rtx d = dest;
23834
23835 if (!nonimmediate_operand (op_true, mode))
23836 op_true = force_reg (mode, op_true);
23837
23838 op_false = force_reg (mode, op_false);
23839
23840 switch (mode)
23841 {
23842 case V4SFmode:
23843 if (TARGET_SSE4_1)
23844 gen = gen_sse4_1_blendvps;
23845 break;
23846 case V2DFmode:
23847 if (TARGET_SSE4_1)
23848 gen = gen_sse4_1_blendvpd;
23849 break;
23850 case V16QImode:
23851 case V8HImode:
23852 case V4SImode:
23853 case V2DImode:
23854 if (TARGET_SSE4_1)
23855 {
23856 gen = gen_sse4_1_pblendvb;
23857 if (mode != V16QImode)
23858 d = gen_reg_rtx (V16QImode);
23859 op_false = gen_lowpart (V16QImode, op_false);
23860 op_true = gen_lowpart (V16QImode, op_true);
23861 cmp = gen_lowpart (V16QImode, cmp);
23862 }
23863 break;
23864 case V8SFmode:
23865 if (TARGET_AVX)
23866 gen = gen_avx_blendvps256;
23867 break;
23868 case V4DFmode:
23869 if (TARGET_AVX)
23870 gen = gen_avx_blendvpd256;
23871 break;
23872 case V32QImode:
23873 case V16HImode:
23874 case V8SImode:
23875 case V4DImode:
23876 if (TARGET_AVX2)
23877 {
23878 gen = gen_avx2_pblendvb;
23879 if (mode != V32QImode)
23880 d = gen_reg_rtx (V32QImode);
23881 op_false = gen_lowpart (V32QImode, op_false);
23882 op_true = gen_lowpart (V32QImode, op_true);
23883 cmp = gen_lowpart (V32QImode, cmp);
23884 }
23885 break;
23886
23887 case V64QImode:
23888 gen = gen_avx512bw_blendmv64qi;
23889 break;
23890 case V32HImode:
23891 gen = gen_avx512bw_blendmv32hi;
23892 break;
23893 case V16SImode:
23894 gen = gen_avx512f_blendmv16si;
23895 break;
23896 case V8DImode:
23897 gen = gen_avx512f_blendmv8di;
23898 break;
23899 case V8DFmode:
23900 gen = gen_avx512f_blendmv8df;
23901 break;
23902 case V16SFmode:
23903 gen = gen_avx512f_blendmv16sf;
23904 break;
23905
23906 default:
23907 break;
23908 }
23909
23910 if (gen != NULL)
23911 {
23912 emit_insn (gen (d, op_false, op_true, cmp));
23913 if (d != dest)
23914 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23915 }
23916 else
23917 {
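/* No blend instruction is available, so fall back to the classic
   bit-select identity dest = (cmp & op_true) | (~cmp & op_false).
   This relies on CMP being a per-element all-ones / all-zeros mask,
   which is what ix86_expand_sse_cmp produces.  */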
23918 op_true = force_reg (mode, op_true);
23919
23920 t2 = gen_reg_rtx (mode);
23921 if (optimize)
23922 t3 = gen_reg_rtx (mode);
23923 else
23924 t3 = dest;
23925
23926 x = gen_rtx_AND (mode, op_true, cmp);
23927 emit_insn (gen_rtx_SET (t2, x));
23928
23929 x = gen_rtx_NOT (mode, cmp);
23930 x = gen_rtx_AND (mode, x, op_false);
23931 emit_insn (gen_rtx_SET (t3, x));
23932
23933 x = gen_rtx_IOR (mode, t3, t2);
23934 emit_insn (gen_rtx_SET (dest, x));
23935 }
23936 }
23937 }
23938
23939 /* Expand a floating-point conditional move. Return true if successful. */
23940
23941 bool
23942 ix86_expand_fp_movcc (rtx operands[])
23943 {
23944 machine_mode mode = GET_MODE (operands[0]);
23945 enum rtx_code code = GET_CODE (operands[1]);
23946 rtx tmp, compare_op;
23947 rtx op0 = XEXP (operands[1], 0);
23948 rtx op1 = XEXP (operands[1], 1);
23949
23950 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23951 {
23952 machine_mode cmode;
23953
23954 /* Since we've no cmove for sse registers, don't force bad register
23955 allocation just to gain access to it. Deny movcc when the
23956 comparison mode doesn't match the move mode. */
23957 cmode = GET_MODE (op0);
23958 if (cmode == VOIDmode)
23959 cmode = GET_MODE (op1);
23960 if (cmode != mode)
23961 return false;
23962
23963 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23964 if (code == UNKNOWN)
23965 return false;
23966
23967 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23968 operands[2], operands[3]))
23969 return true;
23970
23971 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23972 operands[2], operands[3]);
23973 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23974 return true;
23975 }
23976
23977 if (GET_MODE (op0) == TImode
23978 || (GET_MODE (op0) == DImode
23979 && !TARGET_64BIT))
23980 return false;
23981
23982 /* The floating point conditional move instructions don't directly
23983 support conditions resulting from a signed integer comparison. */
23984
23985 compare_op = ix86_expand_compare (code, op0, op1);
23986 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23987 {
23988 tmp = gen_reg_rtx (QImode);
23989 ix86_expand_setcc (tmp, code, op0, op1);
23990
23991 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23992 }
23993
23994 emit_insn (gen_rtx_SET (operands[0],
23995 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23996 operands[2], operands[3])));
23997
23998 return true;
23999 }
24000
24001 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24002
24003 static int
24004 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24005 {
24006 switch (code)
24007 {
24008 case EQ:
24009 return 0;
24010 case LT:
24011 case LTU:
24012 return 1;
24013 case LE:
24014 case LEU:
24015 return 2;
24016 case NE:
24017 return 4;
24018 case GE:
24019 case GEU:
24020 return 5;
24021 case GT:
24022 case GTU:
24023 return 6;
24024 default:
24025 gcc_unreachable ();
24026 }
24027 }
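/* These values follow the immediate encoding of the EVEX vpcmp[u]
   instructions: 0 = eq, 1 = lt, 2 = le, 4 = neq, 5 = nlt (ge),
   6 = nle (gt); predicates 3 (false) and 7 (true) are never needed
   here.  */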
24028
24029 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24030
24031 static int
24032 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24033 {
24034 switch (code)
24035 {
24036 case EQ:
24037 return 0x00;
24038 case NE:
24039 return 0x04;
24040 case GT:
24041 return 0x0e;
24042 case LE:
24043 return 0x02;
24044 case GE:
24045 return 0x0d;
24046 case LT:
24047 return 0x01;
24048 case UNLE:
24049 return 0x0a;
24050 case UNLT:
24051 return 0x09;
24052 case UNGE:
24053 return 0x05;
24054 case UNGT:
24055 return 0x06;
24056 case UNEQ:
24057 return 0x18;
24058 case LTGT:
24059 return 0x0c;
24060 case ORDERED:
24061 return 0x07;
24062 case UNORDERED:
24063 return 0x03;
24064 default:
24065 gcc_unreachable ();
24066 }
24067 }
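/* These match the vcmpps/vcmppd predicate immediates (the _CMP_*
   constants), e.g. 0x01 is LT_OS, 0x0d is GE_OS, 0x03 is UNORD_Q and
   0x18 is EQ_US.  */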
24068
24069 /* Return immediate value to be used in UNSPEC_PCMP
24070 for comparison CODE in MODE. */
24071
24072 static int
24073 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24074 {
24075 if (FLOAT_MODE_P (mode))
24076 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24077 return ix86_int_cmp_code_to_pcmp_immediate (code);
24078 }
24079
24080 /* Expand AVX-512 vector comparison. */
24081
24082 bool
24083 ix86_expand_mask_vec_cmp (rtx operands[])
24084 {
24085 machine_mode mask_mode = GET_MODE (operands[0]);
24086 machine_mode cmp_mode = GET_MODE (operands[2]);
24087 enum rtx_code code = GET_CODE (operands[1]);
24088 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24089 int unspec_code;
24090 rtx unspec;
24091
24092 switch (code)
24093 {
24094 case LEU:
24095 case GTU:
24096 case GEU:
24097 case LTU:
24098 unspec_code = UNSPEC_UNSIGNED_PCMP;
24099 break;
24100
24101 default:
24102 unspec_code = UNSPEC_PCMP;
24103 }
24104
24105 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24106 operands[3], imm),
24107 unspec_code);
24108 emit_insn (gen_rtx_SET (operands[0], unspec));
24109
24110 return true;
24111 }
24112
24113 /* Expand fp vector comparison. */
24114
24115 bool
24116 ix86_expand_fp_vec_cmp (rtx operands[])
24117 {
24118 enum rtx_code code = GET_CODE (operands[1]);
24119 rtx cmp;
24120
24121 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24122 &operands[2], &operands[3]);
24123 if (code == UNKNOWN)
24124 {
24125 rtx temp;
24126 switch (GET_CODE (operands[1]))
24127 {
24128 case LTGT:
24129 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24130 operands[3], NULL, NULL);
24131 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24132 operands[3], NULL, NULL);
24133 code = AND;
24134 break;
24135 case UNEQ:
24136 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24137 operands[3], NULL, NULL);
24138 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24139 operands[3], NULL, NULL);
24140 code = IOR;
24141 break;
24142 default:
24143 gcc_unreachable ();
24144 }
24145 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24146 OPTAB_DIRECT);
24147 }
24148 else
24149 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24150 operands[1], operands[2]);
24151
24152 if (operands[0] != cmp)
24153 emit_move_insn (operands[0], cmp);
24154
24155 return true;
24156 }
24157
24158 static rtx
24159 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24160 rtx op_true, rtx op_false, bool *negate)
24161 {
24162 machine_mode data_mode = GET_MODE (dest);
24163 machine_mode mode = GET_MODE (cop0);
24164 rtx x;
24165
24166 *negate = false;
24167
24168 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24169 if (TARGET_XOP
24170 && (mode == V16QImode || mode == V8HImode
24171 || mode == V4SImode || mode == V2DImode))
24172 ;
24173 else
24174 {
24175 /* Canonicalize the comparison to EQ, GT, GTU. */
24176 switch (code)
24177 {
24178 case EQ:
24179 case GT:
24180 case GTU:
24181 break;
24182
24183 case NE:
24184 case LE:
24185 case LEU:
24186 code = reverse_condition (code);
24187 *negate = true;
24188 break;
24189
24190 case GE:
24191 case GEU:
24192 code = reverse_condition (code);
24193 *negate = true;
24194 /* FALLTHRU */
24195
24196 case LT:
24197 case LTU:
24198 std::swap (cop0, cop1);
24199 code = swap_condition (code);
24200 break;
24201
24202 default:
24203 gcc_unreachable ();
24204 }
24205
24206 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24207 if (mode == V2DImode)
24208 {
24209 switch (code)
24210 {
24211 case EQ:
24212 /* SSE4.1 supports EQ. */
24213 if (!TARGET_SSE4_1)
24214 return NULL;
24215 break;
24216
24217 case GT:
24218 case GTU:
24219 /* SSE4.2 supports GT/GTU. */
24220 if (!TARGET_SSE4_2)
24221 return NULL;
24222 break;
24223
24224 default:
24225 gcc_unreachable ();
24226 }
24227 }
24228
24229 /* Unsigned parallel compare is not supported by the hardware.
24230 Play some tricks to turn this into a signed comparison
24231 against 0. */
24232 if (code == GTU)
24233 {
24234 cop0 = force_reg (mode, cop0);
24235
24236 switch (mode)
24237 {
24238 case V16SImode:
24239 case V8DImode:
24240 case V8SImode:
24241 case V4DImode:
24242 case V4SImode:
24243 case V2DImode:
24244 {
24245 rtx t1, t2, mask;
24246 rtx (*gen_sub3) (rtx, rtx, rtx);
24247
24248 switch (mode)
24249 {
24250 case V16SImode: gen_sub3 = gen_subv16si3; break;
24251 case V8DImode: gen_sub3 = gen_subv8di3; break;
24252 case V8SImode: gen_sub3 = gen_subv8si3; break;
24253 case V4DImode: gen_sub3 = gen_subv4di3; break;
24254 case V4SImode: gen_sub3 = gen_subv4si3; break;
24255 case V2DImode: gen_sub3 = gen_subv2di3; break;
24256 default:
24257 gcc_unreachable ();
24258 }
24259 /* Subtract (-(INT MAX) - 1) from both operands to make
24260 them signed. */
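/* For example, with 32-bit elements 0x80000000 >u 1 holds; after
   subtracting 0x80000000 from both sides the test becomes the signed
   comparison 0 >s -2147483647, which gives the same answer.  */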
24261 mask = ix86_build_signbit_mask (mode, true, false);
24262 t1 = gen_reg_rtx (mode);
24263 emit_insn (gen_sub3 (t1, cop0, mask));
24264
24265 t2 = gen_reg_rtx (mode);
24266 emit_insn (gen_sub3 (t2, cop1, mask));
24267
24268 cop0 = t1;
24269 cop1 = t2;
24270 code = GT;
24271 }
24272 break;
24273
24274 case V64QImode:
24275 case V32HImode:
24276 case V32QImode:
24277 case V16HImode:
24278 case V16QImode:
24279 case V8HImode:
24280 /* Perform a parallel unsigned saturating subtraction. */
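/* cop0 >u cop1 holds exactly when the saturating difference
   cop0 - cop1 is nonzero, so compute EQ against zero instead and let
   *negate flip the final result.  */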
24281 x = gen_reg_rtx (mode);
24282 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24283 cop1)));
24284
24285 cop0 = x;
24286 cop1 = CONST0_RTX (mode);
24287 code = EQ;
24288 *negate = !*negate;
24289 break;
24290
24291 default:
24292 gcc_unreachable ();
24293 }
24294 }
24295 }
24296
24297 if (*negate)
24298 std::swap (op_true, op_false);
24299
24300 /* Allow the comparison to be done in one mode, but the movcc to
24301 happen in another mode. */
24302 if (data_mode == mode)
24303 {
24304 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24305 op_true, op_false);
24306 }
24307 else
24308 {
24309 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24310 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24311 op_true, op_false);
24312 if (GET_MODE (x) == mode)
24313 x = gen_lowpart (data_mode, x);
24314 }
24315
24316 return x;
24317 }
24318
24319 /* Expand integer vector comparison. */
24320
24321 bool
24322 ix86_expand_int_vec_cmp (rtx operands[])
24323 {
24324 rtx_code code = GET_CODE (operands[1]);
24325 bool negate = false;
24326 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24327 operands[3], NULL, NULL, &negate);
24328
24329 if (!cmp)
24330 return false;
24331
24332 if (negate)
24333 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24334 CONST0_RTX (GET_MODE (cmp)),
24335 NULL, NULL, &negate);
24336
24337 gcc_assert (!negate);
24338
24339 if (operands[0] != cmp)
24340 emit_move_insn (operands[0], cmp);
24341
24342 return true;
24343 }
24344
24345 /* Expand a floating-point vector conditional move; a vcond operation
24346 rather than a movcc operation. */
24347
24348 bool
24349 ix86_expand_fp_vcond (rtx operands[])
24350 {
24351 enum rtx_code code = GET_CODE (operands[3]);
24352 rtx cmp;
24353
24354 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24355 &operands[4], &operands[5]);
24356 if (code == UNKNOWN)
24357 {
24358 rtx temp;
24359 switch (GET_CODE (operands[3]))
24360 {
24361 case LTGT:
24362 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24363 operands[5], operands[0], operands[0]);
24364 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24365 operands[5], operands[1], operands[2]);
24366 code = AND;
24367 break;
24368 case UNEQ:
24369 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24370 operands[5], operands[0], operands[0]);
24371 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24372 operands[5], operands[1], operands[2]);
24373 code = IOR;
24374 break;
24375 default:
24376 gcc_unreachable ();
24377 }
24378 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24379 OPTAB_DIRECT);
24380 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24381 return true;
24382 }
24383
24384 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24385 operands[5], operands[1], operands[2]))
24386 return true;
24387
24388 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24389 operands[1], operands[2]);
24390 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24391 return true;
24392 }
24393
24394 /* Expand a signed/unsigned integral vector conditional move. */
24395
24396 bool
24397 ix86_expand_int_vcond (rtx operands[])
24398 {
24399 machine_mode data_mode = GET_MODE (operands[0]);
24400 machine_mode mode = GET_MODE (operands[4]);
24401 enum rtx_code code = GET_CODE (operands[3]);
24402 bool negate = false;
24403 rtx x, cop0, cop1;
24404
24405 cop0 = operands[4];
24406 cop1 = operands[5];
24407
24408 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24409 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
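/* E.g. for V4SImode elements the shift count below is 31: an arithmetic
   right shift by 31 smears the sign bit across the element, giving
   0 / -1, while a logical shift by 31 leaves just 0 / 1.  */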
24410 if ((code == LT || code == GE)
24411 && data_mode == mode
24412 && cop1 == CONST0_RTX (mode)
24413 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24414 && GET_MODE_UNIT_SIZE (data_mode) > 1
24415 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24416 && (GET_MODE_SIZE (data_mode) == 16
24417 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24418 {
24419 rtx negop = operands[2 - (code == LT)];
24420 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24421 if (negop == CONST1_RTX (data_mode))
24422 {
24423 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24424 operands[0], 1, OPTAB_DIRECT);
24425 if (res != operands[0])
24426 emit_move_insn (operands[0], res);
24427 return true;
24428 }
24429 else if (GET_MODE_INNER (data_mode) != DImode
24430 && vector_all_ones_operand (negop, data_mode))
24431 {
24432 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24433 operands[0], 0, OPTAB_DIRECT);
24434 if (res != operands[0])
24435 emit_move_insn (operands[0], res);
24436 return true;
24437 }
24438 }
24439
24440 if (!nonimmediate_operand (cop1, mode))
24441 cop1 = force_reg (mode, cop1);
24442 if (!general_operand (operands[1], data_mode))
24443 operands[1] = force_reg (data_mode, operands[1]);
24444 if (!general_operand (operands[2], data_mode))
24445 operands[2] = force_reg (data_mode, operands[2]);
24446
24447 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24448 operands[1], operands[2], &negate);
24449
24450 if (!x)
24451 return false;
24452
24453 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24454 operands[2-negate]);
24455 return true;
24456 }
24457
24458 /* AVX512F does support 64-byte integer vector operations,
24459 thus the longest vector we are faced with is V64QImode. */
24460 #define MAX_VECT_LEN 64
24461
24462 struct expand_vec_perm_d
24463 {
24464 rtx target, op0, op1;
24465 unsigned char perm[MAX_VECT_LEN];
24466 machine_mode vmode;
24467 unsigned char nelt;
24468 bool one_operand_p;
24469 bool testing_p;
24470 };
24471
24472 static bool
24473 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24474 struct expand_vec_perm_d *d)
24475 {
24476 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24477 expander, so args are either in d, or in op0, op1 etc. */
24478 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24479 machine_mode maskmode = mode;
24480 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24481
24482 switch (mode)
24483 {
24484 case V8HImode:
24485 if (TARGET_AVX512VL && TARGET_AVX512BW)
24486 gen = gen_avx512vl_vpermi2varv8hi3;
24487 break;
24488 case V16HImode:
24489 if (TARGET_AVX512VL && TARGET_AVX512BW)
24490 gen = gen_avx512vl_vpermi2varv16hi3;
24491 break;
24492 case V64QImode:
24493 if (TARGET_AVX512VBMI)
24494 gen = gen_avx512bw_vpermi2varv64qi3;
24495 break;
24496 case V32HImode:
24497 if (TARGET_AVX512BW)
24498 gen = gen_avx512bw_vpermi2varv32hi3;
24499 break;
24500 case V4SImode:
24501 if (TARGET_AVX512VL)
24502 gen = gen_avx512vl_vpermi2varv4si3;
24503 break;
24504 case V8SImode:
24505 if (TARGET_AVX512VL)
24506 gen = gen_avx512vl_vpermi2varv8si3;
24507 break;
24508 case V16SImode:
24509 if (TARGET_AVX512F)
24510 gen = gen_avx512f_vpermi2varv16si3;
24511 break;
24512 case V4SFmode:
24513 if (TARGET_AVX512VL)
24514 {
24515 gen = gen_avx512vl_vpermi2varv4sf3;
24516 maskmode = V4SImode;
24517 }
24518 break;
24519 case V8SFmode:
24520 if (TARGET_AVX512VL)
24521 {
24522 gen = gen_avx512vl_vpermi2varv8sf3;
24523 maskmode = V8SImode;
24524 }
24525 break;
24526 case V16SFmode:
24527 if (TARGET_AVX512F)
24528 {
24529 gen = gen_avx512f_vpermi2varv16sf3;
24530 maskmode = V16SImode;
24531 }
24532 break;
24533 case V2DImode:
24534 if (TARGET_AVX512VL)
24535 gen = gen_avx512vl_vpermi2varv2di3;
24536 break;
24537 case V4DImode:
24538 if (TARGET_AVX512VL)
24539 gen = gen_avx512vl_vpermi2varv4di3;
24540 break;
24541 case V8DImode:
24542 if (TARGET_AVX512F)
24543 gen = gen_avx512f_vpermi2varv8di3;
24544 break;
24545 case V2DFmode:
24546 if (TARGET_AVX512VL)
24547 {
24548 gen = gen_avx512vl_vpermi2varv2df3;
24549 maskmode = V2DImode;
24550 }
24551 break;
24552 case V4DFmode:
24553 if (TARGET_AVX512VL)
24554 {
24555 gen = gen_avx512vl_vpermi2varv4df3;
24556 maskmode = V4DImode;
24557 }
24558 break;
24559 case V8DFmode:
24560 if (TARGET_AVX512F)
24561 {
24562 gen = gen_avx512f_vpermi2varv8df3;
24563 maskmode = V8DImode;
24564 }
24565 break;
24566 default:
24567 break;
24568 }
24569
24570 if (gen == NULL)
24571 return false;
24572
24573 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24574 expander, so args are either in d, or in op0, op1 etc. */
24575 if (d)
24576 {
24577 rtx vec[64];
24578 target = d->target;
24579 op0 = d->op0;
24580 op1 = d->op1;
24581 for (int i = 0; i < d->nelt; ++i)
24582 vec[i] = GEN_INT (d->perm[i]);
24583 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24584 }
24585
24586 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24587 return true;
24588 }
24589
24590 /* Expand a variable vector permutation. */
24591
24592 void
24593 ix86_expand_vec_perm (rtx operands[])
24594 {
24595 rtx target = operands[0];
24596 rtx op0 = operands[1];
24597 rtx op1 = operands[2];
24598 rtx mask = operands[3];
24599 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24600 machine_mode mode = GET_MODE (op0);
24601 machine_mode maskmode = GET_MODE (mask);
24602 int w, e, i;
24603 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24604
24605 /* Number of elements in the vector. */
24606 w = GET_MODE_NUNITS (mode);
24607 e = GET_MODE_UNIT_SIZE (mode);
24608 gcc_assert (w <= 64);
24609
24610 if (TARGET_AVX512F && one_operand_shuffle)
24611 {
24612 rtx (*gen) (rtx, rtx, rtx) = NULL;
24613 switch (mode)
24614 {
24615 case V16SImode:
24616 gen = gen_avx512f_permvarv16si;
24617 break;
24618 case V16SFmode:
24619 gen = gen_avx512f_permvarv16sf;
24620 break;
24621 case V8DImode:
24622 gen = gen_avx512f_permvarv8di;
24623 break;
24624 case V8DFmode:
24625 gen = gen_avx512f_permvarv8df;
24626 break;
24627 default:
24628 break;
24629 }
24630 if (gen != NULL)
24631 {
24632 emit_insn (gen (target, op0, mask));
24633 return;
24634 }
24635 }
24636
24637 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24638 return;
24639
24640 if (TARGET_AVX2)
24641 {
24642 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24643 {
24644 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24645 a constant shuffle operand. With a tiny bit of effort we can
24646 use VPERMD instead. A re-interpretation stall for V4DFmode is
24647 unfortunate but there's no avoiding it.
24648 Similarly for V16HImode we don't have instructions for variable
24649 shuffling, while for V32QImode, after preparing suitable masks,
24650 we can use vpshufb; vpshufb; vpermq; vpor. */
24651
24652 if (mode == V16HImode)
24653 {
24654 maskmode = mode = V32QImode;
24655 w = 32;
24656 e = 1;
24657 }
24658 else
24659 {
24660 maskmode = mode = V8SImode;
24661 w = 8;
24662 e = 4;
24663 }
24664 t1 = gen_reg_rtx (maskmode);
24665
24666 /* Replicate the low bits of the V4DImode mask into V8SImode:
24667 mask = { A B C D }
24668 t1 = { A A B B C C D D }. */
24669 for (i = 0; i < w / 2; ++i)
24670 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24671 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24672 vt = force_reg (maskmode, vt);
24673 mask = gen_lowpart (maskmode, mask);
24674 if (maskmode == V8SImode)
24675 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24676 else
24677 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24678
24679 /* Multiply the shuffle indices by two. */
24680 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24681 OPTAB_DIRECT);
24682
24683 /* Add one to the odd shuffle indices:
24684 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24685 for (i = 0; i < w / 2; ++i)
24686 {
24687 vec[i * 2] = const0_rtx;
24688 vec[i * 2 + 1] = const1_rtx;
24689 }
24690 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24691 vt = validize_mem (force_const_mem (maskmode, vt));
24692 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24693 OPTAB_DIRECT);
24694
24695 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24696 operands[3] = mask = t1;
24697 target = gen_reg_rtx (mode);
24698 op0 = gen_lowpart (mode, op0);
24699 op1 = gen_lowpart (mode, op1);
24700 }
24701
24702 switch (mode)
24703 {
24704 case V8SImode:
24705 /* The VPERMD and VPERMPS instructions already properly ignore
24706 the high bits of the shuffle elements. No need for us to
24707 perform an AND ourselves. */
24708 if (one_operand_shuffle)
24709 {
24710 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24711 if (target != operands[0])
24712 emit_move_insn (operands[0],
24713 gen_lowpart (GET_MODE (operands[0]), target));
24714 }
24715 else
24716 {
24717 t1 = gen_reg_rtx (V8SImode);
24718 t2 = gen_reg_rtx (V8SImode);
24719 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24720 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24721 goto merge_two;
24722 }
24723 return;
24724
24725 case V8SFmode:
24726 mask = gen_lowpart (V8SImode, mask);
24727 if (one_operand_shuffle)
24728 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24729 else
24730 {
24731 t1 = gen_reg_rtx (V8SFmode);
24732 t2 = gen_reg_rtx (V8SFmode);
24733 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24734 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24735 goto merge_two;
24736 }
24737 return;
24738
24739 case V4SImode:
24740 /* By combining the two 128-bit input vectors into one 256-bit
24741 input vector, we can use VPERMD and VPERMPS for the full
24742 two-operand shuffle. */
24743 t1 = gen_reg_rtx (V8SImode);
24744 t2 = gen_reg_rtx (V8SImode);
24745 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24746 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24747 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24748 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24749 return;
24750
24751 case V4SFmode:
24752 t1 = gen_reg_rtx (V8SFmode);
24753 t2 = gen_reg_rtx (V8SImode);
24754 mask = gen_lowpart (V4SImode, mask);
24755 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24756 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24757 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24758 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24759 return;
24760
24761 case V32QImode:
24762 t1 = gen_reg_rtx (V32QImode);
24763 t2 = gen_reg_rtx (V32QImode);
24764 t3 = gen_reg_rtx (V32QImode);
24765 vt2 = GEN_INT (-128);
24766 for (i = 0; i < 32; i++)
24767 vec[i] = vt2;
24768 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24769 vt = force_reg (V32QImode, vt);
24770 for (i = 0; i < 32; i++)
24771 vec[i] = i < 16 ? vt2 : const0_rtx;
24772 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24773 vt2 = force_reg (V32QImode, vt2);
24774 /* From mask create two adjusted masks, which contain the same
24775 bits as mask in the low 7 bits of each vector element.
24776 The first mask will have the most significant bit clear
24777 if it requests element from the same 128-bit lane
24778 and MSB set if it requests element from the other 128-bit lane.
24779 The second mask will have the opposite values of the MSB,
24780 and additionally will have its 128-bit lanes swapped.
24781 E.g. the mask vector { 07 12 1e 09 ... | 17 19 05 1f ... } will have
24782 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24783 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24784 stands for the other 12 bytes. */
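/* In other words: because a 256-bit vpshufb only shuffles within each
   128-bit lane, we build one mask that selects the in-lane elements and
   a second, lane-swapped mask that selects the cross-lane elements,
   shuffle with both and or the two results together.  */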
24785 /* The bit that tells whether an element is from the same lane or the
24786 other lane is bit 4, so shift it up by 3 to the MSB position. */
24787 t5 = gen_reg_rtx (V4DImode);
24788 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24789 GEN_INT (3)));
24790 /* Clear MSB bits from the mask just in case it had them set. */
24791 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24792 /* After this t1 will have MSB set for elements from other lane. */
24793 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24794 /* Clear bits other than MSB. */
24795 emit_insn (gen_andv32qi3 (t1, t1, vt));
24796 /* Or in the lower bits from mask into t3. */
24797 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24798 /* And invert MSB bits in t1, so MSB is set for elements from the same
24799 lane. */
24800 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24801 /* Swap 128-bit lanes in t3. */
24802 t6 = gen_reg_rtx (V4DImode);
24803 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24804 const2_rtx, GEN_INT (3),
24805 const0_rtx, const1_rtx));
24806 /* And or in the lower bits from mask into t1. */
24807 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24808 if (one_operand_shuffle)
24809 {
24810 /* Each of these shuffles will put 0s in places where
24811 element from the other 128-bit lane is needed, otherwise
24812 will shuffle in the requested value. */
24813 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24814 gen_lowpart (V32QImode, t6)));
24815 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24816 /* For t3 the 128-bit lanes are swapped again. */
24817 t7 = gen_reg_rtx (V4DImode);
24818 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24819 const2_rtx, GEN_INT (3),
24820 const0_rtx, const1_rtx));
24821 /* And oring both together leads to the result. */
24822 emit_insn (gen_iorv32qi3 (target, t1,
24823 gen_lowpart (V32QImode, t7)));
24824 if (target != operands[0])
24825 emit_move_insn (operands[0],
24826 gen_lowpart (GET_MODE (operands[0]), target));
24827 return;
24828 }
24829
24830 t4 = gen_reg_rtx (V32QImode);
24831 /* Similar to the one_operand_shuffle code above,
24832 just repeated twice, once for each operand. The merge_two:
24833 code will merge the two results together. */
24834 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24835 gen_lowpart (V32QImode, t6)));
24836 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24837 gen_lowpart (V32QImode, t6)));
24838 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24839 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24840 t7 = gen_reg_rtx (V4DImode);
24841 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24842 const2_rtx, GEN_INT (3),
24843 const0_rtx, const1_rtx));
24844 t8 = gen_reg_rtx (V4DImode);
24845 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24846 const2_rtx, GEN_INT (3),
24847 const0_rtx, const1_rtx));
24848 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24849 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24850 t1 = t4;
24851 t2 = t3;
24852 goto merge_two;
24853
24854 default:
24855 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24856 break;
24857 }
24858 }
24859
24860 if (TARGET_XOP)
24861 {
24862 /* The XOP VPPERM insn supports three inputs. By ignoring the
24863 one_operand_shuffle special case, we avoid creating another
24864 set of constant vectors in memory. */
24865 one_operand_shuffle = false;
24866
24867 /* mask = mask & {2*w-1, ...} */
24868 vt = GEN_INT (2*w - 1);
24869 }
24870 else
24871 {
24872 /* mask = mask & {w-1, ...} */
24873 vt = GEN_INT (w - 1);
24874 }
24875
24876 for (i = 0; i < w; i++)
24877 vec[i] = vt;
24878 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24879 mask = expand_simple_binop (maskmode, AND, mask, vt,
24880 NULL_RTX, 0, OPTAB_DIRECT);
24881
24882 /* For non-QImode operations, convert the word permutation control
24883 into a byte permutation control. */
24884 if (mode != V16QImode)
24885 {
24886 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24887 GEN_INT (exact_log2 (e)),
24888 NULL_RTX, 0, OPTAB_DIRECT);
24889
24890 /* Convert mask to vector of chars. */
24891 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24892
24893 /* Replicate each of the input bytes into byte positions:
24894 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24895 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24896 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24897 for (i = 0; i < 16; ++i)
24898 vec[i] = GEN_INT (i/e * e);
24899 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24900 vt = validize_mem (force_const_mem (V16QImode, vt));
24901 if (TARGET_XOP)
24902 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24903 else
24904 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24905
24906 /* Convert it into the byte positions by doing
24907 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24908 for (i = 0; i < 16; ++i)
24909 vec[i] = GEN_INT (i % e);
24910 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24911 vt = validize_mem (force_const_mem (V16QImode, vt));
24912 emit_insn (gen_addv16qi3 (mask, mask, vt));
24913 }
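/* Worked example for V4SImode: a control value of 2 in element 0 becomes
   8 after the shift, the pshufb (or pperm) above replicates it to
   { 8, 8, 8, 8 }, and the final addition turns that into the byte
   indices { 8, 9, 10, 11 }, i.e. exactly the bytes of source element 2.  */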
24914
24915 /* The actual shuffle operations all operate on V16QImode. */
24916 op0 = gen_lowpart (V16QImode, op0);
24917 op1 = gen_lowpart (V16QImode, op1);
24918
24919 if (TARGET_XOP)
24920 {
24921 if (GET_MODE (target) != V16QImode)
24922 target = gen_reg_rtx (V16QImode);
24923 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24924 if (target != operands[0])
24925 emit_move_insn (operands[0],
24926 gen_lowpart (GET_MODE (operands[0]), target));
24927 }
24928 else if (one_operand_shuffle)
24929 {
24930 if (GET_MODE (target) != V16QImode)
24931 target = gen_reg_rtx (V16QImode);
24932 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24933 if (target != operands[0])
24934 emit_move_insn (operands[0],
24935 gen_lowpart (GET_MODE (operands[0]), target));
24936 }
24937 else
24938 {
24939 rtx xops[6];
24940 bool ok;
24941
24942 /* Shuffle the two input vectors independently. */
24943 t1 = gen_reg_rtx (V16QImode);
24944 t2 = gen_reg_rtx (V16QImode);
24945 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24946 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24947
24948 merge_two:
24949 /* Then merge them together. The key is whether any given control
24950 element contained a bit set that indicates the second word. */
24951 mask = operands[3];
24952 vt = GEN_INT (w);
24953 if (maskmode == V2DImode && !TARGET_SSE4_1)
24954 {
24955 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24956 more shuffle to convert the V2DI input mask into a V4SI
24957 input mask. At which point the masking that expand_int_vcond
24958 performs will work as desired. */
24959 rtx t3 = gen_reg_rtx (V4SImode);
24960 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24961 const0_rtx, const0_rtx,
24962 const2_rtx, const2_rtx));
24963 mask = t3;
24964 maskmode = V4SImode;
24965 e = w = 4;
24966 }
24967
24968 for (i = 0; i < w; i++)
24969 vec[i] = vt;
24970 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24971 vt = force_reg (maskmode, vt);
24972 mask = expand_simple_binop (maskmode, AND, mask, vt,
24973 NULL_RTX, 0, OPTAB_DIRECT);
24974
24975 if (GET_MODE (target) != mode)
24976 target = gen_reg_rtx (mode);
24977 xops[0] = target;
24978 xops[1] = gen_lowpart (mode, t2);
24979 xops[2] = gen_lowpart (mode, t1);
24980 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24981 xops[4] = mask;
24982 xops[5] = vt;
24983 ok = ix86_expand_int_vcond (xops);
24984 gcc_assert (ok);
24985 if (target != operands[0])
24986 emit_move_insn (operands[0],
24987 gen_lowpart (GET_MODE (operands[0]), target));
24988 }
24989 }
24990
24991 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
24992 true if we should do zero extension, else sign extension. HIGH_P is
24993 true if we want the N/2 high elements, else the low elements. */
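/* Without SSE4.1 the unpack below is done with interleaves: each element
   is paired either with zero (zero extension) or with a mask of its own
   sign bits obtained by comparing 0 > SRC (sign extension).  With SSE4.1
   and later the pmovzx/pmovsx patterns extend directly.  */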
24994
24995 void
24996 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24997 {
24998 machine_mode imode = GET_MODE (src);
24999 rtx tmp;
25000
25001 if (TARGET_SSE4_1)
25002 {
25003 rtx (*unpack)(rtx, rtx);
25004 rtx (*extract)(rtx, rtx) = NULL;
25005 machine_mode halfmode = BLKmode;
25006
25007 switch (imode)
25008 {
25009 case V64QImode:
25010 if (unsigned_p)
25011 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25012 else
25013 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25014 halfmode = V32QImode;
25015 extract
25016 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25017 break;
25018 case V32QImode:
25019 if (unsigned_p)
25020 unpack = gen_avx2_zero_extendv16qiv16hi2;
25021 else
25022 unpack = gen_avx2_sign_extendv16qiv16hi2;
25023 halfmode = V16QImode;
25024 extract
25025 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25026 break;
25027 case V32HImode:
25028 if (unsigned_p)
25029 unpack = gen_avx512f_zero_extendv16hiv16si2;
25030 else
25031 unpack = gen_avx512f_sign_extendv16hiv16si2;
25032 halfmode = V16HImode;
25033 extract
25034 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25035 break;
25036 case V16HImode:
25037 if (unsigned_p)
25038 unpack = gen_avx2_zero_extendv8hiv8si2;
25039 else
25040 unpack = gen_avx2_sign_extendv8hiv8si2;
25041 halfmode = V8HImode;
25042 extract
25043 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25044 break;
25045 case V16SImode:
25046 if (unsigned_p)
25047 unpack = gen_avx512f_zero_extendv8siv8di2;
25048 else
25049 unpack = gen_avx512f_sign_extendv8siv8di2;
25050 halfmode = V8SImode;
25051 extract
25052 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25053 break;
25054 case V8SImode:
25055 if (unsigned_p)
25056 unpack = gen_avx2_zero_extendv4siv4di2;
25057 else
25058 unpack = gen_avx2_sign_extendv4siv4di2;
25059 halfmode = V4SImode;
25060 extract
25061 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25062 break;
25063 case V16QImode:
25064 if (unsigned_p)
25065 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25066 else
25067 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25068 break;
25069 case V8HImode:
25070 if (unsigned_p)
25071 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25072 else
25073 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25074 break;
25075 case V4SImode:
25076 if (unsigned_p)
25077 unpack = gen_sse4_1_zero_extendv2siv2di2;
25078 else
25079 unpack = gen_sse4_1_sign_extendv2siv2di2;
25080 break;
25081 default:
25082 gcc_unreachable ();
25083 }
25084
25085 if (GET_MODE_SIZE (imode) >= 32)
25086 {
25087 tmp = gen_reg_rtx (halfmode);
25088 emit_insn (extract (tmp, src));
25089 }
25090 else if (high_p)
25091 {
25092 /* Shift higher 8 bytes to lower 8 bytes. */
25093 tmp = gen_reg_rtx (V1TImode);
25094 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25095 GEN_INT (64)));
25096 tmp = gen_lowpart (imode, tmp);
25097 }
25098 else
25099 tmp = src;
25100
25101 emit_insn (unpack (dest, tmp));
25102 }
25103 else
25104 {
25105 rtx (*unpack)(rtx, rtx, rtx);
25106
25107 switch (imode)
25108 {
25109 case V16QImode:
25110 if (high_p)
25111 unpack = gen_vec_interleave_highv16qi;
25112 else
25113 unpack = gen_vec_interleave_lowv16qi;
25114 break;
25115 case V8HImode:
25116 if (high_p)
25117 unpack = gen_vec_interleave_highv8hi;
25118 else
25119 unpack = gen_vec_interleave_lowv8hi;
25120 break;
25121 case V4SImode:
25122 if (high_p)
25123 unpack = gen_vec_interleave_highv4si;
25124 else
25125 unpack = gen_vec_interleave_lowv4si;
25126 break;
25127 default:
25128 gcc_unreachable ();
25129 }
25130
25131 if (unsigned_p)
25132 tmp = force_reg (imode, CONST0_RTX (imode));
25133 else
25134 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25135 src, pc_rtx, pc_rtx);
25136
25137 rtx tmp2 = gen_reg_rtx (imode);
25138 emit_insn (unpack (tmp2, src, tmp));
25139 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25140 }
25141 }
25142
25143 /* Expand conditional increment or decrement using adc/sbb instructions.
25144 The default case using setcc followed by the conditional move can be
25145 done by generic code. */
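/* For instance, x = y + (a <u b) can be emitted as a compare followed by
   a single adc $0 style instruction, folding the carry straight into the
   addition instead of materializing the flag with setcc first.  */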
25146 bool
25147 ix86_expand_int_addcc (rtx operands[])
25148 {
25149 enum rtx_code code = GET_CODE (operands[1]);
25150 rtx flags;
25151 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25152 rtx compare_op;
25153 rtx val = const0_rtx;
25154 bool fpcmp = false;
25155 machine_mode mode;
25156 rtx op0 = XEXP (operands[1], 0);
25157 rtx op1 = XEXP (operands[1], 1);
25158
25159 if (operands[3] != const1_rtx
25160 && operands[3] != constm1_rtx)
25161 return false;
25162 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25163 return false;
25164 code = GET_CODE (compare_op);
25165
25166 flags = XEXP (compare_op, 0);
25167
25168 if (GET_MODE (flags) == CCFPmode
25169 || GET_MODE (flags) == CCFPUmode)
25170 {
25171 fpcmp = true;
25172 code = ix86_fp_compare_code_to_integer (code);
25173 }
25174
25175 if (code != LTU)
25176 {
25177 val = constm1_rtx;
25178 if (fpcmp)
25179 PUT_CODE (compare_op,
25180 reverse_condition_maybe_unordered
25181 (GET_CODE (compare_op)));
25182 else
25183 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25184 }
25185
25186 mode = GET_MODE (operands[0]);
25187
25188 /* Construct either adc or sbb insn. */
25189 if ((code == LTU) == (operands[3] == constm1_rtx))
25190 {
25191 switch (mode)
25192 {
25193 case QImode:
25194 insn = gen_subqi3_carry;
25195 break;
25196 case HImode:
25197 insn = gen_subhi3_carry;
25198 break;
25199 case SImode:
25200 insn = gen_subsi3_carry;
25201 break;
25202 case DImode:
25203 insn = gen_subdi3_carry;
25204 break;
25205 default:
25206 gcc_unreachable ();
25207 }
25208 }
25209 else
25210 {
25211 switch (mode)
25212 {
25213 case QImode:
25214 insn = gen_addqi3_carry;
25215 break;
25216 case HImode:
25217 insn = gen_addhi3_carry;
25218 break;
25219 case SImode:
25220 insn = gen_addsi3_carry;
25221 break;
25222 case DImode:
25223 insn = gen_adddi3_carry;
25224 break;
25225 default:
25226 gcc_unreachable ();
25227 }
25228 }
25229 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25230
25231 return true;
25232 }
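
/* As an illustration of the transformation above (a sketch, not the
   expander's literal output): a conditional increment guarded by an
   unsigned compare,

       if (a < b)
         x++;

   is emitted branch-free as the equivalent of

       x += (a < b);       the compare sets the carry flag, then adc x, 0

   and a conditional decrement likewise becomes an sbb with 0, or an
   adc/sbb with -1 when the condition had to be reversed above.  */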
25233
25234
25235 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25236 but works for floating point parameters and non-offsettable memories.
25237 For pushes, it returns just stack offsets; the values will be saved
25238 in the right order. At most four parts are generated. */
25239
25240 static int
25241 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25242 {
25243 int size;
25244
25245 if (!TARGET_64BIT)
25246 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25247 else
25248 size = (GET_MODE_SIZE (mode) + 4) / 8;
25249
25250 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25251 gcc_assert (size >= 2 && size <= 4);
25252
25253 /* Optimize constant pool reference to immediates. This is used by fp
25254 moves, that force all constants to memory to allow combining. */
25255 if (MEM_P (operand) && MEM_READONLY_P (operand))
25256 {
25257 rtx tmp = maybe_get_pool_constant (operand);
25258 if (tmp)
25259 operand = tmp;
25260 }
25261
25262 if (MEM_P (operand) && !offsettable_memref_p (operand))
25263 {
25264 /* The only non-offsettable memories we handle are pushes. */
25265 int ok = push_operand (operand, VOIDmode);
25266
25267 gcc_assert (ok);
25268
25269 operand = copy_rtx (operand);
25270 PUT_MODE (operand, word_mode);
25271 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25272 return size;
25273 }
25274
25275 if (GET_CODE (operand) == CONST_VECTOR)
25276 {
25277 machine_mode imode = int_mode_for_mode (mode);
25278 /* Caution: if we looked through a constant pool memory above,
25279 the operand may actually have a different mode now. That's
25280 ok, since we want to pun this all the way back to an integer. */
25281 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25282 gcc_assert (operand != NULL);
25283 mode = imode;
25284 }
25285
25286 if (!TARGET_64BIT)
25287 {
25288 if (mode == DImode)
25289 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25290 else
25291 {
25292 int i;
25293
25294 if (REG_P (operand))
25295 {
25296 gcc_assert (reload_completed);
25297 for (i = 0; i < size; i++)
25298 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25299 }
25300 else if (offsettable_memref_p (operand))
25301 {
25302 operand = adjust_address (operand, SImode, 0);
25303 parts[0] = operand;
25304 for (i = 1; i < size; i++)
25305 parts[i] = adjust_address (operand, SImode, 4 * i);
25306 }
25307 else if (CONST_DOUBLE_P (operand))
25308 {
25309 const REAL_VALUE_TYPE *r;
25310 long l[4];
25311
25312 r = CONST_DOUBLE_REAL_VALUE (operand);
25313 switch (mode)
25314 {
25315 case TFmode:
25316 real_to_target (l, r, mode);
25317 parts[3] = gen_int_mode (l[3], SImode);
25318 parts[2] = gen_int_mode (l[2], SImode);
25319 break;
25320 case XFmode:
25321 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25322 long double may not be 80-bit. */
25323 real_to_target (l, r, mode);
25324 parts[2] = gen_int_mode (l[2], SImode);
25325 break;
25326 case DFmode:
25327 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25328 break;
25329 default:
25330 gcc_unreachable ();
25331 }
25332 parts[1] = gen_int_mode (l[1], SImode);
25333 parts[0] = gen_int_mode (l[0], SImode);
25334 }
25335 else
25336 gcc_unreachable ();
25337 }
25338 }
25339 else
25340 {
25341 if (mode == TImode)
25342 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25343 if (mode == XFmode || mode == TFmode)
25344 {
25345 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25346 if (REG_P (operand))
25347 {
25348 gcc_assert (reload_completed);
25349 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25350 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25351 }
25352 else if (offsettable_memref_p (operand))
25353 {
25354 operand = adjust_address (operand, DImode, 0);
25355 parts[0] = operand;
25356 parts[1] = adjust_address (operand, upper_mode, 8);
25357 }
25358 else if (CONST_DOUBLE_P (operand))
25359 {
25360 long l[4];
25361
25362 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25363
25364 /* real_to_target puts 32-bit pieces in each long. */
25365 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25366 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25367 << 32), DImode);
25368
25369 if (upper_mode == SImode)
25370 parts[1] = gen_int_mode (l[2], SImode);
25371 else
25372 parts[1]
25373 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25374 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25375 << 32), DImode);
25376 }
25377 else
25378 gcc_unreachable ();
25379 }
25380 }
25381
25382 return size;
25383 }
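
/* For illustration (a sketch of the common cases rather than an exhaustive
   list): on a 32-bit target the operand is cut into SImode words, so
   DImode/DFmode yield 2 parts, XFmode yields 3 and TFmode yields 4, while a
   64-bit target uses DImode words (with an SImode upper part for XFmode).
   A DFmode constant such as 1.0 therefore becomes the two SImode immediates
   0x00000000 (low) and 0x3ff00000 (high) on a little-endian 32-bit target.  */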
25384
25385 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25386 All required insns are emitted directly. The split destination parts
25387 are placed in operands 2-5 and the corresponding source parts in
25388 operands 6-9 (one pair per part), in the order in which they are moved. */
25389
25390 void
25391 ix86_split_long_move (rtx operands[])
25392 {
25393 rtx part[2][4];
25394 int nparts, i, j;
25395 int push = 0;
25396 int collisions = 0;
25397 machine_mode mode = GET_MODE (operands[0]);
25398 bool collisionparts[4];
25399
25400 /* The DFmode expanders may ask us to move a double.
25401 For a 64-bit target this is a single move. By hiding that fact
25402 here we simplify the i386.md splitters. */
25403 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25404 {
25405 /* Optimize constant pool reference to immediates. This is used by
25406 fp moves, that force all constants to memory to allow combining. */
25407
25408 if (MEM_P (operands[1])
25409 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25410 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25411 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25412 if (push_operand (operands[0], VOIDmode))
25413 {
25414 operands[0] = copy_rtx (operands[0]);
25415 PUT_MODE (operands[0], word_mode);
25416 }
25417 else
25418 operands[0] = gen_lowpart (DImode, operands[0]);
25419 operands[1] = gen_lowpart (DImode, operands[1]);
25420 emit_move_insn (operands[0], operands[1]);
25421 return;
25422 }
25423
25424 /* The only non-offsettable memory we handle is push. */
25425 if (push_operand (operands[0], VOIDmode))
25426 push = 1;
25427 else
25428 gcc_assert (!MEM_P (operands[0])
25429 || offsettable_memref_p (operands[0]));
25430
25431 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25432 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25433
25434 /* When emitting push, take care for source operands on the stack. */
25435 if (push && MEM_P (operands[1])
25436 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25437 {
25438 rtx src_base = XEXP (part[1][nparts - 1], 0);
25439
25440 /* Compensate for the stack decrement by 4. */
25441 if (!TARGET_64BIT && nparts == 3
25442 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25443 src_base = plus_constant (Pmode, src_base, 4);
25444
25445 /* src_base refers to the stack pointer and is
25446 automatically decreased by emitted push. */
25447 for (i = 0; i < nparts; i++)
25448 part[1][i] = change_address (part[1][i],
25449 GET_MODE (part[1][i]), src_base);
25450 }
25451
25452 /* We need to do copy in the right order in case an address register
25453 of the source overlaps the destination. */
25454 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25455 {
25456 rtx tmp;
25457
25458 for (i = 0; i < nparts; i++)
25459 {
25460 collisionparts[i]
25461 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25462 if (collisionparts[i])
25463 collisions++;
25464 }
25465
25466 /* Collision in the middle part can be handled by reordering. */
25467 if (collisions == 1 && nparts == 3 && collisionparts [1])
25468 {
25469 std::swap (part[0][1], part[0][2]);
25470 std::swap (part[1][1], part[1][2]);
25471 }
25472 else if (collisions == 1
25473 && nparts == 4
25474 && (collisionparts [1] || collisionparts [2]))
25475 {
25476 if (collisionparts [1])
25477 {
25478 std::swap (part[0][1], part[0][2]);
25479 std::swap (part[1][1], part[1][2]);
25480 }
25481 else
25482 {
25483 std::swap (part[0][2], part[0][3]);
25484 std::swap (part[1][2], part[1][3]);
25485 }
25486 }
25487
25488 /* If there are more collisions, we can't handle it by reordering.
25489 Do an lea to the last part and use only one colliding move. */
25490 else if (collisions > 1)
25491 {
25492 rtx base, addr, tls_base = NULL_RTX;
25493
25494 collisions = 1;
25495
25496 base = part[0][nparts - 1];
25497
25498 /* Handle the case when the last part isn't valid for lea.
25499 Happens in 64-bit mode storing the 12-byte XFmode. */
25500 if (GET_MODE (base) != Pmode)
25501 base = gen_rtx_REG (Pmode, REGNO (base));
25502
25503 addr = XEXP (part[1][0], 0);
25504 if (TARGET_TLS_DIRECT_SEG_REFS)
25505 {
25506 struct ix86_address parts;
25507 int ok = ix86_decompose_address (addr, &parts);
25508 gcc_assert (ok);
25509 if (parts.seg == DEFAULT_TLS_SEG_REG)
25510 {
25511 /* It is not valid to use %gs: or %fs: in
25512 lea though, so we need to remove it from the
25513 address used for lea and add it to each individual
25514 memory load instead. */
25515 addr = copy_rtx (addr);
25516 rtx *x = &addr;
25517 while (GET_CODE (*x) == PLUS)
25518 {
25519 for (i = 0; i < 2; i++)
25520 {
25521 rtx u = XEXP (*x, i);
25522 if (GET_CODE (u) == ZERO_EXTEND)
25523 u = XEXP (u, 0);
25524 if (GET_CODE (u) == UNSPEC
25525 && XINT (u, 1) == UNSPEC_TP)
25526 {
25527 tls_base = XEXP (*x, i);
25528 *x = XEXP (*x, 1 - i);
25529 break;
25530 }
25531 }
25532 if (tls_base)
25533 break;
25534 x = &XEXP (*x, 0);
25535 }
25536 gcc_assert (tls_base);
25537 }
25538 }
25539 emit_insn (gen_rtx_SET (base, addr));
25540 if (tls_base)
25541 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25542 part[1][0] = replace_equiv_address (part[1][0], base);
25543 for (i = 1; i < nparts; i++)
25544 {
25545 if (tls_base)
25546 base = copy_rtx (base);
25547 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25548 part[1][i] = replace_equiv_address (part[1][i], tmp);
25549 }
25550 }
25551 }
25552
25553 if (push)
25554 {
25555 if (!TARGET_64BIT)
25556 {
25557 if (nparts == 3)
25558 {
25559 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25560 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25561 stack_pointer_rtx, GEN_INT (-4)));
25562 emit_move_insn (part[0][2], part[1][2]);
25563 }
25564 else if (nparts == 4)
25565 {
25566 emit_move_insn (part[0][3], part[1][3]);
25567 emit_move_insn (part[0][2], part[1][2]);
25568 }
25569 }
25570 else
25571 {
25572 /* In 64-bit mode we don't have a 32-bit push available. In case this is
25573 a register, it is OK - we will just use the larger counterpart. We also
25574 retype the memory - this comes from an attempt to avoid a REX prefix on
25575 the move of the second half of a TFmode value. */
25576 if (GET_MODE (part[1][1]) == SImode)
25577 {
25578 switch (GET_CODE (part[1][1]))
25579 {
25580 case MEM:
25581 part[1][1] = adjust_address (part[1][1], DImode, 0);
25582 break;
25583
25584 case REG:
25585 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25586 break;
25587
25588 default:
25589 gcc_unreachable ();
25590 }
25591
25592 if (GET_MODE (part[1][0]) == SImode)
25593 part[1][0] = part[1][1];
25594 }
25595 }
25596 emit_move_insn (part[0][1], part[1][1]);
25597 emit_move_insn (part[0][0], part[1][0]);
25598 return;
25599 }
25600
25601 /* Choose correct order to not overwrite the source before it is copied. */
25602 if ((REG_P (part[0][0])
25603 && REG_P (part[1][1])
25604 && (REGNO (part[0][0]) == REGNO (part[1][1])
25605 || (nparts == 3
25606 && REGNO (part[0][0]) == REGNO (part[1][2]))
25607 || (nparts == 4
25608 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25609 || (collisions > 0
25610 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25611 {
25612 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25613 {
25614 operands[2 + i] = part[0][j];
25615 operands[6 + i] = part[1][j];
25616 }
25617 }
25618 else
25619 {
25620 for (i = 0; i < nparts; i++)
25621 {
25622 operands[2 + i] = part[0][i];
25623 operands[6 + i] = part[1][i];
25624 }
25625 }
25626
25627 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25628 if (optimize_insn_for_size_p ())
25629 {
25630 for (j = 0; j < nparts - 1; j++)
25631 if (CONST_INT_P (operands[6 + j])
25632 && operands[6 + j] != const0_rtx
25633 && REG_P (operands[2 + j]))
25634 for (i = j; i < nparts - 1; i++)
25635 if (CONST_INT_P (operands[7 + i])
25636 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25637 operands[7 + i] = operands[2 + j];
25638 }
25639
25640 for (i = 0; i < nparts; i++)
25641 emit_move_insn (operands[2 + i], operands[6 + i]);
25642
25643 return;
25644 }
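
/* Ordering example for the collision handling above (illustrative only):
   moving a DImode value from memory addressed by %eax into %eax/%edx
   (low/high word) on ia32.  Copying the low word first would clobber the
   address register, so the parts are emitted high word first, roughly

       movl 4(%eax), %edx
       movl (%eax), %eax

   which is what the reversed operand order set up above produces.  */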
25645
25646 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25647 left shift by a constant, either using a single shift or
25648 a sequence of add instructions. */
25649
25650 static void
25651 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25652 {
25653 rtx (*insn)(rtx, rtx, rtx);
25654
25655 if (count == 1
25656 || (count * ix86_cost->add <= ix86_cost->shift_const
25657 && !optimize_insn_for_size_p ()))
25658 {
25659 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25660 while (count-- > 0)
25661 emit_insn (insn (operand, operand, operand));
25662 }
25663 else
25664 {
25665 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25666 emit_insn (insn (operand, operand, GEN_INT (count)));
25667 }
25668 }
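
/* Cost illustration for the helper above: when an add is cheap enough
   relative to a constant shift (and we are not optimizing for size), a
   small left shift is emitted as repeated doubling, e.g.

       x <<= 2;    becomes    x += x;  x += x;

   while larger counts use a single shift instruction instead.  */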
25669
25670 void
25671 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25672 {
25673 rtx (*gen_ashl3)(rtx, rtx, rtx);
25674 rtx (*gen_shld)(rtx, rtx, rtx);
25675 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25676
25677 rtx low[2], high[2];
25678 int count;
25679
25680 if (CONST_INT_P (operands[2]))
25681 {
25682 split_double_mode (mode, operands, 2, low, high);
25683 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25684
25685 if (count >= half_width)
25686 {
25687 emit_move_insn (high[0], low[1]);
25688 emit_move_insn (low[0], const0_rtx);
25689
25690 if (count > half_width)
25691 ix86_expand_ashl_const (high[0], count - half_width, mode);
25692 }
25693 else
25694 {
25695 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25696
25697 if (!rtx_equal_p (operands[0], operands[1]))
25698 emit_move_insn (operands[0], operands[1]);
25699
25700 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25701 ix86_expand_ashl_const (low[0], count, mode);
25702 }
25703 return;
25704 }
25705
25706 split_double_mode (mode, operands, 1, low, high);
25707
25708 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25709
25710 if (operands[1] == const1_rtx)
25711 {
25712 /* Assuming we've chosen QImode-capable registers, then 1 << N
25713 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25714 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25715 {
25716 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25717
25718 ix86_expand_clear (low[0]);
25719 ix86_expand_clear (high[0]);
25720 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25721
25722 d = gen_lowpart (QImode, low[0]);
25723 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25724 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25725 emit_insn (gen_rtx_SET (d, s));
25726
25727 d = gen_lowpart (QImode, high[0]);
25728 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25729 s = gen_rtx_NE (QImode, flags, const0_rtx);
25730 emit_insn (gen_rtx_SET (d, s));
25731 }
25732
25733 /* Otherwise, we can get the same results by manually performing
25734 a bit extract operation on bit 5/6, and then performing the two
25735 shifts. The two methods of getting 0/1 into low/high are exactly
25736 the same size. Avoiding the shift in the bit extract case helps
25737 pentium4 a bit; no one else seems to care much either way. */
25738 else
25739 {
25740 machine_mode half_mode;
25741 rtx (*gen_lshr3)(rtx, rtx, rtx);
25742 rtx (*gen_and3)(rtx, rtx, rtx);
25743 rtx (*gen_xor3)(rtx, rtx, rtx);
25744 HOST_WIDE_INT bits;
25745 rtx x;
25746
25747 if (mode == DImode)
25748 {
25749 half_mode = SImode;
25750 gen_lshr3 = gen_lshrsi3;
25751 gen_and3 = gen_andsi3;
25752 gen_xor3 = gen_xorsi3;
25753 bits = 5;
25754 }
25755 else
25756 {
25757 half_mode = DImode;
25758 gen_lshr3 = gen_lshrdi3;
25759 gen_and3 = gen_anddi3;
25760 gen_xor3 = gen_xordi3;
25761 bits = 6;
25762 }
25763
25764 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25765 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25766 else
25767 x = gen_lowpart (half_mode, operands[2]);
25768 emit_insn (gen_rtx_SET (high[0], x));
25769
25770 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25771 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25772 emit_move_insn (low[0], high[0]);
25773 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25774 }
25775
25776 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25777 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25778 return;
25779 }
25780
25781 if (operands[1] == constm1_rtx)
25782 {
25783 /* For -1 << N, we can avoid the shld instruction, because we
25784 know that we're shifting 0...31/63 ones into a -1. */
25785 emit_move_insn (low[0], constm1_rtx);
25786 if (optimize_insn_for_size_p ())
25787 emit_move_insn (high[0], low[0]);
25788 else
25789 emit_move_insn (high[0], constm1_rtx);
25790 }
25791 else
25792 {
25793 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25794
25795 if (!rtx_equal_p (operands[0], operands[1]))
25796 emit_move_insn (operands[0], operands[1]);
25797
25798 split_double_mode (mode, operands, 1, low, high);
25799 emit_insn (gen_shld (high[0], low[0], operands[2]));
25800 }
25801
25802 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25803
25804 if (TARGET_CMOVE && scratch)
25805 {
25806 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25807 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25808
25809 ix86_expand_clear (scratch);
25810 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25811 }
25812 else
25813 {
25814 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25815 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25816
25817 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25818 }
25819 }
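
/* The variable-count path above follows the standard double-word left
   shift decomposition; a minimal sketch in plain C for a 64-bit value held
   as two 32-bit halves (names are illustrative only):

       unsigned int c = n & 31;
       hi = c ? (hi << c) | (lo >> (32 - c)) : hi;     shld
       lo = lo << c;                                   shl
       if (n & 32)                                     shift adjustment
         {
           hi = lo;
           lo = 0;
         }

   The adjustment step uses cmov when available (the scratch register path
   above); otherwise a branch-based pattern is used.  */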
25820
25821 void
25822 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25823 {
25824 rtx (*gen_ashr3)(rtx, rtx, rtx)
25825 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25826 rtx (*gen_shrd)(rtx, rtx, rtx);
25827 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25828
25829 rtx low[2], high[2];
25830 int count;
25831
25832 if (CONST_INT_P (operands[2]))
25833 {
25834 split_double_mode (mode, operands, 2, low, high);
25835 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25836
25837 if (count == GET_MODE_BITSIZE (mode) - 1)
25838 {
25839 emit_move_insn (high[0], high[1]);
25840 emit_insn (gen_ashr3 (high[0], high[0],
25841 GEN_INT (half_width - 1)));
25842 emit_move_insn (low[0], high[0]);
25843
25844 }
25845 else if (count >= half_width)
25846 {
25847 emit_move_insn (low[0], high[1]);
25848 emit_move_insn (high[0], low[0]);
25849 emit_insn (gen_ashr3 (high[0], high[0],
25850 GEN_INT (half_width - 1)));
25851
25852 if (count > half_width)
25853 emit_insn (gen_ashr3 (low[0], low[0],
25854 GEN_INT (count - half_width)));
25855 }
25856 else
25857 {
25858 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25859
25860 if (!rtx_equal_p (operands[0], operands[1]))
25861 emit_move_insn (operands[0], operands[1]);
25862
25863 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25864 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25865 }
25866 }
25867 else
25868 {
25869 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25870
25871 if (!rtx_equal_p (operands[0], operands[1]))
25872 emit_move_insn (operands[0], operands[1]);
25873
25874 split_double_mode (mode, operands, 1, low, high);
25875
25876 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25877 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25878
25879 if (TARGET_CMOVE && scratch)
25880 {
25881 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25882 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25883
25884 emit_move_insn (scratch, high[0]);
25885 emit_insn (gen_ashr3 (scratch, scratch,
25886 GEN_INT (half_width - 1)));
25887 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25888 scratch));
25889 }
25890 else
25891 {
25892 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25893 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25894
25895 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25896 }
25897 }
25898 }
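
/* Note on the count == bitsize-1 special case above: an arithmetic right
   shift by the full width minus one just broadcasts the sign bit, so both
   halves can be produced from the high half alone, e.g. in C

       lo = hi = (int) high_half >> 31;

   giving 0 or -1 in both words without touching the low input word.  */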
25899
25900 void
25901 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25902 {
25903 rtx (*gen_lshr3)(rtx, rtx, rtx)
25904 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25905 rtx (*gen_shrd)(rtx, rtx, rtx);
25906 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25907
25908 rtx low[2], high[2];
25909 int count;
25910
25911 if (CONST_INT_P (operands[2]))
25912 {
25913 split_double_mode (mode, operands, 2, low, high);
25914 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25915
25916 if (count >= half_width)
25917 {
25918 emit_move_insn (low[0], high[1]);
25919 ix86_expand_clear (high[0]);
25920
25921 if (count > half_width)
25922 emit_insn (gen_lshr3 (low[0], low[0],
25923 GEN_INT (count - half_width)));
25924 }
25925 else
25926 {
25927 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25928
25929 if (!rtx_equal_p (operands[0], operands[1]))
25930 emit_move_insn (operands[0], operands[1]);
25931
25932 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25933 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25934 }
25935 }
25936 else
25937 {
25938 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25939
25940 if (!rtx_equal_p (operands[0], operands[1]))
25941 emit_move_insn (operands[0], operands[1]);
25942
25943 split_double_mode (mode, operands, 1, low, high);
25944
25945 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25946 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25947
25948 if (TARGET_CMOVE && scratch)
25949 {
25950 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25951 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25952
25953 ix86_expand_clear (scratch);
25954 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25955 scratch));
25956 }
25957 else
25958 {
25959 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25960 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25961
25962 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25963 }
25964 }
25965 }
25966
25967 /* Predict just emitted jump instruction to be taken with probability PROB. */
25968 static void
25969 predict_jump (int prob)
25970 {
25971 rtx_insn *insn = get_last_insn ();
25972 gcc_assert (JUMP_P (insn));
25973 add_int_reg_note (insn, REG_BR_PROB, prob);
25974 }
25975
25976 /* Helper function for the string operations below. Test whether VARIABLE
25977 is aligned to VALUE bytes (the VALUE bit is clear); if so, jump to the label. */
25978 static rtx_code_label *
25979 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25980 {
25981 rtx_code_label *label = gen_label_rtx ();
25982 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25983 if (GET_MODE (variable) == DImode)
25984 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25985 else
25986 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25987 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25988 1, label);
25989 if (epilogue)
25990 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25991 else
25992 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25993 return label;
25994 }
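
/* For example, ix86_expand_aligntest (count, 4, true) emits code that
   roughly corresponds to

       if ((count & 4) == 0)
         goto skip;

   and returns the "skip" label (a placeholder name), so the caller can place
   the 4-byte special case between the test and the label.  The branch is
   predicted 50% taken for epilogues and 90% taken otherwise.  */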
25995
25996 /* Decrease COUNTREG by VALUE. */
25997 static void
25998 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25999 {
26000 rtx (*gen_add)(rtx, rtx, rtx)
26001 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26002
26003 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26004 }
26005
26006 /* Zero extend possibly SImode EXP to Pmode register. */
26007 rtx
26008 ix86_zero_extend_to_Pmode (rtx exp)
26009 {
26010 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26011 }
26012
26013 /* Divide COUNTREG by SCALE. */
26014 static rtx
26015 scale_counter (rtx countreg, int scale)
26016 {
26017 rtx sc;
26018
26019 if (scale == 1)
26020 return countreg;
26021 if (CONST_INT_P (countreg))
26022 return GEN_INT (INTVAL (countreg) / scale);
26023 gcc_assert (REG_P (countreg));
26024
26025 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26026 GEN_INT (exact_log2 (scale)),
26027 NULL, 1, OPTAB_DIRECT);
26028 return sc;
26029 }
26030
26031 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26032 DImode for constant loop counts. */
26033
26034 static machine_mode
26035 counter_mode (rtx count_exp)
26036 {
26037 if (GET_MODE (count_exp) != VOIDmode)
26038 return GET_MODE (count_exp);
26039 if (!CONST_INT_P (count_exp))
26040 return Pmode;
26041 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26042 return DImode;
26043 return SImode;
26044 }
26045
26046 /* Copy the address to a Pmode register. This is used for x32 to
26047 truncate DImode TLS address to a SImode register. */
26048
26049 static rtx
26050 ix86_copy_addr_to_reg (rtx addr)
26051 {
26052 rtx reg;
26053 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26054 {
26055 reg = copy_addr_to_reg (addr);
26056 REG_POINTER (reg) = 1;
26057 return reg;
26058 }
26059 else
26060 {
26061 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26062 reg = copy_to_mode_reg (DImode, addr);
26063 REG_POINTER (reg) = 1;
26064 return gen_rtx_SUBREG (SImode, reg, 0);
26065 }
26066 }
26067
26068 /* When ISSETMEM is FALSE, output a simple loop moving memory pointed to by SRCPTR
26069 to DESTPTR in chunks of MODE unrolled UNROLL times; the overall size is COUNT
26070 bytes. When ISSETMEM is TRUE, output the equivalent loop that sets
26071 memory to VALUE (supposed to be in MODE).
26072
26073 The size is rounded down to a whole number of chunks moved at once.
26074 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
26075
26076
26077 static void
26078 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26079 rtx destptr, rtx srcptr, rtx value,
26080 rtx count, machine_mode mode, int unroll,
26081 int expected_size, bool issetmem)
26082 {
26083 rtx_code_label *out_label, *top_label;
26084 rtx iter, tmp;
26085 machine_mode iter_mode = counter_mode (count);
26086 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26087 rtx piece_size = GEN_INT (piece_size_n);
26088 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26089 rtx size;
26090 int i;
26091
26092 top_label = gen_label_rtx ();
26093 out_label = gen_label_rtx ();
26094 iter = gen_reg_rtx (iter_mode);
26095
26096 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26097 NULL, 1, OPTAB_DIRECT);
26098 /* Those two should combine. */
26099 if (piece_size == const1_rtx)
26100 {
26101 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26102 true, out_label);
26103 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26104 }
26105 emit_move_insn (iter, const0_rtx);
26106
26107 emit_label (top_label);
26108
26109 tmp = convert_modes (Pmode, iter_mode, iter, true);
26110
26111 /* This assert could be relaxed - in that case we'd need to compute
26112 the smallest power of two containing PIECE_SIZE_N and pass it to
26113 offset_address. */
26114 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26115 destmem = offset_address (destmem, tmp, piece_size_n);
26116 destmem = adjust_address (destmem, mode, 0);
26117
26118 if (!issetmem)
26119 {
26120 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26121 srcmem = adjust_address (srcmem, mode, 0);
26122
26123 /* When unrolling for chips that reorder memory reads and writes,
26124 we can save registers by using a single temporary.
26125 Using 4 temporaries is also overkill in 32-bit mode. */
26126 if (!TARGET_64BIT && 0)
26127 {
26128 for (i = 0; i < unroll; i++)
26129 {
26130 if (i)
26131 {
26132 destmem =
26133 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26134 srcmem =
26135 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26136 }
26137 emit_move_insn (destmem, srcmem);
26138 }
26139 }
26140 else
26141 {
26142 rtx tmpreg[4];
26143 gcc_assert (unroll <= 4);
26144 for (i = 0; i < unroll; i++)
26145 {
26146 tmpreg[i] = gen_reg_rtx (mode);
26147 if (i)
26148 {
26149 srcmem =
26150 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26151 }
26152 emit_move_insn (tmpreg[i], srcmem);
26153 }
26154 for (i = 0; i < unroll; i++)
26155 {
26156 if (i)
26157 {
26158 destmem =
26159 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26160 }
26161 emit_move_insn (destmem, tmpreg[i]);
26162 }
26163 }
26164 }
26165 else
26166 for (i = 0; i < unroll; i++)
26167 {
26168 if (i)
26169 destmem =
26170 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26171 emit_move_insn (destmem, value);
26172 }
26173
26174 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26175 true, OPTAB_LIB_WIDEN);
26176 if (tmp != iter)
26177 emit_move_insn (iter, tmp);
26178
26179 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26180 true, top_label);
26181 if (expected_size != -1)
26182 {
26183 expected_size /= GET_MODE_SIZE (mode) * unroll;
26184 if (expected_size == 0)
26185 predict_jump (0);
26186 else if (expected_size > REG_BR_PROB_BASE)
26187 predict_jump (REG_BR_PROB_BASE - 1);
26188 else
26189 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26190 }
26191 else
26192 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26193 iter = ix86_zero_extend_to_Pmode (iter);
26194 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26195 true, OPTAB_LIB_WIDEN);
26196 if (tmp != destptr)
26197 emit_move_insn (destptr, tmp);
26198 if (!issetmem)
26199 {
26200 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26201 true, OPTAB_LIB_WIDEN);
26202 if (tmp != srcptr)
26203 emit_move_insn (srcptr, tmp);
26204 }
26205 emit_label (out_label);
26206 }
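
/* Shape of the loop emitted above for the copy case (ISSETMEM false),
   sketched as C for a 4-byte MODE unrolled twice; all names are
   illustrative only:

       size = count & ~7;                        piece_size_mask
       iter = 0;
       do
         {
           t0 = *(uint32_t *) (src + iter);
           t1 = *(uint32_t *) (src + iter + 4);
           *(uint32_t *) (dest + iter)     = t0;
           *(uint32_t *) (dest + iter + 4) = t1;
           iter += 8;
         }
       while (iter < size);
       dest += iter;  src += iter;

   The remaining count % 8 bytes are left for the epilogue expanders.  */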
26207
26208 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26209 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26210 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26211 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26212 ORIG_VALUE is the original value passed to memset to fill the memory with.
26213 Other arguments have the same meaning as for the previous function. */
26214
26215 static void
26216 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26217 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26218 rtx count,
26219 machine_mode mode, bool issetmem)
26220 {
26221 rtx destexp;
26222 rtx srcexp;
26223 rtx countreg;
26224 HOST_WIDE_INT rounded_count;
26225
26226 /* If possible, it is shorter to use rep movs.
26227 TODO: Maybe it is better to move this logic to decide_alg. */
26228 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26229 && (!issetmem || orig_value == const0_rtx))
26230 mode = SImode;
26231
26232 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26233 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26234
26235 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26236 GET_MODE_SIZE (mode)));
26237 if (mode != QImode)
26238 {
26239 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26240 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26241 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26242 }
26243 else
26244 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26245 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26246 {
26247 rounded_count
26248 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26249 destmem = shallow_copy_rtx (destmem);
26250 set_mem_size (destmem, rounded_count);
26251 }
26252 else if (MEM_SIZE_KNOWN_P (destmem))
26253 clear_mem_size (destmem);
26254
26255 if (issetmem)
26256 {
26257 value = force_reg (mode, gen_lowpart (mode, value));
26258 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26259 }
26260 else
26261 {
26262 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26263 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26264 if (mode != QImode)
26265 {
26266 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26267 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26268 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26269 }
26270 else
26271 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26272 if (CONST_INT_P (count))
26273 {
26274 rounded_count
26275 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26276 srcmem = shallow_copy_rtx (srcmem);
26277 set_mem_size (srcmem, rounded_count);
26278 }
26279 else
26280 {
26281 if (MEM_SIZE_KNOWN_P (srcmem))
26282 clear_mem_size (srcmem);
26283 }
26284 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26285 destexp, srcexp));
26286 }
26287 }
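
/* Example of the expansion above: a memcpy with a constant count of 32
   bytes is promoted from QImode to SImode, the count register is loaded
   with 32 / 4 = 8 and a single rep movsl performs the copy, leaving the
   source and destination pointers advanced by 32 as a side effect (a
   sketch; the precise RTL comes from gen_rep_mov above).  */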
26288
26289 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26290 DESTMEM.
26291 SRCMEM is passed by pointer to be updated on return.
26292 The return value is the updated DESTMEM. */
26293 static rtx
26294 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26295 HOST_WIDE_INT size_to_move)
26296 {
26297 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26298 enum insn_code code;
26299 machine_mode move_mode;
26300 int piece_size, i;
26301
26302 /* Find the widest mode in which we could perform moves.
26303 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
26304 it until a move of that size is supported. */
26305 piece_size = 1 << floor_log2 (size_to_move);
26306 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26307 code = optab_handler (mov_optab, move_mode);
26308 while (code == CODE_FOR_nothing && piece_size > 1)
26309 {
26310 piece_size >>= 1;
26311 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26312 code = optab_handler (mov_optab, move_mode);
26313 }
26314
26315 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26316 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26317 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26318 {
26319 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26320 move_mode = mode_for_vector (word_mode, nunits);
26321 code = optab_handler (mov_optab, move_mode);
26322 if (code == CODE_FOR_nothing)
26323 {
26324 move_mode = word_mode;
26325 piece_size = GET_MODE_SIZE (move_mode);
26326 code = optab_handler (mov_optab, move_mode);
26327 }
26328 }
26329 gcc_assert (code != CODE_FOR_nothing);
26330
26331 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26332 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26333
26334 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26335 gcc_assert (size_to_move % piece_size == 0);
26336 adjust = GEN_INT (piece_size);
26337 for (i = 0; i < size_to_move; i += piece_size)
26338 {
26339 /* We move from memory to memory, so we'll need to do it via
26340 a temporary register. */
26341 tempreg = gen_reg_rtx (move_mode);
26342 emit_insn (GEN_FCN (code) (tempreg, src));
26343 emit_insn (GEN_FCN (code) (dst, tempreg));
26344
26345 emit_move_insn (destptr,
26346 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26347 emit_move_insn (srcptr,
26348 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26349
26350 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26351 piece_size);
26352 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26353 piece_size);
26354 }
26355
26356 /* Update DST and SRC rtx. */
26357 *srcmem = src;
26358 return dst;
26359 }
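
/* Example: a call with SIZE_TO_MOVE == 4 selects SImode and emits one load
   into a fresh temporary, one store, and the two pointer updates, roughly

       tmp = *(uint32_t *) src;
       *(uint32_t *) dst = tmp;
       src += 4;  dst += 4;

   (names are placeholders; larger power-of-two sizes use wider, possibly
   vector, modes when the corresponding move pattern exists).  */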
26360
26361 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26362 static void
26363 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26364 rtx destptr, rtx srcptr, rtx count, int max_size)
26365 {
26366 rtx src, dest;
26367 if (CONST_INT_P (count))
26368 {
26369 HOST_WIDE_INT countval = INTVAL (count);
26370 HOST_WIDE_INT epilogue_size = countval % max_size;
26371 int i;
26372
26373 /* For now MAX_SIZE should be a power of 2. This assert could be
26374 relaxed, but it'll require a bit more complicated epilogue
26375 expanding. */
26376 gcc_assert ((max_size & (max_size - 1)) == 0);
26377 for (i = max_size; i >= 1; i >>= 1)
26378 {
26379 if (epilogue_size & i)
26380 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26381 }
26382 return;
26383 }
26384 if (max_size > 8)
26385 {
26386 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26387 count, 1, OPTAB_DIRECT);
26388 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26389 count, QImode, 1, 4, false);
26390 return;
26391 }
26392
26393 /* When single string operations are cheap, we can simply advance the dest
26394 and src pointers. Otherwise we save code size by maintaining an offset
26395 (zero is readily available from the preceding rep operation) and using
26396 x86 addressing modes. */
26397 if (TARGET_SINGLE_STRINGOP)
26398 {
26399 if (max_size > 4)
26400 {
26401 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26402 src = change_address (srcmem, SImode, srcptr);
26403 dest = change_address (destmem, SImode, destptr);
26404 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26405 emit_label (label);
26406 LABEL_NUSES (label) = 1;
26407 }
26408 if (max_size > 2)
26409 {
26410 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26411 src = change_address (srcmem, HImode, srcptr);
26412 dest = change_address (destmem, HImode, destptr);
26413 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26414 emit_label (label);
26415 LABEL_NUSES (label) = 1;
26416 }
26417 if (max_size > 1)
26418 {
26419 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26420 src = change_address (srcmem, QImode, srcptr);
26421 dest = change_address (destmem, QImode, destptr);
26422 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26423 emit_label (label);
26424 LABEL_NUSES (label) = 1;
26425 }
26426 }
26427 else
26428 {
26429 rtx offset = force_reg (Pmode, const0_rtx);
26430 rtx tmp;
26431
26432 if (max_size > 4)
26433 {
26434 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26435 src = change_address (srcmem, SImode, srcptr);
26436 dest = change_address (destmem, SImode, destptr);
26437 emit_move_insn (dest, src);
26438 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26439 true, OPTAB_LIB_WIDEN);
26440 if (tmp != offset)
26441 emit_move_insn (offset, tmp);
26442 emit_label (label);
26443 LABEL_NUSES (label) = 1;
26444 }
26445 if (max_size > 2)
26446 {
26447 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26448 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26449 src = change_address (srcmem, HImode, tmp);
26450 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26451 dest = change_address (destmem, HImode, tmp);
26452 emit_move_insn (dest, src);
26453 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26454 true, OPTAB_LIB_WIDEN);
26455 if (tmp != offset)
26456 emit_move_insn (offset, tmp);
26457 emit_label (label);
26458 LABEL_NUSES (label) = 1;
26459 }
26460 if (max_size > 1)
26461 {
26462 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26463 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26464 src = change_address (srcmem, QImode, tmp);
26465 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26466 dest = change_address (destmem, QImode, tmp);
26467 emit_move_insn (dest, src);
26468 emit_label (label);
26469 LABEL_NUSES (label) = 1;
26470 }
26471 }
26472 }
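
/* Epilogue example: with MAX_SIZE == 8 and a constant COUNT whose remainder
   COUNT % 8 is 7, the loop above walks the bits of the remainder and emits
   three unconditional copies of 4, 2 and 1 bytes.  For a variable COUNT the
   same 4/2/1 decomposition is emitted, but each copy is guarded by one
   ix86_expand_aligntest branch on the corresponding bit of COUNT.  */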
26473
26474 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26475 with value PROMOTED_VAL.
26476 The return value is the updated DESTMEM. */
26478 static rtx
26479 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26480 HOST_WIDE_INT size_to_move)
26481 {
26482 rtx dst = destmem, adjust;
26483 enum insn_code code;
26484 machine_mode move_mode;
26485 int piece_size, i;
26486
26487 /* Pick the mode to store in. Start from the mode of PROMOTED_VAL and
26488 narrow it to an integer mode of SIZE_TO_MOVE bytes when the value is
26489 wider than the amount to be stored. */
26490 move_mode = GET_MODE (promoted_val);
26491 if (move_mode == VOIDmode)
26492 move_mode = QImode;
26493 if (size_to_move < GET_MODE_SIZE (move_mode))
26494 {
26495 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26496 promoted_val = gen_lowpart (move_mode, promoted_val);
26497 }
26498 piece_size = GET_MODE_SIZE (move_mode);
26499 code = optab_handler (mov_optab, move_mode);
26500 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26501
26502 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26503
26504 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26505 gcc_assert (size_to_move % piece_size == 0);
26506 adjust = GEN_INT (piece_size);
26507 for (i = 0; i < size_to_move; i += piece_size)
26508 {
26509 if (piece_size <= GET_MODE_SIZE (word_mode))
26510 {
26511 emit_insn (gen_strset (destptr, dst, promoted_val));
26512 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26513 piece_size);
26514 continue;
26515 }
26516
26517 emit_insn (GEN_FCN (code) (dst, promoted_val));
26518
26519 emit_move_insn (destptr,
26520 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26521
26522 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26523 piece_size);
26524 }
26525
26526 /* Update DST rtx. */
26527 return dst;
26528 }
26529 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26530 static void
26531 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26532 rtx count, int max_size)
26533 {
26534 count =
26535 expand_simple_binop (counter_mode (count), AND, count,
26536 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26537 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26538 gen_lowpart (QImode, value), count, QImode,
26539 1, max_size / 2, true);
26540 }
26541
26542 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26543 static void
26544 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26545 rtx count, int max_size)
26546 {
26547 rtx dest;
26548
26549 if (CONST_INT_P (count))
26550 {
26551 HOST_WIDE_INT countval = INTVAL (count);
26552 HOST_WIDE_INT epilogue_size = countval % max_size;
26553 int i;
26554
26555 /* For now MAX_SIZE should be a power of 2. This assert could be
26556 relaxed, but it'll require a bit more complicated epilogue
26557 expanding. */
26558 gcc_assert ((max_size & (max_size - 1)) == 0);
26559 for (i = max_size; i >= 1; i >>= 1)
26560 {
26561 if (epilogue_size & i)
26562 {
26563 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26564 destmem = emit_memset (destmem, destptr, vec_value, i);
26565 else
26566 destmem = emit_memset (destmem, destptr, value, i);
26567 }
26568 }
26569 return;
26570 }
26571 if (max_size > 32)
26572 {
26573 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26574 return;
26575 }
26576 if (max_size > 16)
26577 {
26578 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26579 if (TARGET_64BIT)
26580 {
26581 dest = change_address (destmem, DImode, destptr);
26582 emit_insn (gen_strset (destptr, dest, value));
26583 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26584 emit_insn (gen_strset (destptr, dest, value));
26585 }
26586 else
26587 {
26588 dest = change_address (destmem, SImode, destptr);
26589 emit_insn (gen_strset (destptr, dest, value));
26590 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26591 emit_insn (gen_strset (destptr, dest, value));
26592 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26593 emit_insn (gen_strset (destptr, dest, value));
26594 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26595 emit_insn (gen_strset (destptr, dest, value));
26596 }
26597 emit_label (label);
26598 LABEL_NUSES (label) = 1;
26599 }
26600 if (max_size > 8)
26601 {
26602 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26603 if (TARGET_64BIT)
26604 {
26605 dest = change_address (destmem, DImode, destptr);
26606 emit_insn (gen_strset (destptr, dest, value));
26607 }
26608 else
26609 {
26610 dest = change_address (destmem, SImode, destptr);
26611 emit_insn (gen_strset (destptr, dest, value));
26612 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26613 emit_insn (gen_strset (destptr, dest, value));
26614 }
26615 emit_label (label);
26616 LABEL_NUSES (label) = 1;
26617 }
26618 if (max_size > 4)
26619 {
26620 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26621 dest = change_address (destmem, SImode, destptr);
26622 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26623 emit_label (label);
26624 LABEL_NUSES (label) = 1;
26625 }
26626 if (max_size > 2)
26627 {
26628 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26629 dest = change_address (destmem, HImode, destptr);
26630 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26631 emit_label (label);
26632 LABEL_NUSES (label) = 1;
26633 }
26634 if (max_size > 1)
26635 {
26636 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26637 dest = change_address (destmem, QImode, destptr);
26638 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26639 emit_label (label);
26640 LABEL_NUSES (label) = 1;
26641 }
26642 }
26643
26644 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or set enough
26645 bytes of DESTMEM, to align it to DESIRED_ALIGNMENT. The original alignment is ALIGN.
26646 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26647 ignored.
26648 The return value is the updated DESTMEM. */
26649 static rtx
26650 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26651 rtx destptr, rtx srcptr, rtx value,
26652 rtx vec_value, rtx count, int align,
26653 int desired_alignment, bool issetmem)
26654 {
26655 int i;
26656 for (i = 1; i < desired_alignment; i <<= 1)
26657 {
26658 if (align <= i)
26659 {
26660 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26661 if (issetmem)
26662 {
26663 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26664 destmem = emit_memset (destmem, destptr, vec_value, i);
26665 else
26666 destmem = emit_memset (destmem, destptr, value, i);
26667 }
26668 else
26669 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26670 ix86_adjust_counter (count, i);
26671 emit_label (label);
26672 LABEL_NUSES (label) = 1;
26673 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26674 }
26675 }
26676 return destmem;
26677 }
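
/* Prologue example: with ALIGN == 1 and DESIRED_ALIGNMENT == 16 the loop
   above emits four guarded blocks, one per low bit of the destination
   address (sketched in C, copy case):

       if (dest & 1) { copy 1 byte;  count -= 1; }
       if (dest & 2) { copy 2 bytes; count -= 2; }
       if (dest & 4) { copy 4 bytes; count -= 4; }
       if (dest & 8) { copy 8 bytes; count -= 8; }

   after which the destination pointer is 16-byte aligned.  */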
26678
26679 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26680 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26681 and jump to DONE_LABEL. */
26682 static void
26683 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26684 rtx destptr, rtx srcptr,
26685 rtx value, rtx vec_value,
26686 rtx count, int size,
26687 rtx done_label, bool issetmem)
26688 {
26689 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26690 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26691 rtx modesize;
26692 int n;
26693
26694 /* If we do not have vector value to copy, we must reduce size. */
26695 if (issetmem)
26696 {
26697 if (!vec_value)
26698 {
26699 if (GET_MODE (value) == VOIDmode && size > 8)
26700 mode = Pmode;
26701 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26702 mode = GET_MODE (value);
26703 }
26704 else
26705 mode = GET_MODE (vec_value), value = vec_value;
26706 }
26707 else
26708 {
26709 /* Choose appropriate vector mode. */
26710 if (size >= 32)
26711 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26712 else if (size >= 16)
26713 mode = TARGET_SSE ? V16QImode : DImode;
26714 srcmem = change_address (srcmem, mode, srcptr);
26715 }
26716 destmem = change_address (destmem, mode, destptr);
26717 modesize = GEN_INT (GET_MODE_SIZE (mode));
26718 gcc_assert (GET_MODE_SIZE (mode) <= size);
26719 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26720 {
26721 if (issetmem)
26722 emit_move_insn (destmem, gen_lowpart (mode, value));
26723 else
26724 {
26725 emit_move_insn (destmem, srcmem);
26726 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26727 }
26728 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26729 }
26730
26731 destmem = offset_address (destmem, count, 1);
26732 destmem = offset_address (destmem, GEN_INT (-2 * size),
26733 GET_MODE_SIZE (mode));
26734 if (!issetmem)
26735 {
26736 srcmem = offset_address (srcmem, count, 1);
26737 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26738 GET_MODE_SIZE (mode));
26739 }
26740 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26741 {
26742 if (issetmem)
26743 emit_move_insn (destmem, gen_lowpart (mode, value));
26744 else
26745 {
26746 emit_move_insn (destmem, srcmem);
26747 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26748 }
26749 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26750 }
26751 emit_jump_insn (gen_jump (done_label));
26752 emit_barrier ();
26753
26754 emit_label (label);
26755 LABEL_NUSES (label) = 1;
26756 }
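
/* Worked example for the sequence above: SIZE == 4 and COUNT in 4..7 (the
   COUNT & 4 test succeeded).  The first group of moves covers bytes 0..3
   and the second covers bytes COUNT-4..COUNT-1; for COUNT == 6 the two
   4-byte accesses overlap in the middle but together touch all six bytes,
   so no byte-granular tail is needed before jumping to DONE_LABEL.  */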
26757
26758 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26759 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26760 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26761 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26762 DONE_LABEL is a label after the whole copying sequence. The label is created
26763 on demand if *DONE_LABEL is NULL.
26764 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
26765 bounds after the initial copies.
26766
26767 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26768 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26769 we will dispatch to a library call for large blocks.
26770
26771 In pseudocode we do:
26772
26773 if (COUNT < SIZE)
26774 {
26775 Assume that SIZE is 4. Bigger sizes are handled analogously
26776 if (COUNT & 4)
26777 {
26778 copy 4 bytes from SRCPTR to DESTPTR
26779 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26780 goto done_label
26781 }
26782 if (!COUNT)
26783 goto done_label;
26784 copy 1 byte from SRCPTR to DESTPTR
26785 if (COUNT & 2)
26786 {
26787 copy 2 bytes from SRCPTR to DESTPTR
26788 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26789 }
26790 }
26791 else
26792 {
26793 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26794 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26795
26796 OLD_DESPTR = DESTPTR;
26797 Align DESTPTR up to DESIRED_ALIGN
26798 SRCPTR += DESTPTR - OLD_DESTPTR
26799 COUNT -= DEST_PTR - OLD_DESTPTR
26800 if (DYNAMIC_CHECK)
26801 Round COUNT down to multiple of SIZE
26802 << optional caller supplied zero size guard is here >>
26803 << optional caller supplied dynamic check is here >>
26804 << caller supplied main copy loop is here >>
26805 }
26806 done_label:
26807 */
26808 static void
26809 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26810 rtx *destptr, rtx *srcptr,
26811 machine_mode mode,
26812 rtx value, rtx vec_value,
26813 rtx *count,
26814 rtx_code_label **done_label,
26815 int size,
26816 int desired_align,
26817 int align,
26818 unsigned HOST_WIDE_INT *min_size,
26819 bool dynamic_check,
26820 bool issetmem)
26821 {
26822 rtx_code_label *loop_label = NULL, *label;
26823 int n;
26824 rtx modesize;
26825 int prolog_size = 0;
26826 rtx mode_value;
26827
26828 /* Choose the proper value to copy. */
26829 if (issetmem && VECTOR_MODE_P (mode))
26830 mode_value = vec_value;
26831 else
26832 mode_value = value;
26833 gcc_assert (GET_MODE_SIZE (mode) <= size);
26834
26835 /* See if block is big or small, handle small blocks. */
26836 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26837 {
26838 int size2 = size;
26839 loop_label = gen_label_rtx ();
26840
26841 if (!*done_label)
26842 *done_label = gen_label_rtx ();
26843
26844 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26845 1, loop_label);
26846 size2 >>= 1;
26847
26848 /* Handle sizes > 3. */
26849 for (;size2 > 2; size2 >>= 1)
26850 expand_small_movmem_or_setmem (destmem, srcmem,
26851 *destptr, *srcptr,
26852 value, vec_value,
26853 *count,
26854 size2, *done_label, issetmem);
26855 /* Nothing to copy? Jump to DONE_LABEL if so */
26856 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26857 1, *done_label);
26858
26859 /* Do a byte copy. */
26860 destmem = change_address (destmem, QImode, *destptr);
26861 if (issetmem)
26862 emit_move_insn (destmem, gen_lowpart (QImode, value));
26863 else
26864 {
26865 srcmem = change_address (srcmem, QImode, *srcptr);
26866 emit_move_insn (destmem, srcmem);
26867 }
26868
26869 /* Handle sizes 2 and 3. */
26870 label = ix86_expand_aligntest (*count, 2, false);
26871 destmem = change_address (destmem, HImode, *destptr);
26872 destmem = offset_address (destmem, *count, 1);
26873 destmem = offset_address (destmem, GEN_INT (-2), 2);
26874 if (issetmem)
26875 emit_move_insn (destmem, gen_lowpart (HImode, value));
26876 else
26877 {
26878 srcmem = change_address (srcmem, HImode, *srcptr);
26879 srcmem = offset_address (srcmem, *count, 1);
26880 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26881 emit_move_insn (destmem, srcmem);
26882 }
26883
26884 emit_label (label);
26885 LABEL_NUSES (label) = 1;
26886 emit_jump_insn (gen_jump (*done_label));
26887 emit_barrier ();
26888 }
26889 else
26890 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26891 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26892
26893 /* Start memcpy for COUNT >= SIZE. */
26894 if (loop_label)
26895 {
26896 emit_label (loop_label);
26897 LABEL_NUSES (loop_label) = 1;
26898 }
26899
26900 /* Copy first desired_align bytes. */
26901 if (!issetmem)
26902 srcmem = change_address (srcmem, mode, *srcptr);
26903 destmem = change_address (destmem, mode, *destptr);
26904 modesize = GEN_INT (GET_MODE_SIZE (mode));
26905 for (n = 0; prolog_size < desired_align - align; n++)
26906 {
26907 if (issetmem)
26908 emit_move_insn (destmem, mode_value);
26909 else
26910 {
26911 emit_move_insn (destmem, srcmem);
26912 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26913 }
26914 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26915 prolog_size += GET_MODE_SIZE (mode);
26916 }
26917
26918
26919 /* Copy last SIZE bytes. */
26920 destmem = offset_address (destmem, *count, 1);
26921 destmem = offset_address (destmem,
26922 GEN_INT (-size - prolog_size),
26923 1);
26924 if (issetmem)
26925 emit_move_insn (destmem, mode_value);
26926 else
26927 {
26928 srcmem = offset_address (srcmem, *count, 1);
26929 srcmem = offset_address (srcmem,
26930 GEN_INT (-size - prolog_size),
26931 1);
26932 emit_move_insn (destmem, srcmem);
26933 }
26934 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26935 {
26936 destmem = offset_address (destmem, modesize, 1);
26937 if (issetmem)
26938 emit_move_insn (destmem, mode_value);
26939 else
26940 {
26941 srcmem = offset_address (srcmem, modesize, 1);
26942 emit_move_insn (destmem, srcmem);
26943 }
26944 }
26945
26946 /* Align destination. */
26947 if (desired_align > 1 && desired_align > align)
26948 {
26949 rtx saveddest = *destptr;
26950
26951 gcc_assert (desired_align <= size);
26952 /* Align destptr up, placing it in a new register. */
26953 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26954 GEN_INT (prolog_size),
26955 NULL_RTX, 1, OPTAB_DIRECT);
26956 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26957 REG_POINTER (*destptr) = 1;
26958 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26959 GEN_INT (-desired_align),
26960 *destptr, 1, OPTAB_DIRECT);
26961 /* See how many bytes we skipped. */
26962 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26963 *destptr,
26964 saveddest, 1, OPTAB_DIRECT);
26965 /* Adjust srcptr and count. */
26966 if (!issetmem)
26967 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26968 saveddest, *srcptr, 1, OPTAB_DIRECT);
26969 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26970 saveddest, *count, 1, OPTAB_DIRECT);
26971 /* We copied at most size + prolog_size. */
26972 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26973 *min_size
26974 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26975 else
26976 *min_size = 0;
26977
26978 /* Our loops always round down the block size, but for dispatch to
26979 the library we need the precise value. */
26980 if (dynamic_check)
26981 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26982 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26983 }
26984 else
26985 {
26986 gcc_assert (prolog_size == 0);
26987 /* Decrease count, so we won't end up copying the last word twice. */
26988 if (!CONST_INT_P (*count))
26989 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26990 constm1_rtx, *count, 1, OPTAB_DIRECT);
26991 else
26992 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26993 (unsigned HOST_WIDE_INT)size));
26994 if (*min_size)
26995 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26996 }
26997 }
26998
26999
27000 /* This function is like the previous one, except here we know how many bytes
27001 need to be copied. That allows us to update alignment not only of DST, which
27002 is returned, but also of SRC, which is passed as a pointer for that
27003 reason. */
27004 static rtx
27005 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27006 rtx srcreg, rtx value, rtx vec_value,
27007 int desired_align, int align_bytes,
27008 bool issetmem)
27009 {
27010 rtx src = NULL;
27011 rtx orig_dst = dst;
27012 rtx orig_src = NULL;
27013 int piece_size = 1;
27014 int copied_bytes = 0;
27015
27016 if (!issetmem)
27017 {
27018 gcc_assert (srcp != NULL);
27019 src = *srcp;
27020 orig_src = src;
27021 }
27022
27023 for (piece_size = 1;
27024 piece_size <= desired_align && copied_bytes < align_bytes;
27025 piece_size <<= 1)
27026 {
27027 if (align_bytes & piece_size)
27028 {
27029 if (issetmem)
27030 {
27031 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27032 dst = emit_memset (dst, destreg, vec_value, piece_size);
27033 else
27034 dst = emit_memset (dst, destreg, value, piece_size);
27035 }
27036 else
27037 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27038 copied_bytes += piece_size;
27039 }
27040 }
27041 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27042 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27043 if (MEM_SIZE_KNOWN_P (orig_dst))
27044 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27045
27046 if (!issetmem)
27047 {
27048 int src_align_bytes = get_mem_align_offset (src, desired_align
27049 * BITS_PER_UNIT);
27050 if (src_align_bytes >= 0)
27051 src_align_bytes = desired_align - src_align_bytes;
27052 if (src_align_bytes >= 0)
27053 {
27054 unsigned int src_align;
27055 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27056 {
27057 if ((src_align_bytes & (src_align - 1))
27058 == (align_bytes & (src_align - 1)))
27059 break;
27060 }
27061 if (src_align > (unsigned int) desired_align)
27062 src_align = desired_align;
27063 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27064 set_mem_align (src, src_align * BITS_PER_UNIT);
27065 }
27066 if (MEM_SIZE_KNOWN_P (orig_src))
27067 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27068 *srcp = src;
27069 }
27070
27071 return dst;
27072 }
27073
27074 /* Return true if ALG can be used in current context.
27075 Assume we expand memset if MEMSET is true. */
27076 static bool
27077 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27078 {
27079 if (alg == no_stringop)
27080 return false;
27081 if (alg == vector_loop)
27082 return TARGET_SSE || TARGET_AVX;
27083 /* Algorithms using the rep prefix want at least edi and ecx;
27084 additionally, memset wants eax and memcpy wants esi. Don't
27085 consider such algorithms if the user has appropriated those
27086 registers for their own purposes, or if we have a non-default
27087 address space, since some string insns cannot override the segment. */
27088 if (alg == rep_prefix_1_byte
27089 || alg == rep_prefix_4_byte
27090 || alg == rep_prefix_8_byte)
27091 {
27092 if (have_as)
27093 return false;
27094 if (fixed_regs[CX_REG]
27095 || fixed_regs[DI_REG]
27096 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27097 return false;
27098 }
27099 return true;
27100 }
27101
27102 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27103 static enum stringop_alg
27104 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27105 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27106 bool memset, bool zero_memset, bool have_as,
27107 int *dynamic_check, bool *noalign, bool recur)
27108 {
27109 const struct stringop_algs *algs;
27110 bool optimize_for_speed;
27111 int max = 0;
27112 const struct processor_costs *cost;
27113 int i;
27114 bool any_alg_usable_p = false;
27115
27116 *noalign = false;
27117 *dynamic_check = -1;
27118
27119 /* Even if the string operation call is cold, we still might spend a lot
27120 of time processing large blocks. */
27121 if (optimize_function_for_size_p (cfun)
27122 || (optimize_insn_for_size_p ()
27123 && (max_size < 256
27124 || (expected_size != -1 && expected_size < 256))))
27125 optimize_for_speed = false;
27126 else
27127 optimize_for_speed = true;
27128
27129 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27130 if (memset)
27131 algs = &cost->memset[TARGET_64BIT != 0];
27132 else
27133 algs = &cost->memcpy[TARGET_64BIT != 0];
27134
27135 /* See maximal size for user defined algorithm. */
27136 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27137 {
27138 enum stringop_alg candidate = algs->size[i].alg;
27139 bool usable = alg_usable_p (candidate, memset, have_as);
27140 any_alg_usable_p |= usable;
27141
27142 if (candidate != libcall && candidate && usable)
27143 max = algs->size[i].max;
27144 }
27145
27146 /* If expected size is not known but max size is small enough
27147 so that the inline version is a win, set the expected size into
27148 that range. */
27149 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27150 && expected_size == -1)
27151 expected_size = min_size / 2 + max_size / 2;
27152
27153 /* If user specified the algorithm, honor it if possible. */
27154 if (ix86_stringop_alg != no_stringop
27155 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27156 return ix86_stringop_alg;
27157 /* rep; movq or rep; movl is the smallest variant. */
27158 else if (!optimize_for_speed)
27159 {
27160 *noalign = true;
27161 if (!count || (count & 3) || (memset && !zero_memset))
27162 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27163 ? rep_prefix_1_byte : loop_1_byte;
27164 else
27165 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27166 ? rep_prefix_4_byte : loop;
27167 }
27168 /* Very tiny blocks are best handled via the loop; REP is expensive to
27169 set up. */
27170 else if (expected_size != -1 && expected_size < 4)
27171 return loop_1_byte;
27172 else if (expected_size != -1)
27173 {
27174 enum stringop_alg alg = libcall;
27175 bool alg_noalign = false;
27176 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27177 {
27178 /* We get here if the algorithms that were not libcall-based
27179 were rep-prefix based and we are unable to use rep prefixes
27180 based on global register usage. Break out of the loop and
27181 use the heuristic below. */
27182 if (algs->size[i].max == 0)
27183 break;
27184 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27185 {
27186 enum stringop_alg candidate = algs->size[i].alg;
27187
27188 if (candidate != libcall
27189 && alg_usable_p (candidate, memset, have_as))
27190 {
27191 alg = candidate;
27192 alg_noalign = algs->size[i].noalign;
27193 }
27194 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking the
27195 last non-libcall inline algorithm. */
27196 if (TARGET_INLINE_ALL_STRINGOPS)
27197 {
27198 /* When the current size is best copied by a libcall,
27199 but we are still forced to inline, run the heuristic below
27200 that will pick code for medium-sized blocks. */
27201 if (alg != libcall)
27202 {
27203 *noalign = alg_noalign;
27204 return alg;
27205 }
27206 else if (!any_alg_usable_p)
27207 break;
27208 }
27209 else if (alg_usable_p (candidate, memset, have_as))
27210 {
27211 *noalign = algs->size[i].noalign;
27212 return candidate;
27213 }
27214 }
27215 }
27216 }
27217 /* When asked to inline the call anyway, try to pick a meaningful choice.
27218 We look for the maximal size of block that is faster to copy by hand
27219 and take blocks of at most that size, guessing that the average size
27220 will be roughly half of the block.
27221
27222 If this turns out to be bad, we might simply specify the preferred
27223 choice in ix86_costs. */
27224 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27225 && (algs->unknown_size == libcall
27226 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27227 {
27228 enum stringop_alg alg;
27229 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27230
27231 /* If there aren't any usable algorithms or if recursing already,
27232 then recursing on smaller sizes or same size isn't going to
27233 find anything. Just return the simple byte-at-a-time copy loop. */
27234 if (!any_alg_usable_p || recur)
27235 {
27236 /* Pick something reasonable. */
27237 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27238 *dynamic_check = 128;
27239 return loop_1_byte;
27240 }
27241 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27242 zero_memset, have_as, dynamic_check, noalign, true);
27243 gcc_assert (*dynamic_check == -1);
27244 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27245 *dynamic_check = max;
27246 else
27247 gcc_assert (alg != libcall);
27248 return alg;
27249 }
27250 return (alg_usable_p (algs->unknown_size, memset, have_as)
27251 ? algs->unknown_size : libcall);
27252 }
27253
27254 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27255 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
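/* For example, with DImode moves the starting point below is
   GET_MODE_SIZE (DImode) == 8 bytes; TARGET_PENTIUMPRO with the
   rep_prefix_1_byte or rep_prefix_4_byte algorithms keeps 8 as well,
   while optimizing for size or a known-tiny expected size falls back
   towards ALIGN.  The libcall and VOIDmode cases return 0, meaning no
   alignment prologue at all.  */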
27256 static int
27257 decide_alignment (int align,
27258 enum stringop_alg alg,
27259 int expected_size,
27260 machine_mode move_mode)
27261 {
27262 int desired_align = 0;
27263
27264 gcc_assert (alg != no_stringop);
27265
27266 if (alg == libcall)
27267 return 0;
27268 if (move_mode == VOIDmode)
27269 return 0;
27270
27271 desired_align = GET_MODE_SIZE (move_mode);
27272 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
27273 copying a whole cacheline at once. */
27274 if (TARGET_PENTIUMPRO
27275 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27276 desired_align = 8;
27277
27278 if (optimize_size)
27279 desired_align = 1;
27280 if (desired_align < align)
27281 desired_align = align;
27282 if (expected_size != -1 && expected_size < 4)
27283 desired_align = align;
27284
27285 return desired_align;
27286 }
27287
27288
27289 /* Helper function for memset. For QImode value 0xXY produce
27290 0xXYXYXYXY of the width specified by MODE. This is essentially
27291 a multiplication by 0x01010101, but we can do slightly better than
27292 synth_mult by unwinding the sequence by hand on CPUs with
27293 slow multiply. */
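/* For example, a constant QImode value 0x5a becomes the SImode constant
   0x5a5a5a5a (or the DImode constant 0x5a5a5a5a5a5a5a5a) directly, while
   a non-constant value is either multiplied by the promoted constant
   0x01010101 or built up with shift-and-or steps (v |= v << 8;
   v |= v << 16; ...), whichever the cost tables rate cheaper.  */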
27294 static rtx
27295 promote_duplicated_reg (machine_mode mode, rtx val)
27296 {
27297 machine_mode valmode = GET_MODE (val);
27298 rtx tmp;
27299 int nops = mode == DImode ? 3 : 2;
27300
27301 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27302 if (val == const0_rtx)
27303 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27304 if (CONST_INT_P (val))
27305 {
27306 HOST_WIDE_INT v = INTVAL (val) & 255;
27307
27308 v |= v << 8;
27309 v |= v << 16;
27310 if (mode == DImode)
27311 v |= (v << 16) << 16;
27312 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27313 }
27314
27315 if (valmode == VOIDmode)
27316 valmode = QImode;
27317 if (valmode != QImode)
27318 val = gen_lowpart (QImode, val);
27319 if (mode == QImode)
27320 return val;
27321 if (!TARGET_PARTIAL_REG_STALL)
27322 nops--;
27323 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27324 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27325 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27326 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27327 {
27328 rtx reg = convert_modes (mode, QImode, val, true);
27329 tmp = promote_duplicated_reg (mode, const1_rtx);
27330 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27331 OPTAB_DIRECT);
27332 }
27333 else
27334 {
27335 rtx reg = convert_modes (mode, QImode, val, true);
27336
27337 if (!TARGET_PARTIAL_REG_STALL)
27338 if (mode == SImode)
27339 emit_insn (gen_insvsi_1 (reg, reg));
27340 else
27341 emit_insn (gen_insvdi_1 (reg, reg));
27342 else
27343 {
27344 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27345 NULL, 1, OPTAB_DIRECT);
27346 reg =
27347 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27348 }
27349 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27350 NULL, 1, OPTAB_DIRECT);
27351 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27352 if (mode == SImode)
27353 return reg;
27354 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27355 NULL, 1, OPTAB_DIRECT);
27356 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27357 return reg;
27358 }
27359 }
27360
27361 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
27362 that will be needed by the main loop copying SIZE_NEEDED chunks and by
27363 the prologue getting alignment from ALIGN to DESIRED_ALIGN. */
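/* For instance, a main loop working in 8-byte chunks on a 64-bit target
   (SIZE_NEEDED == 8) gets a DImode broadcast of VAL, while a SIZE_NEEDED
   of 2 with no additional alignment work only needs an HImode broadcast.  */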
27364 static rtx
27365 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27366 int align)
27367 {
27368 rtx promoted_val;
27369
27370 if (TARGET_64BIT
27371 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27372 promoted_val = promote_duplicated_reg (DImode, val);
27373 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27374 promoted_val = promote_duplicated_reg (SImode, val);
27375 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27376 promoted_val = promote_duplicated_reg (HImode, val);
27377 else
27378 promoted_val = val;
27379
27380 return promoted_val;
27381 }
27382
27383 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27384 operations when profitable. The code depends upon architecture, block size
27385 and alignment, but always has one of the following overall structures:
27386
27387 Aligned move sequence:
27388
27389 1) Prologue guard: Conditional that jumps up to epilogues for small
27390 blocks that can be handled by the epilogue alone. This is faster
27391 but also needed for correctness, since the prologue assumes the block
27392 is larger than the desired alignment.
27393
27394 Optional dynamic check for size and libcall for large
27395 blocks is emitted here too, with -minline-stringops-dynamically.
27396
27397 2) Prologue: copy first few bytes in order to get destination
27398 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27399 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27400 copied. We emit either a jump tree on power of two sized
27401 blocks, or a byte loop.
27402
27403 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27404 with the specified algorithm.
27405
27406 4) Epilogue: code copying the tail of the block that is too small to be
27407 handled by the main body (or up to the size guarded by the prologue guard).
27408
27409 Misaligned move sequence
27410
27411 1) misaligned move prologue/epilogue containing:
27412 a) Prologue handling small memory blocks and jumping to done_label
27413 (skipped if blocks are known to be large enough)
27414 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27415 needed by single possibly misaligned move
27416 (skipped if alignment is not needed)
27417 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27418
27419 2) Zero size guard dispatching to done_label, if needed
27420
27421 3) Dispatch to library call, if needed.
27422
27423 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27424 with the specified algorithm. */
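/* As a rough illustration: for a memcpy of a few hundred bytes using the
   unrolled_loop algorithm on a 64-bit target, the main body moves 8-byte
   words unrolled 4 times (SIZE_NEEDED == 32), the prologue first brings
   the destination up to the desired alignment, and the epilogue then
   finishes the remaining tail of fewer than 32 bytes with smaller moves.  */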
27425 bool
27426 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27427 rtx align_exp, rtx expected_align_exp,
27428 rtx expected_size_exp, rtx min_size_exp,
27429 rtx max_size_exp, rtx probable_max_size_exp,
27430 bool issetmem)
27431 {
27432 rtx destreg;
27433 rtx srcreg = NULL;
27434 rtx_code_label *label = NULL;
27435 rtx tmp;
27436 rtx_code_label *jump_around_label = NULL;
27437 HOST_WIDE_INT align = 1;
27438 unsigned HOST_WIDE_INT count = 0;
27439 HOST_WIDE_INT expected_size = -1;
27440 int size_needed = 0, epilogue_size_needed;
27441 int desired_align = 0, align_bytes = 0;
27442 enum stringop_alg alg;
27443 rtx promoted_val = NULL;
27444 rtx vec_promoted_val = NULL;
27445 bool force_loopy_epilogue = false;
27446 int dynamic_check;
27447 bool need_zero_guard = false;
27448 bool noalign;
27449 machine_mode move_mode = VOIDmode;
27450 int unroll_factor = 1;
27451 /* TODO: Once value ranges are available, fill in proper data. */
27452 unsigned HOST_WIDE_INT min_size = 0;
27453 unsigned HOST_WIDE_INT max_size = -1;
27454 unsigned HOST_WIDE_INT probable_max_size = -1;
27455 bool misaligned_prologue_used = false;
27456 bool have_as;
27457
27458 if (CONST_INT_P (align_exp))
27459 align = INTVAL (align_exp);
27460 /* i386 can do misaligned access at a reasonably increased cost. */
27461 if (CONST_INT_P (expected_align_exp)
27462 && INTVAL (expected_align_exp) > align)
27463 align = INTVAL (expected_align_exp);
27464 /* ALIGN is the minimum of destination and source alignment, but we care here
27465 just about destination alignment. */
27466 else if (!issetmem
27467 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27468 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27469
27470 if (CONST_INT_P (count_exp))
27471 {
27472 min_size = max_size = probable_max_size = count = expected_size
27473 = INTVAL (count_exp);
27474 /* When COUNT is 0, there is nothing to do. */
27475 if (!count)
27476 return true;
27477 }
27478 else
27479 {
27480 if (min_size_exp)
27481 min_size = INTVAL (min_size_exp);
27482 if (max_size_exp)
27483 max_size = INTVAL (max_size_exp);
27484 if (probable_max_size_exp)
27485 probable_max_size = INTVAL (probable_max_size_exp);
27486 if (CONST_INT_P (expected_size_exp))
27487 expected_size = INTVAL (expected_size_exp);
27488 }
27489
27490 /* Make sure we don't need to care about overflow later on. */
27491 if (count > (HOST_WIDE_INT_1U << 30))
27492 return false;
27493
27494 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27495 if (!issetmem)
27496 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27497
27498 /* Step 0: Decide on preferred algorithm, desired alignment and
27499 size of chunks to be copied by main loop. */
27500 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27501 issetmem,
27502 issetmem && val_exp == const0_rtx, have_as,
27503 &dynamic_check, &noalign, false);
27504 if (alg == libcall)
27505 return false;
27506 gcc_assert (alg != no_stringop);
27507
27508 /* For now the vector version of memset is generated only for memory zeroing,
27509 as creating the promoted vector value is very cheap in this case. */
27510 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27511 alg = unrolled_loop;
27512
27513 if (!count)
27514 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27515 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27516 if (!issetmem)
27517 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27518
27519 unroll_factor = 1;
27520 move_mode = word_mode;
27521 switch (alg)
27522 {
27523 case libcall:
27524 case no_stringop:
27525 case last_alg:
27526 gcc_unreachable ();
27527 case loop_1_byte:
27528 need_zero_guard = true;
27529 move_mode = QImode;
27530 break;
27531 case loop:
27532 need_zero_guard = true;
27533 break;
27534 case unrolled_loop:
27535 need_zero_guard = true;
27536 unroll_factor = (TARGET_64BIT ? 4 : 2);
27537 break;
27538 case vector_loop:
27539 need_zero_guard = true;
27540 unroll_factor = 4;
27541 /* Find the widest supported mode. */
27542 move_mode = word_mode;
27543 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27544 != CODE_FOR_nothing)
27545 move_mode = GET_MODE_WIDER_MODE (move_mode);
27546
27547 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27548 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27549 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27550 {
27551 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27552 move_mode = mode_for_vector (word_mode, nunits);
27553 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27554 move_mode = word_mode;
27555 }
27556 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27557 break;
27558 case rep_prefix_8_byte:
27559 move_mode = DImode;
27560 break;
27561 case rep_prefix_4_byte:
27562 move_mode = SImode;
27563 break;
27564 case rep_prefix_1_byte:
27565 move_mode = QImode;
27566 break;
27567 }
27568 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27569 epilogue_size_needed = size_needed;
27570
27571 /* If we are going to emit any library calls conditionally, make sure any
27572 pending stack adjustment happens before the first conditional branch;
27573 otherwise it will be emitted before the library call only and won't
27574 happen on the other branches. */
27575 if (dynamic_check != -1)
27576 do_pending_stack_adjust ();
27577
27578 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27579 if (!TARGET_ALIGN_STRINGOPS || noalign)
27580 align = desired_align;
27581
27582 /* Step 1: Prologue guard. */
27583
27584 /* Alignment code needs count to be in register. */
27585 if (CONST_INT_P (count_exp) && desired_align > align)
27586 {
27587 if (INTVAL (count_exp) > desired_align
27588 && INTVAL (count_exp) > size_needed)
27589 {
27590 align_bytes
27591 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27592 if (align_bytes <= 0)
27593 align_bytes = 0;
27594 else
27595 align_bytes = desired_align - align_bytes;
27596 }
27597 if (align_bytes == 0)
27598 count_exp = force_reg (counter_mode (count_exp), count_exp);
27599 }
27600 gcc_assert (desired_align >= 1 && align >= 1);
27601
27602 /* Misaligned move sequences handle both prologue and epilogue at once.
27603 Default code generation results in smaller code for large alignments
27604 and also avoids redundant work when sizes are known precisely. */
27605 misaligned_prologue_used
27606 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27607 && MAX (desired_align, epilogue_size_needed) <= 32
27608 && desired_align <= epilogue_size_needed
27609 && ((desired_align > align && !align_bytes)
27610 || (!count && epilogue_size_needed > 1)));
27611
27612 /* Do the cheap promotion to allow better CSE across the
27613 main loop and epilogue (i.e. one load of the big constant in
27614 front of all code).
27615 For now the misaligned move sequences do not have a fast path
27616 without broadcasting. */
27617 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27618 {
27619 if (alg == vector_loop)
27620 {
27621 gcc_assert (val_exp == const0_rtx);
27622 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27623 promoted_val = promote_duplicated_reg_to_size (val_exp,
27624 GET_MODE_SIZE (word_mode),
27625 desired_align, align);
27626 }
27627 else
27628 {
27629 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27630 desired_align, align);
27631 }
27632 }
27633 /* Misaligned move sequences handle both prologues and epilogues at once.
27634 Default code generation results in smaller code for large alignments and
27635 also avoids redundant work when sizes are known precisely. */
27636 if (misaligned_prologue_used)
27637 {
27638 /* The misaligned move prologue handles small blocks by itself. */
27639 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27640 (dst, src, &destreg, &srcreg,
27641 move_mode, promoted_val, vec_promoted_val,
27642 &count_exp,
27643 &jump_around_label,
27644 desired_align < align
27645 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27646 desired_align, align, &min_size, dynamic_check, issetmem);
27647 if (!issetmem)
27648 src = change_address (src, BLKmode, srcreg);
27649 dst = change_address (dst, BLKmode, destreg);
27650 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27651 epilogue_size_needed = 0;
27652 if (need_zero_guard
27653 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27654 {
27655 /* It is possible that we copied enough so the main loop will not
27656 execute. */
27657 gcc_assert (size_needed > 1);
27658 if (jump_around_label == NULL_RTX)
27659 jump_around_label = gen_label_rtx ();
27660 emit_cmp_and_jump_insns (count_exp,
27661 GEN_INT (size_needed),
27662 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27663 if (expected_size == -1
27664 || expected_size < (desired_align - align) / 2 + size_needed)
27665 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27666 else
27667 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27668 }
27669 }
27670 /* Ensure that alignment prologue won't copy past end of block. */
27671 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27672 {
27673 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27674 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27675 Make sure it is power of 2. */
27676 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27677
27678 /* To improve performance of small blocks, we jump around the VAL
27679 promotion. This means that if the promoted VAL is not constant,
27680 we might not use it in the epilogue and have to use the byte
27681 loop variant. */
27682 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27683 force_loopy_epilogue = true;
27684 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27685 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27686 {
27687 /* If main algorithm works on QImode, no epilogue is needed.
27688 For small sizes just don't align anything. */
27689 if (size_needed == 1)
27690 desired_align = align;
27691 else
27692 goto epilogue;
27693 }
27694 else if (!count
27695 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27696 {
27697 label = gen_label_rtx ();
27698 emit_cmp_and_jump_insns (count_exp,
27699 GEN_INT (epilogue_size_needed),
27700 LTU, 0, counter_mode (count_exp), 1, label);
27701 if (expected_size == -1 || expected_size < epilogue_size_needed)
27702 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27703 else
27704 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27705 }
27706 }
27707
27708 /* Emit code to decide on runtime whether library call or inline should be
27709 used. */
27710 if (dynamic_check != -1)
27711 {
27712 if (!issetmem && CONST_INT_P (count_exp))
27713 {
27714 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27715 {
27716 emit_block_copy_via_libcall (dst, src, count_exp);
27717 count_exp = const0_rtx;
27718 goto epilogue;
27719 }
27720 }
27721 else
27722 {
27723 rtx_code_label *hot_label = gen_label_rtx ();
27724 if (jump_around_label == NULL_RTX)
27725 jump_around_label = gen_label_rtx ();
27726 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27727 LEU, 0, counter_mode (count_exp),
27728 1, hot_label);
27729 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27730 if (issetmem)
27731 set_storage_via_libcall (dst, count_exp, val_exp);
27732 else
27733 emit_block_copy_via_libcall (dst, src, count_exp);
27734 emit_jump (jump_around_label);
27735 emit_label (hot_label);
27736 }
27737 }
27738
27739 /* Step 2: Alignment prologue. */
27740 /* Do the expensive promotion once we branched off the small blocks. */
27741 if (issetmem && !promoted_val)
27742 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27743 desired_align, align);
27744
27745 if (desired_align > align && !misaligned_prologue_used)
27746 {
27747 if (align_bytes == 0)
27748 {
27749 /* Except for the first move in the prologue, we no longer know
27750 the constant offset in aliasing info. It doesn't seem worth
27751 the pain to maintain it for the first move, so throw away
27752 the info early. */
27753 dst = change_address (dst, BLKmode, destreg);
27754 if (!issetmem)
27755 src = change_address (src, BLKmode, srcreg);
27756 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27757 promoted_val, vec_promoted_val,
27758 count_exp, align, desired_align,
27759 issetmem);
27760 /* At most desired_align - align bytes are copied. */
27761 if (min_size < (unsigned)(desired_align - align))
27762 min_size = 0;
27763 else
27764 min_size -= desired_align - align;
27765 }
27766 else
27767 {
27768 /* If we know how many bytes need to be stored before dst is
27769 sufficiently aligned, maintain aliasing info accurately. */
27770 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27771 srcreg,
27772 promoted_val,
27773 vec_promoted_val,
27774 desired_align,
27775 align_bytes,
27776 issetmem);
27777
27778 count_exp = plus_constant (counter_mode (count_exp),
27779 count_exp, -align_bytes);
27780 count -= align_bytes;
27781 min_size -= align_bytes;
27782 max_size -= align_bytes;
27783 }
27784 if (need_zero_guard
27785 && min_size < (unsigned HOST_WIDE_INT) size_needed
27786 && (count < (unsigned HOST_WIDE_INT) size_needed
27787 || (align_bytes == 0
27788 && count < ((unsigned HOST_WIDE_INT) size_needed
27789 + desired_align - align))))
27790 {
27791 /* It is possible that we copied enough so the main loop will not
27792 execute. */
27793 gcc_assert (size_needed > 1);
27794 if (label == NULL_RTX)
27795 label = gen_label_rtx ();
27796 emit_cmp_and_jump_insns (count_exp,
27797 GEN_INT (size_needed),
27798 LTU, 0, counter_mode (count_exp), 1, label);
27799 if (expected_size == -1
27800 || expected_size < (desired_align - align) / 2 + size_needed)
27801 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27802 else
27803 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27804 }
27805 }
27806 if (label && size_needed == 1)
27807 {
27808 emit_label (label);
27809 LABEL_NUSES (label) = 1;
27810 label = NULL;
27811 epilogue_size_needed = 1;
27812 if (issetmem)
27813 promoted_val = val_exp;
27814 }
27815 else if (label == NULL_RTX && !misaligned_prologue_used)
27816 epilogue_size_needed = size_needed;
27817
27818 /* Step 3: Main loop. */
27819
27820 switch (alg)
27821 {
27822 case libcall:
27823 case no_stringop:
27824 case last_alg:
27825 gcc_unreachable ();
27826 case loop_1_byte:
27827 case loop:
27828 case unrolled_loop:
27829 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27830 count_exp, move_mode, unroll_factor,
27831 expected_size, issetmem);
27832 break;
27833 case vector_loop:
27834 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27835 vec_promoted_val, count_exp, move_mode,
27836 unroll_factor, expected_size, issetmem);
27837 break;
27838 case rep_prefix_8_byte:
27839 case rep_prefix_4_byte:
27840 case rep_prefix_1_byte:
27841 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27842 val_exp, count_exp, move_mode, issetmem);
27843 break;
27844 }
27845 /* Properly adjust the offset of src and dest memory for aliasing. */
27846 if (CONST_INT_P (count_exp))
27847 {
27848 if (!issetmem)
27849 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27850 (count / size_needed) * size_needed);
27851 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27852 (count / size_needed) * size_needed);
27853 }
27854 else
27855 {
27856 if (!issetmem)
27857 src = change_address (src, BLKmode, srcreg);
27858 dst = change_address (dst, BLKmode, destreg);
27859 }
27860
27861 /* Step 4: Epilogue to copy the remaining bytes. */
27862 epilogue:
27863 if (label)
27864 {
27865 /* When the main loop is done, COUNT_EXP might hold original count,
27866 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27867 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27868 bytes. Compensate if needed. */
27869
27870 if (size_needed < epilogue_size_needed)
27871 {
27872 tmp =
27873 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27874 GEN_INT (size_needed - 1), count_exp, 1,
27875 OPTAB_DIRECT);
27876 if (tmp != count_exp)
27877 emit_move_insn (count_exp, tmp);
27878 }
27879 emit_label (label);
27880 LABEL_NUSES (label) = 1;
27881 }
27882
27883 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27884 {
27885 if (force_loopy_epilogue)
27886 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27887 epilogue_size_needed);
27888 else
27889 {
27890 if (issetmem)
27891 expand_setmem_epilogue (dst, destreg, promoted_val,
27892 vec_promoted_val, count_exp,
27893 epilogue_size_needed);
27894 else
27895 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27896 epilogue_size_needed);
27897 }
27898 }
27899 if (jump_around_label)
27900 emit_label (jump_around_label);
27901 return true;
27902 }
27903
27904
27905 /* Expand the appropriate insns for doing strlen if not just doing
27906 repnz; scasb
27907
27908 out = result, initialized with the start address
27909 align_rtx = alignment of the address.
27910 scratch = scratch register, initialized with the start address when
27911 not aligned, otherwise undefined
27912
27913 This is just the body. It needs the initializations mentioned above and
27914 some address computing at the end. These things are done in i386.md. */
27915
27916 static void
27917 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27918 {
27919 int align;
27920 rtx tmp;
27921 rtx_code_label *align_2_label = NULL;
27922 rtx_code_label *align_3_label = NULL;
27923 rtx_code_label *align_4_label = gen_label_rtx ();
27924 rtx_code_label *end_0_label = gen_label_rtx ();
27925 rtx mem;
27926 rtx tmpreg = gen_reg_rtx (SImode);
27927 rtx scratch = gen_reg_rtx (SImode);
27928 rtx cmp;
27929
27930 align = 0;
27931 if (CONST_INT_P (align_rtx))
27932 align = INTVAL (align_rtx);
27933
27934 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27935
27936 /* Is there a known alignment and is it less than 4? */
27937 if (align < 4)
27938 {
27939 rtx scratch1 = gen_reg_rtx (Pmode);
27940 emit_move_insn (scratch1, out);
27941 /* Is there a known alignment and is it not 2? */
27942 if (align != 2)
27943 {
27944 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27945 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27946
27947 /* Leave just the 3 lower bits. */
27948 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27949 NULL_RTX, 0, OPTAB_WIDEN);
27950
27951 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27952 Pmode, 1, align_4_label);
27953 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27954 Pmode, 1, align_2_label);
27955 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27956 Pmode, 1, align_3_label);
27957 }
27958 else
27959 {
27960 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27961 check whether it is aligned to 4 bytes. */
27962
27963 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27964 NULL_RTX, 0, OPTAB_WIDEN);
27965
27966 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27967 Pmode, 1, align_4_label);
27968 }
27969
27970 mem = change_address (src, QImode, out);
27971
27972 /* Now compare the bytes. */
27973
27974 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
27975 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27976 QImode, 1, end_0_label);
27977
27978 /* Increment the address. */
27979 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27980
27981 /* Not needed with an alignment of 2 */
27982 if (align != 2)
27983 {
27984 emit_label (align_2_label);
27985
27986 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27987 end_0_label);
27988
27989 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27990
27991 emit_label (align_3_label);
27992 }
27993
27994 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27995 end_0_label);
27996
27997 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27998 }
27999
28000 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28001 align this loop; it only makes the program larger and does not help
28002 speed. */
28003 emit_label (align_4_label);
28004
28005 mem = change_address (src, SImode, out);
28006 emit_move_insn (scratch, mem);
28007 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28008
28009 /* This formula yields a nonzero result iff one of the bytes is zero.
28010 This saves three branches inside the loop and many cycles. */
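/* For instance, if scratch is 0x41410041 (a zero in byte 1), then
   scratch - 0x01010101 == 0x403fff40 and ~scratch == 0xbebeffbe; the two
   ANDs below leave 0x00008000, which is nonzero.  A value with no zero
   byte, such as 0x41414141, yields 0 and the loop continues.  */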
28011
28012 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28013 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28014 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28015 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28016 gen_int_mode (0x80808080, SImode)));
28017 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28018 align_4_label);
28019
28020 if (TARGET_CMOVE)
28021 {
28022 rtx reg = gen_reg_rtx (SImode);
28023 rtx reg2 = gen_reg_rtx (Pmode);
28024 emit_move_insn (reg, tmpreg);
28025 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28026
28027 /* If zero is not in the first two bytes, move two bytes forward. */
28028 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28029 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28030 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28031 emit_insn (gen_rtx_SET (tmpreg,
28032 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28033 reg,
28034 tmpreg)));
28035 /* Emit lea manually to avoid clobbering of flags. */
28036 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28037
28038 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28039 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28040 emit_insn (gen_rtx_SET (out,
28041 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28042 reg2,
28043 out)));
28044 }
28045 else
28046 {
28047 rtx_code_label *end_2_label = gen_label_rtx ();
28048 /* Is zero in the first two bytes? */
28049
28050 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28051 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28052 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28053 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28054 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28055 pc_rtx);
28056 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28057 JUMP_LABEL (tmp) = end_2_label;
28058
28059 /* Not in the first two. Move two bytes forward. */
28060 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28061 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28062
28063 emit_label (end_2_label);
28064
28065 }
28066
28067 /* Avoid branch in fixing the byte. */
28068 tmpreg = gen_lowpart (QImode, tmpreg);
28069 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28070 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28071 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28072 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28073
28074 emit_label (end_0_label);
28075 }
28076
28077 /* Expand strlen. */
28078
28079 bool
28080 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28081 {
28082 rtx addr, scratch1, scratch2, scratch3, scratch4;
28083
28084 /* The generic case of the strlen expander is long. Avoid expanding it
28085 unless TARGET_INLINE_ALL_STRINGOPS. */
28086
28087 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28088 && !TARGET_INLINE_ALL_STRINGOPS
28089 && !optimize_insn_for_size_p ()
28090 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28091 return false;
28092
28093 addr = force_reg (Pmode, XEXP (src, 0));
28094 scratch1 = gen_reg_rtx (Pmode);
28095
28096 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28097 && !optimize_insn_for_size_p ())
28098 {
28099 /* Well it seems that some optimizer does not combine a call like
28100 foo(strlen(bar), strlen(bar));
28101 when the move and the subtraction are done here. It does calculate
28102 the length just once when these instructions are done inside
28103 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28104 often used and I use one fewer register for the lifetime of
28105 output_strlen_unroll() this is better. */
28106
28107 emit_move_insn (out, addr);
28108
28109 ix86_expand_strlensi_unroll_1 (out, src, align);
28110
28111 /* strlensi_unroll_1 returns the address of the zero at the end of
28112 the string, like memchr(), so compute the length by subtracting
28113 the start address. */
28114 emit_insn (ix86_gen_sub3 (out, out, addr));
28115 }
28116 else
28117 {
28118 rtx unspec;
28119
28120 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28121 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28122 return false;
28123 /* Can't use this for non-default address spaces. */
28124 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28125 return false;
28126
28127 scratch2 = gen_reg_rtx (Pmode);
28128 scratch3 = gen_reg_rtx (Pmode);
28129 scratch4 = force_reg (Pmode, constm1_rtx);
28130
28131 emit_move_insn (scratch3, addr);
28132 eoschar = force_reg (QImode, eoschar);
28133
28134 src = replace_equiv_address_nv (src, scratch3);
28135
28136 /* If .md starts supporting :P, this can be done in .md. */
28137 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28138 scratch4), UNSPEC_SCAS);
28139 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28140 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28141 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28142 }
28143 return true;
28144 }
28145
28146 /* For a given symbol (function), construct code to compute the address of
28147 its PLT entry in the large x86-64 PIC model. */
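/* The expansion below is roughly equivalent to (register names are
   illustrative only):
       movabs $symbol@PLTOFF, %reg
       add    <PIC base register>, %reg
   after which the caller branches through %reg.  */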
28148 static rtx
28149 construct_plt_address (rtx symbol)
28150 {
28151 rtx tmp, unspec;
28152
28153 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28154 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28155 gcc_assert (Pmode == DImode);
28156
28157 tmp = gen_reg_rtx (Pmode);
28158 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28159
28160 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28161 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28162 return tmp;
28163 }
28164
28165 rtx
28166 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28167 rtx callarg2,
28168 rtx pop, bool sibcall)
28169 {
28170 rtx vec[3];
28171 rtx use = NULL, call;
28172 unsigned int vec_len = 0;
28173 tree fndecl;
28174
28175 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28176 {
28177 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28178 if (fndecl
28179 && (lookup_attribute ("interrupt",
28180 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28181 error ("interrupt service routine can't be called directly");
28182 }
28183 else
28184 fndecl = NULL_TREE;
28185
28186 if (pop == const0_rtx)
28187 pop = NULL;
28188 gcc_assert (!TARGET_64BIT || !pop);
28189
28190 if (TARGET_MACHO && !TARGET_64BIT)
28191 {
28192 #if TARGET_MACHO
28193 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28194 fnaddr = machopic_indirect_call_target (fnaddr);
28195 #endif
28196 }
28197 else
28198 {
28199 /* Static functions and indirect calls don't need the pic register. Also,
28200 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28201 it an indirect call. */
28202 rtx addr = XEXP (fnaddr, 0);
28203 if (flag_pic
28204 && GET_CODE (addr) == SYMBOL_REF
28205 && !SYMBOL_REF_LOCAL_P (addr))
28206 {
28207 if (flag_plt
28208 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28209 || !lookup_attribute ("noplt",
28210 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28211 {
28212 if (!TARGET_64BIT
28213 || (ix86_cmodel == CM_LARGE_PIC
28214 && DEFAULT_ABI != MS_ABI))
28215 {
28216 use_reg (&use, gen_rtx_REG (Pmode,
28217 REAL_PIC_OFFSET_TABLE_REGNUM));
28218 if (ix86_use_pseudo_pic_reg ())
28219 emit_move_insn (gen_rtx_REG (Pmode,
28220 REAL_PIC_OFFSET_TABLE_REGNUM),
28221 pic_offset_table_rtx);
28222 }
28223 }
28224 else if (!TARGET_PECOFF && !TARGET_MACHO)
28225 {
28226 if (TARGET_64BIT)
28227 {
28228 fnaddr = gen_rtx_UNSPEC (Pmode,
28229 gen_rtvec (1, addr),
28230 UNSPEC_GOTPCREL);
28231 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28232 }
28233 else
28234 {
28235 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28236 UNSPEC_GOT);
28237 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28238 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28239 fnaddr);
28240 }
28241 fnaddr = gen_const_mem (Pmode, fnaddr);
28242 /* Pmode may not be the same as word_mode for x32, which
28243 doesn't support indirect branch via 32-bit memory slot.
28244 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28245 indirect branch via x32 GOT slot is OK. */
28246 if (GET_MODE (fnaddr) != word_mode)
28247 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28248 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28249 }
28250 }
28251 }
28252
28253 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28254 parameters passed in vector registers. */
28255 if (TARGET_64BIT
28256 && (INTVAL (callarg2) > 0
28257 || (INTVAL (callarg2) == 0
28258 && (TARGET_SSE || !flag_skip_rax_setup))))
28259 {
28260 rtx al = gen_rtx_REG (QImode, AX_REG);
28261 emit_move_insn (al, callarg2);
28262 use_reg (&use, al);
28263 }
28264
28265 if (ix86_cmodel == CM_LARGE_PIC
28266 && !TARGET_PECOFF
28267 && MEM_P (fnaddr)
28268 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28269 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28270 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28271 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28272 branch via x32 GOT slot is OK. */
28273 else if (!(TARGET_X32
28274 && MEM_P (fnaddr)
28275 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28276 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28277 && (sibcall
28278 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28279 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28280 {
28281 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28282 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28283 }
28284
28285 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28286
28287 if (retval)
28288 {
28289 /* We should add bounds as a destination register in case
28290 a pointer with bounds may be returned. */
28291 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28292 {
28293 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28294 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28295 if (GET_CODE (retval) == PARALLEL)
28296 {
28297 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28298 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28299 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28300 retval = chkp_join_splitted_slot (retval, par);
28301 }
28302 else
28303 {
28304 retval = gen_rtx_PARALLEL (VOIDmode,
28305 gen_rtvec (3, retval, b0, b1));
28306 chkp_put_regs_to_expr_list (retval);
28307 }
28308 }
28309
28310 call = gen_rtx_SET (retval, call);
28311 }
28312 vec[vec_len++] = call;
28313
28314 if (pop)
28315 {
28316 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28317 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28318 vec[vec_len++] = pop;
28319 }
28320
28321 if (cfun->machine->no_caller_saved_registers
28322 && (!fndecl
28323 || (!TREE_THIS_VOLATILE (fndecl)
28324 && !lookup_attribute ("no_caller_saved_registers",
28325 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28326 {
28327 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28328 bool is_64bit_ms_abi = (TARGET_64BIT
28329 && ix86_function_abi (fndecl) == MS_ABI);
28330 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28331
28332 /* If there are no caller-saved registers, add all registers
28333 that are clobbered by the call which returns. */
28334 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28335 if (!fixed_regs[i]
28336 && (ix86_call_used_regs[i] == 1
28337 || (ix86_call_used_regs[i] & c_mask))
28338 && !STACK_REGNO_P (i)
28339 && !MMX_REGNO_P (i))
28340 clobber_reg (&use,
28341 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28342 }
28343 else if (TARGET_64BIT_MS_ABI
28344 && (!callarg2 || INTVAL (callarg2) != -2))
28345 {
28346 int const cregs_size
28347 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28348 int i;
28349
28350 for (i = 0; i < cregs_size; i++)
28351 {
28352 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28353 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28354
28355 clobber_reg (&use, gen_rtx_REG (mode, regno));
28356 }
28357 }
28358
28359 if (vec_len > 1)
28360 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28361 call = emit_call_insn (call);
28362 if (use)
28363 CALL_INSN_FUNCTION_USAGE (call) = use;
28364
28365 return call;
28366 }
28367
28368 /* Return true if the function being called was marked with attribute
28369 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28370 to handle the non-PIC case in the backend because there is no easy
28371 interface for the front-end to force non-PLT calls to use the GOT.
28372 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28373 to call the function marked "noplt" indirectly. */
28374
28375 static bool
28376 ix86_nopic_noplt_attribute_p (rtx call_op)
28377 {
28378 if (flag_pic || ix86_cmodel == CM_LARGE
28379 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28380 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28381 || SYMBOL_REF_LOCAL_P (call_op))
28382 return false;
28383
28384 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28385
28386 if (!flag_plt
28387 || (symbol_decl != NULL_TREE
28388 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28389 return true;
28390
28391 return false;
28392 }
28393
28394 /* Output the assembly for a call instruction. */
28395
28396 const char *
28397 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28398 {
28399 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28400 bool seh_nop_p = false;
28401 const char *xasm;
28402
28403 if (SIBLING_CALL_P (insn))
28404 {
28405 if (direct_p)
28406 {
28407 if (ix86_nopic_noplt_attribute_p (call_op))
28408 {
28409 if (TARGET_64BIT)
28410 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28411 else
28412 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28413 }
28414 else
28415 xasm = "%!jmp\t%P0";
28416 }
28417 /* SEH epilogue detection requires the indirect branch case
28418 to include REX.W. */
28419 else if (TARGET_SEH)
28420 xasm = "%!rex.W jmp\t%A0";
28421 else
28422 xasm = "%!jmp\t%A0";
28423
28424 output_asm_insn (xasm, &call_op);
28425 return "";
28426 }
28427
28428 /* SEH unwinding can require an extra nop to be emitted in several
28429 circumstances. Determine if we have one of those. */
28430 if (TARGET_SEH)
28431 {
28432 rtx_insn *i;
28433
28434 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28435 {
28436 /* If we get to another real insn, we don't need the nop. */
28437 if (INSN_P (i))
28438 break;
28439
28440 /* If we get to the epilogue note, prevent a catch region from
28441 being adjacent to the standard epilogue sequence. If non-
28442 call-exceptions, we'll have done this during epilogue emission. */
28443 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28444 && !flag_non_call_exceptions
28445 && !can_throw_internal (insn))
28446 {
28447 seh_nop_p = true;
28448 break;
28449 }
28450 }
28451
28452 /* If we didn't find a real insn following the call, prevent the
28453 unwinder from looking into the next function. */
28454 if (i == NULL)
28455 seh_nop_p = true;
28456 }
28457
28458 if (direct_p)
28459 {
28460 if (ix86_nopic_noplt_attribute_p (call_op))
28461 {
28462 if (TARGET_64BIT)
28463 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28464 else
28465 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28466 }
28467 else
28468 xasm = "%!call\t%P0";
28469 }
28470 else
28471 xasm = "%!call\t%A0";
28472
28473 output_asm_insn (xasm, &call_op);
28474
28475 if (seh_nop_p)
28476 return "nop";
28477
28478 return "";
28479 }
28480 \f
28481 /* Clear stack slot assignments remembered from previous functions.
28482 This is called from INIT_EXPANDERS once before RTL is emitted for each
28483 function. */
28484
28485 static struct machine_function *
28486 ix86_init_machine_status (void)
28487 {
28488 struct machine_function *f;
28489
28490 f = ggc_cleared_alloc<machine_function> ();
28491 f->use_fast_prologue_epilogue_nregs = -1;
28492 f->call_abi = ix86_abi;
28493
28494 return f;
28495 }
28496
28497 /* Return a MEM corresponding to a stack slot with mode MODE.
28498 Allocate a new slot if necessary.
28499
28500 The RTL for a function can have several slots available: N is
28501 which slot to use. */
28502
28503 rtx
28504 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28505 {
28506 struct stack_local_entry *s;
28507
28508 gcc_assert (n < MAX_386_STACK_LOCALS);
28509
28510 for (s = ix86_stack_locals; s; s = s->next)
28511 if (s->mode == mode && s->n == n)
28512 return validize_mem (copy_rtx (s->rtl));
28513
28514 s = ggc_alloc<stack_local_entry> ();
28515 s->n = n;
28516 s->mode = mode;
28517 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28518
28519 s->next = ix86_stack_locals;
28520 ix86_stack_locals = s;
28521 return validize_mem (copy_rtx (s->rtl));
28522 }
28523
28524 static void
28525 ix86_instantiate_decls (void)
28526 {
28527 struct stack_local_entry *s;
28528
28529 for (s = ix86_stack_locals; s; s = s->next)
28530 if (s->rtl != NULL_RTX)
28531 instantiate_decl_rtl (s->rtl);
28532 }
28533 \f
28534 /* Return the number used for encoding REG, in the range 0..7. */
28535
28536 static int
28537 reg_encoded_number (rtx reg)
28538 {
28539 unsigned regno = REGNO (reg);
28540 switch (regno)
28541 {
28542 case AX_REG:
28543 return 0;
28544 case CX_REG:
28545 return 1;
28546 case DX_REG:
28547 return 2;
28548 case BX_REG:
28549 return 3;
28550 case SP_REG:
28551 return 4;
28552 case BP_REG:
28553 return 5;
28554 case SI_REG:
28555 return 6;
28556 case DI_REG:
28557 return 7;
28558 default:
28559 break;
28560 }
28561 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28562 return regno - FIRST_STACK_REG;
28563 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28564 return regno - FIRST_SSE_REG;
28565 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28566 return regno - FIRST_MMX_REG;
28567 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28568 return regno - FIRST_REX_SSE_REG;
28569 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28570 return regno - FIRST_REX_INT_REG;
28571 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28572 return regno - FIRST_MASK_REG;
28573 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28574 return regno - FIRST_BND_REG;
28575 return -1;
28576 }
28577
28578 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28579 in its encoding if it could be relevant for ROP mitigation, otherwise
28580 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28581 used for calculating it into them. */
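/* For example, a MODRM_CLASS_OP01 instruction whose operand 0 is %edx
   (encoding 2) and whose operand 1 is %ecx (encoding 1) yields
   0xc0 + (1 << 3) + 2 == 0xca.  */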
28582
28583 static int
28584 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28585 int *popno0 = 0, int *popno1 = 0)
28586 {
28587 if (asm_noperands (PATTERN (insn)) >= 0)
28588 return -1;
28589 int has_modrm = get_attr_modrm (insn);
28590 if (!has_modrm)
28591 return -1;
28592 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28593 rtx op0, op1;
28594 switch (cls)
28595 {
28596 case MODRM_CLASS_OP02:
28597 gcc_assert (noperands >= 3);
28598 if (popno0)
28599 {
28600 *popno0 = 0;
28601 *popno1 = 2;
28602 }
28603 op0 = operands[0];
28604 op1 = operands[2];
28605 break;
28606 case MODRM_CLASS_OP01:
28607 gcc_assert (noperands >= 2);
28608 if (popno0)
28609 {
28610 *popno0 = 0;
28611 *popno1 = 1;
28612 }
28613 op0 = operands[0];
28614 op1 = operands[1];
28615 break;
28616 default:
28617 return -1;
28618 }
28619 if (REG_P (op0) && REG_P (op1))
28620 {
28621 int enc0 = reg_encoded_number (op0);
28622 int enc1 = reg_encoded_number (op1);
28623 return 0xc0 + (enc1 << 3) + enc0;
28624 }
28625 return -1;
28626 }
28627
28628 /* Check whether x86 address PARTS is a pc-relative address. */
28629
28630 static bool
28631 rip_relative_addr_p (struct ix86_address *parts)
28632 {
28633 rtx base, index, disp;
28634
28635 base = parts->base;
28636 index = parts->index;
28637 disp = parts->disp;
28638
28639 if (disp && !base && !index)
28640 {
28641 if (TARGET_64BIT)
28642 {
28643 rtx symbol = disp;
28644
28645 if (GET_CODE (disp) == CONST)
28646 symbol = XEXP (disp, 0);
28647 if (GET_CODE (symbol) == PLUS
28648 && CONST_INT_P (XEXP (symbol, 1)))
28649 symbol = XEXP (symbol, 0);
28650
28651 if (GET_CODE (symbol) == LABEL_REF
28652 || (GET_CODE (symbol) == SYMBOL_REF
28653 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28654 || (GET_CODE (symbol) == UNSPEC
28655 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28656 || XINT (symbol, 1) == UNSPEC_PCREL
28657 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28658 return true;
28659 }
28660 }
28661 return false;
28662 }
28663
28664 /* Calculate the length of the memory address in the instruction encoding.
28665 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28666 or other prefixes. We never generate addr32 prefix for LEA insn. */
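/* For example, 4(%esp) decomposes into base = esp with a disp8, which needs
   both the one-byte displacement and a SIB byte, so this returns 2; a plain
   symbolic disp32 with no base or index returns at least 4.  */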
28667
28668 int
28669 memory_address_length (rtx addr, bool lea)
28670 {
28671 struct ix86_address parts;
28672 rtx base, index, disp;
28673 int len;
28674 int ok;
28675
28676 if (GET_CODE (addr) == PRE_DEC
28677 || GET_CODE (addr) == POST_INC
28678 || GET_CODE (addr) == PRE_MODIFY
28679 || GET_CODE (addr) == POST_MODIFY)
28680 return 0;
28681
28682 ok = ix86_decompose_address (addr, &parts);
28683 gcc_assert (ok);
28684
28685 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28686
28687 /* If this is not LEA instruction, add the length of addr32 prefix. */
28688 if (TARGET_64BIT && !lea
28689 && (SImode_address_operand (addr, VOIDmode)
28690 || (parts.base && GET_MODE (parts.base) == SImode)
28691 || (parts.index && GET_MODE (parts.index) == SImode)))
28692 len++;
28693
28694 base = parts.base;
28695 index = parts.index;
28696 disp = parts.disp;
28697
28698 if (base && SUBREG_P (base))
28699 base = SUBREG_REG (base);
28700 if (index && SUBREG_P (index))
28701 index = SUBREG_REG (index);
28702
28703 gcc_assert (base == NULL_RTX || REG_P (base));
28704 gcc_assert (index == NULL_RTX || REG_P (index));
28705
28706 /* Rule of thumb:
28707 - esp as the base always wants an index,
28708 - ebp as the base always wants a displacement,
28709 - r12 as the base always wants an index,
28710 - r13 as the base always wants a displacement. */
28711
28712 /* Register Indirect. */
28713 if (base && !index && !disp)
28714 {
28715 /* esp (for its index) and ebp (for its displacement) need
28716 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28717 code. */
28718 if (base == arg_pointer_rtx
28719 || base == frame_pointer_rtx
28720 || REGNO (base) == SP_REG
28721 || REGNO (base) == BP_REG
28722 || REGNO (base) == R12_REG
28723 || REGNO (base) == R13_REG)
28724 len++;
28725 }
28726
28727 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28728 is not disp32, but disp32(%rip), so for disp32
28729 SIB byte is needed, unless print_operand_address
28730 optimizes it into disp32(%rip) or (%rip) is implied
28731 by UNSPEC. */
28732 else if (disp && !base && !index)
28733 {
28734 len += 4;
28735 if (!rip_relative_addr_p (&parts))
28736 len++;
28737 }
28738 else
28739 {
28740 /* Find the length of the displacement constant. */
28741 if (disp)
28742 {
28743 if (base && satisfies_constraint_K (disp))
28744 len += 1;
28745 else
28746 len += 4;
28747 }
28748 /* ebp always wants a displacement. Similarly r13. */
28749 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28750 len++;
28751
28752 /* An index requires the two-byte modrm form.... */
28753 if (index
28754 /* ...like esp (or r12), which always wants an index. */
28755 || base == arg_pointer_rtx
28756 || base == frame_pointer_rtx
28757 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28758 len++;
28759 }
28760
28761 return len;
28762 }
28763
28764 /* Compute the default value for the "length_immediate" attribute.  When SHORTFORM
28765 is set, expect the insn to have an 8-bit immediate alternative.  */
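/* For example, with SHORTFORM set, an SImode insn with immediate 100 fits the
   sign-extended 8-bit alternative and yields 1, while immediate 1000 needs the
   full 32-bit encoding and yields 4.  */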
28766 int
28767 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28768 {
28769 int len = 0;
28770 int i;
28771 extract_insn_cached (insn);
28772 for (i = recog_data.n_operands - 1; i >= 0; --i)
28773 if (CONSTANT_P (recog_data.operand[i]))
28774 {
28775 enum attr_mode mode = get_attr_mode (insn);
28776
28777 gcc_assert (!len);
28778 if (shortform && CONST_INT_P (recog_data.operand[i]))
28779 {
28780 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28781 switch (mode)
28782 {
28783 case MODE_QI:
28784 len = 1;
28785 continue;
28786 case MODE_HI:
28787 ival = trunc_int_for_mode (ival, HImode);
28788 break;
28789 case MODE_SI:
28790 ival = trunc_int_for_mode (ival, SImode);
28791 break;
28792 default:
28793 break;
28794 }
28795 if (IN_RANGE (ival, -128, 127))
28796 {
28797 len = 1;
28798 continue;
28799 }
28800 }
28801 switch (mode)
28802 {
28803 case MODE_QI:
28804 len = 1;
28805 break;
28806 case MODE_HI:
28807 len = 2;
28808 break;
28809 case MODE_SI:
28810 len = 4;
28811 break;
28812 /* Immediates for DImode instructions are encoded
28813 as 32bit sign extended values. */
28814 case MODE_DI:
28815 len = 4;
28816 break;
28817 default:
28818 fatal_insn ("unknown insn mode", insn);
28819 }
28820 }
28821 return len;
28822 }
28823
28824 /* Compute default value for "length_address" attribute. */
28825 int
28826 ix86_attr_length_address_default (rtx_insn *insn)
28827 {
28828 int i;
28829
28830 if (get_attr_type (insn) == TYPE_LEA)
28831 {
28832 rtx set = PATTERN (insn), addr;
28833
28834 if (GET_CODE (set) == PARALLEL)
28835 set = XVECEXP (set, 0, 0);
28836
28837 gcc_assert (GET_CODE (set) == SET);
28838
28839 addr = SET_SRC (set);
28840
28841 return memory_address_length (addr, true);
28842 }
28843
28844 extract_insn_cached (insn);
28845 for (i = recog_data.n_operands - 1; i >= 0; --i)
28846 {
28847 rtx op = recog_data.operand[i];
28848 if (MEM_P (op))
28849 {
28850 constrain_operands_cached (insn, reload_completed);
28851 if (which_alternative != -1)
28852 {
28853 const char *constraints = recog_data.constraints[i];
28854 int alt = which_alternative;
28855
28856 while (*constraints == '=' || *constraints == '+')
28857 constraints++;
28858 while (alt-- > 0)
28859 while (*constraints++ != ',')
28860 ;
28861 /* Skip ignored operands. */
28862 if (*constraints == 'X')
28863 continue;
28864 }
28865
28866 int len = memory_address_length (XEXP (op, 0), false);
28867
28868 /* Account for segment prefix for non-default addr spaces. */
28869 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28870 len++;
28871
28872 return len;
28873 }
28874 }
28875 return 0;
28876 }
28877
28878 /* Compute default value for "length_vex" attribute. It includes
28879 2 or 3 byte VEX prefix and 1 opcode byte. */
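/* The 2-byte VEX prefix (0xc5) cannot encode the VEX.W bit or the X/B
   register-extension bits, so insns that need any of them must use the
   3-byte form (0xc4).  */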
28880
28881 int
28882 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28883 bool has_vex_w)
28884 {
28885 int i;
28886
28887 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
28888 requires the 3-byte VEX prefix.  */
28889 if (!has_0f_opcode || has_vex_w)
28890 return 3 + 1;
28891
28892 /* We can always use 2 byte VEX prefix in 32bit. */
28893 if (!TARGET_64BIT)
28894 return 2 + 1;
28895
28896 extract_insn_cached (insn);
28897
28898 for (i = recog_data.n_operands - 1; i >= 0; --i)
28899 if (REG_P (recog_data.operand[i]))
28900 {
28901 /* REX.W bit uses 3 byte VEX prefix. */
28902 if (GET_MODE (recog_data.operand[i]) == DImode
28903 && GENERAL_REG_P (recog_data.operand[i]))
28904 return 3 + 1;
28905 }
28906 else
28907 {
28908 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28909 if (MEM_P (recog_data.operand[i])
28910 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28911 return 3 + 1;
28912 }
28913
28914 return 2 + 1;
28915 }
28916 \f
28917 /* Return the maximum number of instructions a cpu can issue. */
28918
28919 static int
28920 ix86_issue_rate (void)
28921 {
28922 switch (ix86_tune)
28923 {
28924 case PROCESSOR_PENTIUM:
28925 case PROCESSOR_LAKEMONT:
28926 case PROCESSOR_BONNELL:
28927 case PROCESSOR_SILVERMONT:
28928 case PROCESSOR_KNL:
28929 case PROCESSOR_INTEL:
28930 case PROCESSOR_K6:
28931 case PROCESSOR_BTVER2:
28932 case PROCESSOR_PENTIUM4:
28933 case PROCESSOR_NOCONA:
28934 return 2;
28935
28936 case PROCESSOR_PENTIUMPRO:
28937 case PROCESSOR_ATHLON:
28938 case PROCESSOR_K8:
28939 case PROCESSOR_AMDFAM10:
28940 case PROCESSOR_GENERIC:
28941 case PROCESSOR_BTVER1:
28942 return 3;
28943
28944 case PROCESSOR_BDVER1:
28945 case PROCESSOR_BDVER2:
28946 case PROCESSOR_BDVER3:
28947 case PROCESSOR_BDVER4:
28948 case PROCESSOR_ZNVER1:
28949 case PROCESSOR_CORE2:
28950 case PROCESSOR_NEHALEM:
28951 case PROCESSOR_SANDYBRIDGE:
28952 case PROCESSOR_HASWELL:
28953 return 4;
28954
28955 default:
28956 return 1;
28957 }
28958 }
28959
28960 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
28961 by DEP_INSN and nothing else set by DEP_INSN.  */
28962
28963 static bool
28964 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
28965 {
28966 rtx set, set2;
28967
28968 /* Simplify the test for uninteresting insns. */
28969 if (insn_type != TYPE_SETCC
28970 && insn_type != TYPE_ICMOV
28971 && insn_type != TYPE_FCMOV
28972 && insn_type != TYPE_IBR)
28973 return false;
28974
28975 if ((set = single_set (dep_insn)) != 0)
28976 {
28977 set = SET_DEST (set);
28978 set2 = NULL_RTX;
28979 }
28980 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
28981 && XVECLEN (PATTERN (dep_insn), 0) == 2
28982 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
28983 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
28984 {
28985 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
28986 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
28987 }
28988 else
28989 return false;
28990
28991 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
28992 return false;
28993
28994 /* This test is true if the dependent insn reads the flags but
28995 not any other potentially set register. */
28996 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
28997 return false;
28998
28999 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29000 return false;
29001
29002 return true;
29003 }
29004
29005 /* Return true iff USE_INSN has a memory address with operands set by
29006 SET_INSN. */
29007
29008 bool
29009 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29010 {
29011 int i;
29012 extract_insn_cached (use_insn);
29013 for (i = recog_data.n_operands - 1; i >= 0; --i)
29014 if (MEM_P (recog_data.operand[i]))
29015 {
29016 rtx addr = XEXP (recog_data.operand[i], 0);
29017 return modified_in_p (addr, set_insn) != 0;
29018 }
29019 return false;
29020 }
29021
29022 /* Helper function for exact_store_load_dependency.
29023 Return true if addr is found in insn. */
29024 static bool
29025 exact_dependency_1 (rtx addr, rtx insn)
29026 {
29027 enum rtx_code code;
29028 const char *format_ptr;
29029 int i, j;
29030
29031 code = GET_CODE (insn);
29032 switch (code)
29033 {
29034 case MEM:
29035 if (rtx_equal_p (addr, insn))
29036 return true;
29037 break;
29038 case REG:
29039 CASE_CONST_ANY:
29040 case SYMBOL_REF:
29041 case CODE_LABEL:
29042 case PC:
29043 case CC0:
29044 case EXPR_LIST:
29045 return false;
29046 default:
29047 break;
29048 }
29049
29050 format_ptr = GET_RTX_FORMAT (code);
29051 for (i = 0; i < GET_RTX_LENGTH (code); i++)
29052 {
29053 switch (*format_ptr++)
29054 {
29055 case 'e':
29056 if (exact_dependency_1 (addr, XEXP (insn, i)))
29057 return true;
29058 break;
29059 case 'E':
29060 for (j = 0; j < XVECLEN (insn, i); j++)
29061 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
29062 return true;
29063 break;
29064 }
29065 }
29066 return false;
29067 }
29068
29069 /* Return true if there is an exact dependency between the store and load, i.e.
29070 the same memory address is used in both.  */
29071 static bool
29072 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
29073 {
29074 rtx set1, set2;
29075
29076 set1 = single_set (store);
29077 if (!set1)
29078 return false;
29079 if (!MEM_P (SET_DEST (set1)))
29080 return false;
29081 set2 = single_set (load);
29082 if (!set2)
29083 return false;
29084 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
29085 return true;
29086 return false;
29087 }
29088
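/* Implement the TARGET_SCHED_ADJUST_COST hook: adjust the scheduler's
   latency estimate COST for the dependence of INSN on DEP_INSN, using
   per-CPU heuristics.  */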
29089 static int
29090 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
29091 unsigned int)
29092 {
29093 enum attr_type insn_type, dep_insn_type;
29094 enum attr_memory memory;
29095 rtx set, set2;
29096 int dep_insn_code_number;
29097
29098 /* Anti and output dependencies have zero cost on all CPUs. */
29099 if (dep_type != 0)
29100 return 0;
29101
29102 dep_insn_code_number = recog_memoized (dep_insn);
29103
29104 /* If we can't recognize the insns, we can't really do anything. */
29105 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
29106 return cost;
29107
29108 insn_type = get_attr_type (insn);
29109 dep_insn_type = get_attr_type (dep_insn);
29110
29111 switch (ix86_tune)
29112 {
29113 case PROCESSOR_PENTIUM:
29114 case PROCESSOR_LAKEMONT:
29115 /* Address Generation Interlock adds a cycle of latency. */
29116 if (insn_type == TYPE_LEA)
29117 {
29118 rtx addr = PATTERN (insn);
29119
29120 if (GET_CODE (addr) == PARALLEL)
29121 addr = XVECEXP (addr, 0, 0);
29122
29123 gcc_assert (GET_CODE (addr) == SET);
29124
29125 addr = SET_SRC (addr);
29126 if (modified_in_p (addr, dep_insn))
29127 cost += 1;
29128 }
29129 else if (ix86_agi_dependent (dep_insn, insn))
29130 cost += 1;
29131
29132 /* ??? Compares pair with jump/setcc. */
29133 if (ix86_flags_dependent (insn, dep_insn, insn_type))
29134 cost = 0;
29135
29136 /* Floating point stores require value to be ready one cycle earlier. */
29137 if (insn_type == TYPE_FMOV
29138 && get_attr_memory (insn) == MEMORY_STORE
29139 && !ix86_agi_dependent (dep_insn, insn))
29140 cost += 1;
29141 break;
29142
29143 case PROCESSOR_PENTIUMPRO:
29144 /* INT->FP conversion is expensive. */
29145 if (get_attr_fp_int_src (dep_insn))
29146 cost += 5;
29147
29148 /* There is one cycle extra latency between an FP op and a store. */
29149 if (insn_type == TYPE_FMOV
29150 && (set = single_set (dep_insn)) != NULL_RTX
29151 && (set2 = single_set (insn)) != NULL_RTX
29152 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
29153 && MEM_P (SET_DEST (set2)))
29154 cost += 1;
29155
29156 memory = get_attr_memory (insn);
29157
29158 /* Show ability of reorder buffer to hide latency of load by executing
29159 in parallel with previous instruction in case
29160 previous instruction is not needed to compute the address. */
29161 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29162 && !ix86_agi_dependent (dep_insn, insn))
29163 {
29164 /* Claim moves take one cycle, as the core can issue one load
29165 at a time and the next load can start a cycle later.  */
29166 if (dep_insn_type == TYPE_IMOV
29167 || dep_insn_type == TYPE_FMOV)
29168 cost = 1;
29169 else if (cost > 1)
29170 cost--;
29171 }
29172 break;
29173
29174 case PROCESSOR_K6:
29175 /* The esp dependency is resolved before
29176 the instruction is really finished. */
29177 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29178 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29179 return 1;
29180
29181 /* INT->FP conversion is expensive. */
29182 if (get_attr_fp_int_src (dep_insn))
29183 cost += 5;
29184
29185 memory = get_attr_memory (insn);
29186
29187 /* Show ability of reorder buffer to hide latency of load by executing
29188 in parallel with previous instruction in case
29189 previous instruction is not needed to compute the address. */
29190 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29191 && !ix86_agi_dependent (dep_insn, insn))
29192 {
29193 /* Claim moves take one cycle, as the core can issue one load
29194 at a time and the next load can start a cycle later.  */
29195 if (dep_insn_type == TYPE_IMOV
29196 || dep_insn_type == TYPE_FMOV)
29197 cost = 1;
29198 else if (cost > 2)
29199 cost -= 2;
29200 else
29201 cost = 1;
29202 }
29203 break;
29204
29205 case PROCESSOR_AMDFAM10:
29206 case PROCESSOR_BDVER1:
29207 case PROCESSOR_BDVER2:
29208 case PROCESSOR_BDVER3:
29209 case PROCESSOR_BDVER4:
29210 case PROCESSOR_ZNVER1:
29211 case PROCESSOR_BTVER1:
29212 case PROCESSOR_BTVER2:
29213 case PROCESSOR_GENERIC:
29214 /* The stack engine allows push and pop instructions to execute in parallel.  */
29215 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29216 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29217 return 0;
29218 /* FALLTHRU */
29219
29220 case PROCESSOR_ATHLON:
29221 case PROCESSOR_K8:
29222 memory = get_attr_memory (insn);
29223
29224 /* Show ability of reorder buffer to hide latency of load by executing
29225 in parallel with previous instruction in case
29226 previous instruction is not needed to compute the address. */
29227 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29228 && !ix86_agi_dependent (dep_insn, insn))
29229 {
29230 enum attr_unit unit = get_attr_unit (insn);
29231 int loadcost = 3;
29232
29233 /* Because of the difference between the length of integer and
29234 floating unit pipeline preparation stages, the memory operands
29235 for floating point are cheaper.
29236
29237 ??? For Athlon the difference is most probably 2.  */
29238 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29239 loadcost = 3;
29240 else
29241 loadcost = TARGET_ATHLON ? 2 : 0;
29242
29243 if (cost >= loadcost)
29244 cost -= loadcost;
29245 else
29246 cost = 0;
29247 }
29248 break;
29249
29250 case PROCESSOR_CORE2:
29251 case PROCESSOR_NEHALEM:
29252 case PROCESSOR_SANDYBRIDGE:
29253 case PROCESSOR_HASWELL:
29254 /* The stack engine allows push and pop instructions to execute in parallel.  */
29255 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29256 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29257 return 0;
29258
29259 memory = get_attr_memory (insn);
29260
29261 /* Show ability of reorder buffer to hide latency of load by executing
29262 in parallel with previous instruction in case
29263 previous instruction is not needed to compute the address. */
29264 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29265 && !ix86_agi_dependent (dep_insn, insn))
29266 {
29267 if (cost >= 4)
29268 cost -= 4;
29269 else
29270 cost = 0;
29271 }
29272 break;
29273
29274 case PROCESSOR_SILVERMONT:
29275 case PROCESSOR_KNL:
29276 case PROCESSOR_INTEL:
29277 if (!reload_completed)
29278 return cost;
29279
29280 /* Increase cost of integer loads. */
29281 memory = get_attr_memory (dep_insn);
29282 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29283 {
29284 enum attr_unit unit = get_attr_unit (dep_insn);
29285 if (unit == UNIT_INTEGER && cost == 1)
29286 {
29287 if (memory == MEMORY_LOAD)
29288 cost = 3;
29289 else
29290 {
29291 /* Increase cost of ld/st for short int types only
29292 because of store forwarding issue. */
29293 rtx set = single_set (dep_insn);
29294 if (set && (GET_MODE (SET_DEST (set)) == QImode
29295 || GET_MODE (SET_DEST (set)) == HImode))
29296 {
29297 /* Increase cost of store/load insn if exact
29298 dependence exists and it is load insn. */
29299 enum attr_memory insn_memory = get_attr_memory (insn);
29300 if (insn_memory == MEMORY_LOAD
29301 && exact_store_load_dependency (dep_insn, insn))
29302 cost = 3;
29303 }
29304 }
29305 }
29306 }
29307
29308 default:
29309 break;
29310 }
29311
29312 return cost;
29313 }
29314
29315 /* How many alternative schedules to try. This should be as wide as the
29316 scheduling freedom in the DFA, but no wider. Making this value too
29317 large results in extra work for the scheduler.  */
29318
29319 static int
29320 ia32_multipass_dfa_lookahead (void)
29321 {
29322 switch (ix86_tune)
29323 {
29324 case PROCESSOR_PENTIUM:
29325 case PROCESSOR_LAKEMONT:
29326 return 2;
29327
29328 case PROCESSOR_PENTIUMPRO:
29329 case PROCESSOR_K6:
29330 return 1;
29331
29332 case PROCESSOR_BDVER1:
29333 case PROCESSOR_BDVER2:
29334 case PROCESSOR_BDVER3:
29335 case PROCESSOR_BDVER4:
29336 /* We use lookahead value 4 for BD both before and after reload
29337 schedules. Plan is to have value 8 included for O3. */
29338 return 4;
29339
29340 case PROCESSOR_CORE2:
29341 case PROCESSOR_NEHALEM:
29342 case PROCESSOR_SANDYBRIDGE:
29343 case PROCESSOR_HASWELL:
29344 case PROCESSOR_BONNELL:
29345 case PROCESSOR_SILVERMONT:
29346 case PROCESSOR_KNL:
29347 case PROCESSOR_INTEL:
29348 /* Generally, we want haifa-sched:max_issue() to look ahead as far
29349 as many instructions can be executed on a cycle, i.e.,
29350 issue_rate. I wonder why tuning for many CPUs does not do this. */
29351 if (reload_completed)
29352 return ix86_issue_rate ();
29353 /* Don't use lookahead for pre-reload schedule to save compile time. */
29354 return 0;
29355
29356 default:
29357 return 0;
29358 }
29359 }
29360
29361 /* Return true if target platform supports macro-fusion. */
29362
29363 static bool
29364 ix86_macro_fusion_p ()
29365 {
29366 return TARGET_FUSE_CMP_AND_BRANCH;
29367 }
29368
29369 /* Check whether the current microarchitecture supports macro fusion
29370 for insn pair "CONDGEN + CONDJMP". Refer to
29371 "Intel Architectures Optimization Reference Manual". */
29372
29373 static bool
29374 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29375 {
29376 rtx src, dest;
29377 enum rtx_code ccode;
29378 rtx compare_set = NULL_RTX, test_if, cond;
29379 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29380
29381 if (!any_condjump_p (condjmp))
29382 return false;
29383
29384 if (get_attr_type (condgen) != TYPE_TEST
29385 && get_attr_type (condgen) != TYPE_ICMP
29386 && get_attr_type (condgen) != TYPE_INCDEC
29387 && get_attr_type (condgen) != TYPE_ALU)
29388 return false;
29389
29390 compare_set = single_set (condgen);
29391 if (compare_set == NULL_RTX
29392 && !TARGET_FUSE_ALU_AND_BRANCH)
29393 return false;
29394
29395 if (compare_set == NULL_RTX)
29396 {
29397 int i;
29398 rtx pat = PATTERN (condgen);
29399 for (i = 0; i < XVECLEN (pat, 0); i++)
29400 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29401 {
29402 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29403 if (GET_CODE (set_src) == COMPARE)
29404 compare_set = XVECEXP (pat, 0, i);
29405 else
29406 alu_set = XVECEXP (pat, 0, i);
29407 }
29408 }
29409 if (compare_set == NULL_RTX)
29410 return false;
29411 src = SET_SRC (compare_set);
29412 if (GET_CODE (src) != COMPARE)
29413 return false;
29414
29415 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29416 supported. */
29417 if ((MEM_P (XEXP (src, 0))
29418 && CONST_INT_P (XEXP (src, 1)))
29419 || (MEM_P (XEXP (src, 1))
29420 && CONST_INT_P (XEXP (src, 0))))
29421 return false;
29422
29423 /* No fusion for RIP-relative address. */
29424 if (MEM_P (XEXP (src, 0)))
29425 addr = XEXP (XEXP (src, 0), 0);
29426 else if (MEM_P (XEXP (src, 1)))
29427 addr = XEXP (XEXP (src, 1), 0);
29428
29429 if (addr) {
29430 ix86_address parts;
29431 int ok = ix86_decompose_address (addr, &parts);
29432 gcc_assert (ok);
29433
29434 if (rip_relative_addr_p (&parts))
29435 return false;
29436 }
29437
29438 test_if = SET_SRC (pc_set (condjmp));
29439 cond = XEXP (test_if, 0);
29440 ccode = GET_CODE (cond);
29441 /* Check whether the conditional jump uses the Sign or Overflow flags.  */
29442 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29443 && (ccode == GE
29444 || ccode == GT
29445 || ccode == LE
29446 || ccode == LT))
29447 return false;
29448
29449 /* Return true for TYPE_TEST and TYPE_ICMP. */
29450 if (get_attr_type (condgen) == TYPE_TEST
29451 || get_attr_type (condgen) == TYPE_ICMP)
29452 return true;
29453
29454 /* The following handles the macro-fusion case for ALU + jmp.  */
29455 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29456 return false;
29457
29458 /* No fusion for alu op with memory destination operand. */
29459 dest = SET_DEST (alu_set);
29460 if (MEM_P (dest))
29461 return false;
29462
29463 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29464 supported. */
29465 if (get_attr_type (condgen) == TYPE_INCDEC
29466 && (ccode == GEU
29467 || ccode == GTU
29468 || ccode == LEU
29469 || ccode == LTU))
29470 return false;
29471
29472 return true;
29473 }
29474
29475 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
29476 execution.  It is applied if
29477 (1) an IMUL instruction is on top of the list;
29478 (2) the ready list contains the only producer of an independent IMUL
29479 instruction.
29480 Return the index of the IMUL producer if it was found, and -1 otherwise.  */
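/* Scheduling that producer directly before the IMUL keeps Bonnell's pipelined
   multiplier busy instead of stalling on back-to-back dependent operations.  */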
29481 static int
29482 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29483 {
29484 rtx_insn *insn;
29485 rtx set, insn1, insn2;
29486 sd_iterator_def sd_it;
29487 dep_t dep;
29488 int index = -1;
29489 int i;
29490
29491 if (!TARGET_BONNELL)
29492 return index;
29493
29494 /* Check that IMUL instruction is on the top of ready list. */
29495 insn = ready[n_ready - 1];
29496 set = single_set (insn);
29497 if (!set)
29498 return index;
29499 if (!(GET_CODE (SET_SRC (set)) == MULT
29500 && GET_MODE (SET_SRC (set)) == SImode))
29501 return index;
29502
29503 /* Search for producer of independent IMUL instruction. */
29504 for (i = n_ready - 2; i >= 0; i--)
29505 {
29506 insn = ready[i];
29507 if (!NONDEBUG_INSN_P (insn))
29508 continue;
29509 /* Skip IMUL instruction. */
29510 insn2 = PATTERN (insn);
29511 if (GET_CODE (insn2) == PARALLEL)
29512 insn2 = XVECEXP (insn2, 0, 0);
29513 if (GET_CODE (insn2) == SET
29514 && GET_CODE (SET_SRC (insn2)) == MULT
29515 && GET_MODE (SET_SRC (insn2)) == SImode)
29516 continue;
29517
29518 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29519 {
29520 rtx con;
29521 con = DEP_CON (dep);
29522 if (!NONDEBUG_INSN_P (con))
29523 continue;
29524 insn1 = PATTERN (con);
29525 if (GET_CODE (insn1) == PARALLEL)
29526 insn1 = XVECEXP (insn1, 0, 0);
29527
29528 if (GET_CODE (insn1) == SET
29529 && GET_CODE (SET_SRC (insn1)) == MULT
29530 && GET_MODE (SET_SRC (insn1)) == SImode)
29531 {
29532 sd_iterator_def sd_it1;
29533 dep_t dep1;
29534 /* Check if there is no other dependee for IMUL. */
29535 index = i;
29536 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29537 {
29538 rtx pro;
29539 pro = DEP_PRO (dep1);
29540 if (!NONDEBUG_INSN_P (pro))
29541 continue;
29542 if (pro != insn)
29543 index = -1;
29544 }
29545 if (index >= 0)
29546 break;
29547 }
29548 }
29549 if (index >= 0)
29550 break;
29551 }
29552 return index;
29553 }
29554
29555 /* Try to find the best candidate at the top of the ready list if two insns
29556 have the same priority - the candidate is best if its dependees were
29557 scheduled earlier.  Applied for Silvermont only.
29558 Return true if the top 2 insns must be interchanged.  */
29559 static bool
29560 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29561 {
29562 rtx_insn *top = ready[n_ready - 1];
29563 rtx_insn *next = ready[n_ready - 2];
29564 rtx set;
29565 sd_iterator_def sd_it;
29566 dep_t dep;
29567 int clock1 = -1;
29568 int clock2 = -1;
29569 #define INSN_TICK(INSN) (HID (INSN)->tick)
29570
29571 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29572 return false;
29573
29574 if (!NONDEBUG_INSN_P (top))
29575 return false;
29576 if (!NONJUMP_INSN_P (top))
29577 return false;
29578 if (!NONDEBUG_INSN_P (next))
29579 return false;
29580 if (!NONJUMP_INSN_P (next))
29581 return false;
29582 set = single_set (top);
29583 if (!set)
29584 return false;
29585 set = single_set (next);
29586 if (!set)
29587 return false;
29588
29589 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29590 {
29591 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29592 return false;
29593 /* Determine the winner more precisely.  */
29594 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29595 {
29596 rtx pro;
29597 pro = DEP_PRO (dep);
29598 if (!NONDEBUG_INSN_P (pro))
29599 continue;
29600 if (INSN_TICK (pro) > clock1)
29601 clock1 = INSN_TICK (pro);
29602 }
29603 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29604 {
29605 rtx pro;
29606 pro = DEP_PRO (dep);
29607 if (!NONDEBUG_INSN_P (pro))
29608 continue;
29609 if (INSN_TICK (pro) > clock2)
29610 clock2 = INSN_TICK (pro);
29611 }
29612
29613 if (clock1 == clock2)
29614 {
29615 /* Determine winner - load must win. */
29616 enum attr_memory memory1, memory2;
29617 memory1 = get_attr_memory (top);
29618 memory2 = get_attr_memory (next);
29619 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29620 return true;
29621 }
29622 return (bool) (clock2 < clock1);
29623 }
29624 return false;
29625 #undef INSN_TICK
29626 }
29627
29628 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29629 Return issue rate.  */
29630 static int
29631 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29632 int *pn_ready, int clock_var)
29633 {
29634 int issue_rate = -1;
29635 int n_ready = *pn_ready;
29636 int i;
29637 rtx_insn *insn;
29638 int index = -1;
29639
29640 /* Set up issue rate. */
29641 issue_rate = ix86_issue_rate ();
29642
29643 /* Do reordering for BONNELL/SILVERMONT only.  */
29644 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29645 return issue_rate;
29646
29647 /* Nothing to do if ready list contains only 1 instruction. */
29648 if (n_ready <= 1)
29649 return issue_rate;
29650
29651 /* Do reordering for the post-reload scheduler only.  */
29652 if (!reload_completed)
29653 return issue_rate;
29654
29655 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29656 {
29657 if (sched_verbose > 1)
29658 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29659 INSN_UID (ready[index]));
29660
29661 /* Put IMUL producer (ready[index]) at the top of ready list. */
29662 insn = ready[index];
29663 for (i = index; i < n_ready - 1; i++)
29664 ready[i] = ready[i + 1];
29665 ready[n_ready - 1] = insn;
29666 return issue_rate;
29667 }
29668
29669 /* Skip selective scheduling since HID is not populated in it. */
29670 if (clock_var != 0
29671 && !sel_sched_p ()
29672 && swap_top_of_ready_list (ready, n_ready))
29673 {
29674 if (sched_verbose > 1)
29675 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29676 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29677 /* Swap 2 top elements of ready list. */
29678 insn = ready[n_ready - 1];
29679 ready[n_ready - 1] = ready[n_ready - 2];
29680 ready[n_ready - 2] = insn;
29681 }
29682 return issue_rate;
29683 }
29684
29685 static bool
29686 ix86_class_likely_spilled_p (reg_class_t);
29687
29688 /* Return true if the lhs of INSN is a HW function argument register; set
29689 *IS_SPILLED to true if it is a likely-spilled HW register.  */
29690 static bool
29691 insn_is_function_arg (rtx insn, bool* is_spilled)
29692 {
29693 rtx dst;
29694
29695 if (!NONDEBUG_INSN_P (insn))
29696 return false;
29697 /* Call instructions are not movable; ignore them.  */
29698 if (CALL_P (insn))
29699 return false;
29700 insn = PATTERN (insn);
29701 if (GET_CODE (insn) == PARALLEL)
29702 insn = XVECEXP (insn, 0, 0);
29703 if (GET_CODE (insn) != SET)
29704 return false;
29705 dst = SET_DEST (insn);
29706 if (REG_P (dst) && HARD_REGISTER_P (dst)
29707 && ix86_function_arg_regno_p (REGNO (dst)))
29708 {
29709 /* Is it likely spilled HW register? */
29710 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29711 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29712 *is_spilled = true;
29713 return true;
29714 }
29715 return false;
29716 }
29717
29718 /* Add output dependencies for a chain of adjacent function-argument moves,
29719 but only if there is a move to a likely-spilled HW register.  Return the
29720 first argument if at least one dependence was added, or NULL otherwise.  */
29721 static rtx_insn *
29722 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29723 {
29724 rtx_insn *insn;
29725 rtx_insn *last = call;
29726 rtx_insn *first_arg = NULL;
29727 bool is_spilled = false;
29728
29729 head = PREV_INSN (head);
29730
29731 /* Find the argument-passing instruction nearest to the call.  */
29732 while (true)
29733 {
29734 last = PREV_INSN (last);
29735 if (last == head)
29736 return NULL;
29737 if (!NONDEBUG_INSN_P (last))
29738 continue;
29739 if (insn_is_function_arg (last, &is_spilled))
29740 break;
29741 return NULL;
29742 }
29743
29744 first_arg = last;
29745 while (true)
29746 {
29747 insn = PREV_INSN (last);
29748 if (!INSN_P (insn))
29749 break;
29750 if (insn == head)
29751 break;
29752 if (!NONDEBUG_INSN_P (insn))
29753 {
29754 last = insn;
29755 continue;
29756 }
29757 if (insn_is_function_arg (insn, &is_spilled))
29758 {
29759 /* Add an output dependence between two function arguments if the chain
29760 of output arguments contains likely-spilled HW registers.  */
29761 if (is_spilled)
29762 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29763 first_arg = last = insn;
29764 }
29765 else
29766 break;
29767 }
29768 if (!is_spilled)
29769 return NULL;
29770 return first_arg;
29771 }
29772
29773 /* Add output or anti dependency from insn to first_arg to restrict its code
29774 motion. */
29775 static void
29776 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29777 {
29778 rtx set;
29779 rtx tmp;
29780
29781 /* Add anti dependencies for bounds stores. */
29782 if (INSN_P (insn)
29783 && GET_CODE (PATTERN (insn)) == PARALLEL
29784 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29785 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29786 {
29787 add_dependence (first_arg, insn, REG_DEP_ANTI);
29788 return;
29789 }
29790
29791 set = single_set (insn);
29792 if (!set)
29793 return;
29794 tmp = SET_DEST (set);
29795 if (REG_P (tmp))
29796 {
29797 /* Add output dependency to the first function argument. */
29798 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29799 return;
29800 }
29801 /* Add anti dependency. */
29802 add_dependence (first_arg, insn, REG_DEP_ANTI);
29803 }
29804
29805 /* Avoid cross-block motion of a function argument by adding a dependency
29806 from the first non-jump instruction in bb.  */
29807 static void
29808 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29809 {
29810 rtx_insn *insn = BB_END (bb);
29811
29812 while (insn)
29813 {
29814 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29815 {
29816 rtx set = single_set (insn);
29817 if (set)
29818 {
29819 avoid_func_arg_motion (arg, insn);
29820 return;
29821 }
29822 }
29823 if (insn == BB_HEAD (bb))
29824 return;
29825 insn = PREV_INSN (insn);
29826 }
29827 }
29828
29829 /* Hook for pre-reload schedule - avoid motion of function arguments
29830 passed in likely spilled HW registers. */
29831 static void
29832 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29833 {
29834 rtx_insn *insn;
29835 rtx_insn *first_arg = NULL;
29836 if (reload_completed)
29837 return;
29838 while (head != tail && DEBUG_INSN_P (head))
29839 head = NEXT_INSN (head);
29840 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29841 if (INSN_P (insn) && CALL_P (insn))
29842 {
29843 first_arg = add_parameter_dependencies (insn, head);
29844 if (first_arg)
29845 {
29846 /* Add a dependee for the first argument to predecessors, but only
29847 if the region contains more than one block.  */
29848 basic_block bb = BLOCK_FOR_INSN (insn);
29849 int rgn = CONTAINING_RGN (bb->index);
29850 int nr_blks = RGN_NR_BLOCKS (rgn);
29851 /* Skip trivial regions and region head blocks that can have
29852 predecessors outside of region. */
29853 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29854 {
29855 edge e;
29856 edge_iterator ei;
29857
29858 /* Regions are SCCs with the exception of selective
29859 scheduling with pipelining of outer blocks enabled.
29860 So also check that immediate predecessors of a non-head
29861 block are in the same region. */
29862 FOR_EACH_EDGE (e, ei, bb->preds)
29863 {
29864 /* Avoid creating loop-carried dependencies by using the
29865 topological ordering of the region.  */
29866 if (rgn == CONTAINING_RGN (e->src->index)
29867 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29868 add_dependee_for_func_arg (first_arg, e->src);
29869 }
29870 }
29871 insn = first_arg;
29872 if (insn == head)
29873 break;
29874 }
29875 }
29876 else if (first_arg)
29877 avoid_func_arg_motion (first_arg, insn);
29878 }
29879
29880 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29881 HW registers to maximum, to schedule them as soon as possible.  These are
29882 moves from function argument registers at the top of the function entry
29883 and moves from function return value registers after call. */
29884 static int
29885 ix86_adjust_priority (rtx_insn *insn, int priority)
29886 {
29887 rtx set;
29888
29889 if (reload_completed)
29890 return priority;
29891
29892 if (!NONDEBUG_INSN_P (insn))
29893 return priority;
29894
29895 set = single_set (insn);
29896 if (set)
29897 {
29898 rtx tmp = SET_SRC (set);
29899 if (REG_P (tmp)
29900 && HARD_REGISTER_P (tmp)
29901 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29902 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29903 return current_sched_info->sched_max_insns_priority;
29904 }
29905
29906 return priority;
29907 }
29908
29909 /* Model the decoder of Core 2/i7.
29910 The hooks below, used for multipass scheduling (see haifa-sched.c:max_issue),
29911 track the instruction fetch block boundaries and make sure that long
29912 (9+ byte) instructions are assigned to D0.  */
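/* The model assumes one complex decoder that can handle any instruction and
   simple decoders limited to short instructions, all sharing a 16-byte fetch
   window; the filtering below masks out insns that would not fit.  */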
29913
29914 /* Maximum length of an insn that can be handled by
29915 a secondary decoder unit. '8' for Core 2/i7. */
29916 static int core2i7_secondary_decoder_max_insn_size;
29917
29918 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29919 '16' for Core 2/i7. */
29920 static int core2i7_ifetch_block_size;
29921
29922 /* Maximum number of instructions decoder can handle per cycle.
29923 '6' for Core 2/i7. */
29924 static int core2i7_ifetch_block_max_insns;
29925
29926 typedef struct ix86_first_cycle_multipass_data_ *
29927 ix86_first_cycle_multipass_data_t;
29928 typedef const struct ix86_first_cycle_multipass_data_ *
29929 const_ix86_first_cycle_multipass_data_t;
29930
29931 /* A variable to store target state across calls to max_issue within
29932 one cycle. */
29933 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
29934 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
29935
29936 /* Initialize DATA. */
29937 static void
29938 core2i7_first_cycle_multipass_init (void *_data)
29939 {
29940 ix86_first_cycle_multipass_data_t data
29941 = (ix86_first_cycle_multipass_data_t) _data;
29942
29943 data->ifetch_block_len = 0;
29944 data->ifetch_block_n_insns = 0;
29945 data->ready_try_change = NULL;
29946 data->ready_try_change_size = 0;
29947 }
29948
29949 /* Advancing the cycle; reset ifetch block counts. */
29950 static void
29951 core2i7_dfa_post_advance_cycle (void)
29952 {
29953 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
29954
29955 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29956
29957 data->ifetch_block_len = 0;
29958 data->ifetch_block_n_insns = 0;
29959 }
29960
29961 static int min_insn_size (rtx_insn *);
29962
29963 /* Filter out insns from ready_try that the core will not be able to issue
29964 on current cycle due to decoder. */
29965 static void
29966 core2i7_first_cycle_multipass_filter_ready_try
29967 (const_ix86_first_cycle_multipass_data_t data,
29968 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
29969 {
29970 while (n_ready--)
29971 {
29972 rtx_insn *insn;
29973 int insn_size;
29974
29975 if (ready_try[n_ready])
29976 continue;
29977
29978 insn = get_ready_element (n_ready);
29979 insn_size = min_insn_size (insn);
29980
29981 if (/* If this is too long an insn for a secondary decoder ... */
29982 (!first_cycle_insn_p
29983 && insn_size > core2i7_secondary_decoder_max_insn_size)
29984 /* ... or it would not fit into the ifetch block ... */
29985 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
29986 /* ... or the decoder is full already ... */
29987 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
29988 /* ... mask the insn out. */
29989 {
29990 ready_try[n_ready] = 1;
29991
29992 if (data->ready_try_change)
29993 bitmap_set_bit (data->ready_try_change, n_ready);
29994 }
29995 }
29996 }
29997
29998 /* Prepare for a new round of multipass lookahead scheduling. */
29999 static void
30000 core2i7_first_cycle_multipass_begin (void *_data,
30001 signed char *ready_try, int n_ready,
30002 bool first_cycle_insn_p)
30003 {
30004 ix86_first_cycle_multipass_data_t data
30005 = (ix86_first_cycle_multipass_data_t) _data;
30006 const_ix86_first_cycle_multipass_data_t prev_data
30007 = ix86_first_cycle_multipass_data;
30008
30009 /* Restore the state from the end of the previous round. */
30010 data->ifetch_block_len = prev_data->ifetch_block_len;
30011 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
30012
30013 /* Filter instructions that cannot be issued on current cycle due to
30014 decoder restrictions. */
30015 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30016 first_cycle_insn_p);
30017 }
30018
30019 /* INSN is being issued in current solution. Account for its impact on
30020 the decoder model. */
30021 static void
30022 core2i7_first_cycle_multipass_issue (void *_data,
30023 signed char *ready_try, int n_ready,
30024 rtx_insn *insn, const void *_prev_data)
30025 {
30026 ix86_first_cycle_multipass_data_t data
30027 = (ix86_first_cycle_multipass_data_t) _data;
30028 const_ix86_first_cycle_multipass_data_t prev_data
30029 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
30030
30031 int insn_size = min_insn_size (insn);
30032
30033 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
30034 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
30035 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
30036 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30037
30038 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
30039 if (!data->ready_try_change)
30040 {
30041 data->ready_try_change = sbitmap_alloc (n_ready);
30042 data->ready_try_change_size = n_ready;
30043 }
30044 else if (data->ready_try_change_size < n_ready)
30045 {
30046 data->ready_try_change = sbitmap_resize (data->ready_try_change,
30047 n_ready, 0);
30048 data->ready_try_change_size = n_ready;
30049 }
30050 bitmap_clear (data->ready_try_change);
30051
30052 /* Filter out insns from ready_try that the core will not be able to issue
30053 on current cycle due to decoder. */
30054 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30055 false);
30056 }
30057
30058 /* Revert the effect on ready_try. */
30059 static void
30060 core2i7_first_cycle_multipass_backtrack (const void *_data,
30061 signed char *ready_try,
30062 int n_ready ATTRIBUTE_UNUSED)
30063 {
30064 const_ix86_first_cycle_multipass_data_t data
30065 = (const_ix86_first_cycle_multipass_data_t) _data;
30066 unsigned int i = 0;
30067 sbitmap_iterator sbi;
30068
30069 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
30070 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
30071 {
30072 ready_try[i] = 0;
30073 }
30074 }
30075
30076 /* Save the result of multipass lookahead scheduling for the next round. */
30077 static void
30078 core2i7_first_cycle_multipass_end (const void *_data)
30079 {
30080 const_ix86_first_cycle_multipass_data_t data
30081 = (const_ix86_first_cycle_multipass_data_t) _data;
30082 ix86_first_cycle_multipass_data_t next_data
30083 = ix86_first_cycle_multipass_data;
30084
30085 if (data != NULL)
30086 {
30087 next_data->ifetch_block_len = data->ifetch_block_len;
30088 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
30089 }
30090 }
30091
30092 /* Deallocate target data. */
30093 static void
30094 core2i7_first_cycle_multipass_fini (void *_data)
30095 {
30096 ix86_first_cycle_multipass_data_t data
30097 = (ix86_first_cycle_multipass_data_t) _data;
30098
30099 if (data->ready_try_change)
30100 {
30101 sbitmap_free (data->ready_try_change);
30102 data->ready_try_change = NULL;
30103 data->ready_try_change_size = 0;
30104 }
30105 }
30106
30107 /* Prepare for scheduling pass. */
30108 static void
30109 ix86_sched_init_global (FILE *, int, int)
30110 {
30111 /* Install scheduling hooks for current CPU. Some of these hooks are used
30112 in time-critical parts of the scheduler, so we only set them up when
30113 they are actually used. */
30114 switch (ix86_tune)
30115 {
30116 case PROCESSOR_CORE2:
30117 case PROCESSOR_NEHALEM:
30118 case PROCESSOR_SANDYBRIDGE:
30119 case PROCESSOR_HASWELL:
30120 /* Do not perform multipass scheduling for pre-reload schedule
30121 to save compile time. */
30122 if (reload_completed)
30123 {
30124 targetm.sched.dfa_post_advance_cycle
30125 = core2i7_dfa_post_advance_cycle;
30126 targetm.sched.first_cycle_multipass_init
30127 = core2i7_first_cycle_multipass_init;
30128 targetm.sched.first_cycle_multipass_begin
30129 = core2i7_first_cycle_multipass_begin;
30130 targetm.sched.first_cycle_multipass_issue
30131 = core2i7_first_cycle_multipass_issue;
30132 targetm.sched.first_cycle_multipass_backtrack
30133 = core2i7_first_cycle_multipass_backtrack;
30134 targetm.sched.first_cycle_multipass_end
30135 = core2i7_first_cycle_multipass_end;
30136 targetm.sched.first_cycle_multipass_fini
30137 = core2i7_first_cycle_multipass_fini;
30138
30139 /* Set decoder parameters. */
30140 core2i7_secondary_decoder_max_insn_size = 8;
30141 core2i7_ifetch_block_size = 16;
30142 core2i7_ifetch_block_max_insns = 6;
30143 break;
30144 }
30145 /* Fall through. */
30146 default:
30147 targetm.sched.dfa_post_advance_cycle = NULL;
30148 targetm.sched.first_cycle_multipass_init = NULL;
30149 targetm.sched.first_cycle_multipass_begin = NULL;
30150 targetm.sched.first_cycle_multipass_issue = NULL;
30151 targetm.sched.first_cycle_multipass_backtrack = NULL;
30152 targetm.sched.first_cycle_multipass_end = NULL;
30153 targetm.sched.first_cycle_multipass_fini = NULL;
30154 break;
30155 }
30156 }
30157
30158 \f
30159 /* Compute the alignment given to a constant that is being placed in memory.
30160 EXP is the constant and ALIGN is the alignment that the object would
30161 ordinarily have.
30162 The value of this function is used instead of that alignment to align
30163 the object. */
30164
30165 int
30166 ix86_constant_alignment (tree exp, int align)
30167 {
30168 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30169 || TREE_CODE (exp) == INTEGER_CST)
30170 {
30171 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30172 return 64;
30173 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30174 return 128;
30175 }
30176 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30177 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30178 return BITS_PER_WORD;
30179
30180 return align;
30181 }
30182
30183 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30184 the data type, and ALIGN is the alignment that the object would
30185 ordinarily have. */
30186
30187 static int
30188 iamcu_alignment (tree type, int align)
30189 {
30190 enum machine_mode mode;
30191
30192 if (align < 32 || TYPE_USER_ALIGN (type))
30193 return align;
30194
30195 /* Intel MCU psABI specifies that scalar types larger than 4 bytes are
30196 aligned to 4 bytes.  */
30197 mode = TYPE_MODE (strip_array_types (type));
30198 switch (GET_MODE_CLASS (mode))
30199 {
30200 case MODE_INT:
30201 case MODE_COMPLEX_INT:
30202 case MODE_COMPLEX_FLOAT:
30203 case MODE_FLOAT:
30204 case MODE_DECIMAL_FLOAT:
30205 return 32;
30206 default:
30207 return align;
30208 }
30209 }
30210
30211 /* Compute the alignment for a static variable.
30212 TYPE is the data type, and ALIGN is the alignment that
30213 the object would ordinarily have. The value of this function is used
30214 instead of that alignment to align the object. */
30215
30216 int
30217 ix86_data_alignment (tree type, int align, bool opt)
30218 {
30219 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30220 for symbols from other compilation units or symbols that don't need
30221 to bind locally. In order to preserve some ABI compatibility with
30222 those compilers, ensure we don't decrease alignment from what we
30223 used to assume. */
30224
30225 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30226
30227 /* A data structure, equal or greater than the size of a cache line
30228 (64 bytes in the Pentium 4 and other recent Intel processors, including
30229 processors based on Intel Core microarchitecture) should be aligned
30230 so that its base address is a multiple of a cache line size. */
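  /* Alignments here are expressed in bits, hence the factor of 8 applied to
     the byte-sized prefetch_block value below.  */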
30231
30232 int max_align
30233 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30234
30235 if (max_align < BITS_PER_WORD)
30236 max_align = BITS_PER_WORD;
30237
30238 switch (ix86_align_data_type)
30239 {
30240 case ix86_align_data_type_abi: opt = false; break;
30241 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30242 case ix86_align_data_type_cacheline: break;
30243 }
30244
30245 if (TARGET_IAMCU)
30246 align = iamcu_alignment (type, align);
30247
30248 if (opt
30249 && AGGREGATE_TYPE_P (type)
30250 && TYPE_SIZE (type)
30251 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30252 {
30253 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30254 && align < max_align_compat)
30255 align = max_align_compat;
30256 if (wi::geu_p (TYPE_SIZE (type), max_align)
30257 && align < max_align)
30258 align = max_align;
30259 }
30260
30261 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30262 to a 16-byte boundary.  */
30263 if (TARGET_64BIT)
30264 {
30265 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30266 && TYPE_SIZE (type)
30267 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30268 && wi::geu_p (TYPE_SIZE (type), 128)
30269 && align < 128)
30270 return 128;
30271 }
30272
30273 if (!opt)
30274 return align;
30275
30276 if (TREE_CODE (type) == ARRAY_TYPE)
30277 {
30278 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30279 return 64;
30280 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30281 return 128;
30282 }
30283 else if (TREE_CODE (type) == COMPLEX_TYPE)
30284 {
30285
30286 if (TYPE_MODE (type) == DCmode && align < 64)
30287 return 64;
30288 if ((TYPE_MODE (type) == XCmode
30289 || TYPE_MODE (type) == TCmode) && align < 128)
30290 return 128;
30291 }
30292 else if ((TREE_CODE (type) == RECORD_TYPE
30293 || TREE_CODE (type) == UNION_TYPE
30294 || TREE_CODE (type) == QUAL_UNION_TYPE)
30295 && TYPE_FIELDS (type))
30296 {
30297 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30298 return 64;
30299 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30300 return 128;
30301 }
30302 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30303 || TREE_CODE (type) == INTEGER_TYPE)
30304 {
30305 if (TYPE_MODE (type) == DFmode && align < 64)
30306 return 64;
30307 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30308 return 128;
30309 }
30310
30311 return align;
30312 }
30313
30314 /* Compute the alignment for a local variable or a stack slot. EXP is
30315 the data type or decl itself, MODE is the widest mode available and
30316 ALIGN is the alignment that the object would ordinarily have. The
30317 value of this macro is used instead of that alignment to align the
30318 object. */
30319
30320 unsigned int
30321 ix86_local_alignment (tree exp, machine_mode mode,
30322 unsigned int align)
30323 {
30324 tree type, decl;
30325
30326 if (exp && DECL_P (exp))
30327 {
30328 type = TREE_TYPE (exp);
30329 decl = exp;
30330 }
30331 else
30332 {
30333 type = exp;
30334 decl = NULL;
30335 }
30336
30337 /* Don't do dynamic stack realignment for long long objects with
30338 -mpreferred-stack-boundary=2. */
30339 if (!TARGET_64BIT
30340 && align == 64
30341 && ix86_preferred_stack_boundary < 64
30342 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30343 && (!type || !TYPE_USER_ALIGN (type))
30344 && (!decl || !DECL_USER_ALIGN (decl)))
30345 align = 32;
30346
30347 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30348 register in MODE. We will return the largest alignment of XF
30349 and DF. */
30350 if (!type)
30351 {
30352 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30353 align = GET_MODE_ALIGNMENT (DFmode);
30354 return align;
30355 }
30356
30357 /* Don't increase alignment for Intel MCU psABI. */
30358 if (TARGET_IAMCU)
30359 return align;
30360
30361 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30362 to a 16-byte boundary.  The exact wording is:
30363
30364 An array uses the same alignment as its elements, except that a local or
30365 global array variable of length at least 16 bytes or
30366 a C99 variable-length array variable always has alignment of at least 16 bytes.
30367
30368 This was added to allow use of aligned SSE instructions on arrays.  This
30369 rule is meant for static storage (where the compiler cannot do the analysis
30370 by itself).  We follow it for automatic variables only when convenient.
30371 We fully control everything in the function being compiled, and functions
30372 from other units cannot rely on the alignment.
30373
30374 Exclude the va_list type.  It is the common case of a local array where
30375 we cannot benefit from the alignment.
30376
30377 TODO: Probably one should optimize for size only when var is not escaping. */
30378 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30379 && TARGET_SSE)
30380 {
30381 if (AGGREGATE_TYPE_P (type)
30382 && (va_list_type_node == NULL_TREE
30383 || (TYPE_MAIN_VARIANT (type)
30384 != TYPE_MAIN_VARIANT (va_list_type_node)))
30385 && TYPE_SIZE (type)
30386 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30387 && wi::geu_p (TYPE_SIZE (type), 16)
30388 && align < 128)
30389 return 128;
30390 }
30391 if (TREE_CODE (type) == ARRAY_TYPE)
30392 {
30393 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30394 return 64;
30395 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30396 return 128;
30397 }
30398 else if (TREE_CODE (type) == COMPLEX_TYPE)
30399 {
30400 if (TYPE_MODE (type) == DCmode && align < 64)
30401 return 64;
30402 if ((TYPE_MODE (type) == XCmode
30403 || TYPE_MODE (type) == TCmode) && align < 128)
30404 return 128;
30405 }
30406 else if ((TREE_CODE (type) == RECORD_TYPE
30407 || TREE_CODE (type) == UNION_TYPE
30408 || TREE_CODE (type) == QUAL_UNION_TYPE)
30409 && TYPE_FIELDS (type))
30410 {
30411 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30412 return 64;
30413 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30414 return 128;
30415 }
30416 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30417 || TREE_CODE (type) == INTEGER_TYPE)
30418 {
30419
30420 if (TYPE_MODE (type) == DFmode && align < 64)
30421 return 64;
30422 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30423 return 128;
30424 }
30425 return align;
30426 }
30427
30428 /* Compute the minimum required alignment for dynamic stack realignment
30429 purposes for a local variable, parameter or a stack slot. EXP is
30430 the data type or decl itself, MODE is its mode and ALIGN is the
30431 alignment that the object would ordinarily have. */
30432
30433 unsigned int
30434 ix86_minimum_alignment (tree exp, machine_mode mode,
30435 unsigned int align)
30436 {
30437 tree type, decl;
30438
30439 if (exp && DECL_P (exp))
30440 {
30441 type = TREE_TYPE (exp);
30442 decl = exp;
30443 }
30444 else
30445 {
30446 type = exp;
30447 decl = NULL;
30448 }
30449
30450 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30451 return align;
30452
30453 /* Don't do dynamic stack realignment for long long objects with
30454 -mpreferred-stack-boundary=2. */
30455 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30456 && (!type || !TYPE_USER_ALIGN (type))
30457 && (!decl || !DECL_USER_ALIGN (decl)))
30458 {
30459 gcc_checking_assert (!TARGET_STV);
30460 return 32;
30461 }
30462
30463 return align;
30464 }
30465 \f
30466 /* Find a location for the static chain incoming to a nested function.
30467 This is a register, unless all free registers are used by arguments. */
30468
30469 static rtx
30470 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30471 {
30472 unsigned regno;
30473
30474 /* While this function won't be called by the middle-end when a static
30475 chain isn't needed, it's also used throughout the backend so it's
30476 easiest to keep this check centralized. */
30477 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30478 return NULL;
30479
30480 if (TARGET_64BIT)
30481 {
30482 /* We always use R10 in 64-bit mode. */
30483 regno = R10_REG;
30484 }
30485 else
30486 {
30487 const_tree fntype, fndecl;
30488 unsigned int ccvt;
30489
30490 /* By default in 32-bit mode we use ECX to pass the static chain. */
30491 regno = CX_REG;
30492
30493 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30494 {
30495 fntype = TREE_TYPE (fndecl_or_type);
30496 fndecl = fndecl_or_type;
30497 }
30498 else
30499 {
30500 fntype = fndecl_or_type;
30501 fndecl = NULL;
30502 }
30503
30504 ccvt = ix86_get_callcvt (fntype);
30505 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30506 {
30507 /* Fastcall functions use ecx/edx for arguments, which leaves
30508 us with EAX for the static chain.
30509 Thiscall functions use ecx for arguments, which also
30510 leaves us with EAX for the static chain. */
30511 regno = AX_REG;
30512 }
30513 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30514 {
30515 /* Thiscall functions use ecx for arguments, which leaves
30516 us with EAX and EDX for the static chain.
30517 We use EAX for ABI compatibility. */
30518 regno = AX_REG;
30519 }
30520 else if (ix86_function_regparm (fntype, fndecl) == 3)
30521 {
30522 /* For regparm 3, we have no free call-clobbered registers in
30523 which to store the static chain. In order to implement this,
30524 we have the trampoline push the static chain to the stack.
30525 However, we can't push a value below the return address when
30526 we call the nested function directly, so we have to use an
30527 alternate entry point. For this we use ESI, and have the
30528 alternate entry point push ESI, so that things appear the
30529 same once we're executing the nested function. */
30530 if (incoming_p)
30531 {
30532 if (fndecl == current_function_decl)
30533 ix86_static_chain_on_stack = true;
30534 return gen_frame_mem (SImode,
30535 plus_constant (Pmode,
30536 arg_pointer_rtx, -8));
30537 }
30538 regno = SI_REG;
30539 }
30540 }
30541
30542 return gen_rtx_REG (Pmode, regno);
30543 }
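/* For illustration (a sketch of the register choices made above): for a
   GNU C nested function

       void outer (int n)
       {
         int inner (int i) { return i + n; }
         ...
       }

   INNER needs a static chain to reach N; that pointer arrives in R10 for
   64-bit code, in ECX for the default 32-bit conventions, in EAX for
   fastcall and thiscall, and in a stack slot (via the ESI alternate entry
   point) for regparm(3) functions.  */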
30544
30545 /* Emit RTL insns to initialize the variable parts of a trampoline.
30546 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30547 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30548 to be passed to the target function. */
30549
30550 static void
30551 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30552 {
30553 rtx mem, fnaddr;
30554 int opcode;
30555 int offset = 0;
30556
30557 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30558
30559 if (TARGET_64BIT)
30560 {
30561 int size;
30562
30563 /* Load the function address to r11. Try to load address using
30564 the shorter movl instead of movabs. We may want to support
30565 movq for kernel mode, but kernel does not use trampolines at
30566 the moment. FNADDR is a 32bit address and may not be in
30567 DImode when ptr_mode == SImode. Always use movl in this
30568 case. */
30569 if (ptr_mode == SImode
30570 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30571 {
30572 fnaddr = copy_addr_to_reg (fnaddr);
30573
30574 mem = adjust_address (m_tramp, HImode, offset);
30575 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30576
30577 mem = adjust_address (m_tramp, SImode, offset + 2);
30578 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30579 offset += 6;
30580 }
30581 else
30582 {
30583 mem = adjust_address (m_tramp, HImode, offset);
30584 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30585
30586 mem = adjust_address (m_tramp, DImode, offset + 2);
30587 emit_move_insn (mem, fnaddr);
30588 offset += 10;
30589 }
30590
30591 /* Load static chain using movabs to r10. Use the shorter movl
30592 instead of movabs when ptr_mode == SImode. */
30593 if (ptr_mode == SImode)
30594 {
30595 opcode = 0xba41;
30596 size = 6;
30597 }
30598 else
30599 {
30600 opcode = 0xba49;
30601 size = 10;
30602 }
30603
30604 mem = adjust_address (m_tramp, HImode, offset);
30605 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30606
30607 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30608 emit_move_insn (mem, chain_value);
30609 offset += size;
30610
30611 /* Jump to r11; the last (unused) byte is a nop, only there to
30612 pad the write out to a single 32-bit store. */
30613 mem = adjust_address (m_tramp, SImode, offset);
30614 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30615 offset += 4;
30616 }
30617 else
30618 {
30619 rtx disp, chain;
30620
30621 /* Depending on the static chain location, either load a register
30622 with a constant, or push the constant to the stack. All of the
30623 instructions are the same size. */
30624 chain = ix86_static_chain (fndecl, true);
30625 if (REG_P (chain))
30626 {
30627 switch (REGNO (chain))
30628 {
30629 case AX_REG:
30630 opcode = 0xb8; break;
30631 case CX_REG:
30632 opcode = 0xb9; break;
30633 default:
30634 gcc_unreachable ();
30635 }
30636 }
30637 else
30638 opcode = 0x68;
30639
30640 mem = adjust_address (m_tramp, QImode, offset);
30641 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30642
30643 mem = adjust_address (m_tramp, SImode, offset + 1);
30644 emit_move_insn (mem, chain_value);
30645 offset += 5;
30646
30647 mem = adjust_address (m_tramp, QImode, offset);
30648 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30649
30650 mem = adjust_address (m_tramp, SImode, offset + 1);
30651
30652 /* Compute offset from the end of the jmp to the target function.
30653 In the case in which the trampoline stores the static chain on
30654 the stack, we need to skip the first insn which pushes the
30655 (call-saved) register static chain; this push is 1 byte. */
30656 offset += 5;
30657 disp = expand_binop (SImode, sub_optab, fnaddr,
30658 plus_constant (Pmode, XEXP (m_tramp, 0),
30659 offset - (MEM_P (chain) ? 1 : 0)),
30660 NULL_RTX, 1, OPTAB_DIRECT);
30661 emit_move_insn (mem, disp);
30662 }
30663
30664 gcc_assert (offset <= TRAMPOLINE_SIZE);
30665
30666 #ifdef HAVE_ENABLE_EXECUTE_STACK
30667 #ifdef CHECK_EXECUTE_STACK_ENABLED
30668 if (CHECK_EXECUTE_STACK_ENABLED)
30669 #endif
30670 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30671 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30672 #endif
30673 }
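/* For illustration (a sketch of the 64-bit byte sequence emitted above when
   the target address needs a full 64-bit immediate):

       49 bb <8-byte fnaddr>      movabs $fnaddr, %r11
       49 ba <8-byte chain>       movabs $chain,  %r10
       49 ff e3                   jmp    *%r11
       90                         nop (pads the final 32-bit store)

   When the address fits in 32 bits (or ptr_mode == SImode), the shorter
   41 bb / 41 ba movl encodings are used instead, as handled above.  */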
30674 \f
30675 /* The following file contains several enumerations and data structures
30676 built from the definitions in i386-builtin-types.def. */
30677
30678 #include "i386-builtin-types.inc"
30679
30680 /* Table for the ix86 builtin non-function types. */
30681 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30682
30683 /* Retrieve an element from the above table, building some of
30684 the types lazily. */
30685
30686 static tree
30687 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30688 {
30689 unsigned int index;
30690 tree type, itype;
30691
30692 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30693
30694 type = ix86_builtin_type_tab[(int) tcode];
30695 if (type != NULL)
30696 return type;
30697
30698 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30699 if (tcode <= IX86_BT_LAST_VECT)
30700 {
30701 machine_mode mode;
30702
30703 index = tcode - IX86_BT_LAST_PRIM - 1;
30704 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30705 mode = ix86_builtin_type_vect_mode[index];
30706
30707 type = build_vector_type_for_mode (itype, mode);
30708 }
30709 else
30710 {
30711 int quals;
30712
30713 index = tcode - IX86_BT_LAST_VECT - 1;
30714 if (tcode <= IX86_BT_LAST_PTR)
30715 quals = TYPE_UNQUALIFIED;
30716 else
30717 quals = TYPE_QUAL_CONST;
30718
30719 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30720 if (quals != TYPE_UNQUALIFIED)
30721 itype = build_qualified_type (itype, quals);
30722
30723 type = build_pointer_type (itype);
30724 }
30725
30726 ix86_builtin_type_tab[(int) tcode] = type;
30727 return type;
30728 }
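/* For illustration (a sketch; the enumerator names come from the generated
   i386-builtin-types.inc): a vector code such as IX86_BT_V4SF is built here
   on first use, roughly as

       build_vector_type_for_mode (float_type_node, V4SFmode)

   and cached in ix86_builtin_type_tab, while a pointer code resolves its
   pointee recursively and adds the const qualifier when the code lies past
   IX86_BT_LAST_PTR.  */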
30729
30730 /* Table for the ix86 builtin function types. */
30731 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30732
30733 /* Retrieve an element from the above table, building some of
30734 the types lazily. */
30735
30736 static tree
30737 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30738 {
30739 tree type;
30740
30741 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30742
30743 type = ix86_builtin_func_type_tab[(int) tcode];
30744 if (type != NULL)
30745 return type;
30746
30747 if (tcode <= IX86_BT_LAST_FUNC)
30748 {
30749 unsigned start = ix86_builtin_func_start[(int) tcode];
30750 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30751 tree rtype, atype, args = void_list_node;
30752 unsigned i;
30753
30754 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30755 for (i = after - 1; i > start; --i)
30756 {
30757 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30758 args = tree_cons (NULL, atype, args);
30759 }
30760
30761 type = build_function_type (rtype, args);
30762 }
30763 else
30764 {
30765 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30766 enum ix86_builtin_func_type icode;
30767
30768 icode = ix86_builtin_func_alias_base[index];
30769 type = ix86_get_builtin_func_type (icode);
30770 }
30771
30772 ix86_builtin_func_type_tab[(int) tcode] = type;
30773 return type;
30774 }
30775
30776
30777 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30778 bdesc_* arrays below should come first, then builtins for each bdesc_*
30779 array in ascending order, so that we can use direct array accesses. */
30780 enum ix86_builtins
30781 {
30782 IX86_BUILTIN_MASKMOVQ,
30783 IX86_BUILTIN_LDMXCSR,
30784 IX86_BUILTIN_STMXCSR,
30785 IX86_BUILTIN_MASKMOVDQU,
30786 IX86_BUILTIN_PSLLDQ128,
30787 IX86_BUILTIN_CLFLUSH,
30788 IX86_BUILTIN_MONITOR,
30789 IX86_BUILTIN_MWAIT,
30790 IX86_BUILTIN_CLZERO,
30791 IX86_BUILTIN_VEC_INIT_V2SI,
30792 IX86_BUILTIN_VEC_INIT_V4HI,
30793 IX86_BUILTIN_VEC_INIT_V8QI,
30794 IX86_BUILTIN_VEC_EXT_V2DF,
30795 IX86_BUILTIN_VEC_EXT_V2DI,
30796 IX86_BUILTIN_VEC_EXT_V4SF,
30797 IX86_BUILTIN_VEC_EXT_V4SI,
30798 IX86_BUILTIN_VEC_EXT_V8HI,
30799 IX86_BUILTIN_VEC_EXT_V2SI,
30800 IX86_BUILTIN_VEC_EXT_V4HI,
30801 IX86_BUILTIN_VEC_EXT_V16QI,
30802 IX86_BUILTIN_VEC_SET_V2DI,
30803 IX86_BUILTIN_VEC_SET_V4SF,
30804 IX86_BUILTIN_VEC_SET_V4SI,
30805 IX86_BUILTIN_VEC_SET_V8HI,
30806 IX86_BUILTIN_VEC_SET_V4HI,
30807 IX86_BUILTIN_VEC_SET_V16QI,
30808 IX86_BUILTIN_GATHERSIV2DF,
30809 IX86_BUILTIN_GATHERSIV4DF,
30810 IX86_BUILTIN_GATHERDIV2DF,
30811 IX86_BUILTIN_GATHERDIV4DF,
30812 IX86_BUILTIN_GATHERSIV4SF,
30813 IX86_BUILTIN_GATHERSIV8SF,
30814 IX86_BUILTIN_GATHERDIV4SF,
30815 IX86_BUILTIN_GATHERDIV8SF,
30816 IX86_BUILTIN_GATHERSIV2DI,
30817 IX86_BUILTIN_GATHERSIV4DI,
30818 IX86_BUILTIN_GATHERDIV2DI,
30819 IX86_BUILTIN_GATHERDIV4DI,
30820 IX86_BUILTIN_GATHERSIV4SI,
30821 IX86_BUILTIN_GATHERSIV8SI,
30822 IX86_BUILTIN_GATHERDIV4SI,
30823 IX86_BUILTIN_GATHERDIV8SI,
30824 IX86_BUILTIN_VFMSUBSD3_MASK3,
30825 IX86_BUILTIN_VFMSUBSS3_MASK3,
30826 IX86_BUILTIN_GATHER3SIV8SF,
30827 IX86_BUILTIN_GATHER3SIV4SF,
30828 IX86_BUILTIN_GATHER3SIV4DF,
30829 IX86_BUILTIN_GATHER3SIV2DF,
30830 IX86_BUILTIN_GATHER3DIV8SF,
30831 IX86_BUILTIN_GATHER3DIV4SF,
30832 IX86_BUILTIN_GATHER3DIV4DF,
30833 IX86_BUILTIN_GATHER3DIV2DF,
30834 IX86_BUILTIN_GATHER3SIV8SI,
30835 IX86_BUILTIN_GATHER3SIV4SI,
30836 IX86_BUILTIN_GATHER3SIV4DI,
30837 IX86_BUILTIN_GATHER3SIV2DI,
30838 IX86_BUILTIN_GATHER3DIV8SI,
30839 IX86_BUILTIN_GATHER3DIV4SI,
30840 IX86_BUILTIN_GATHER3DIV4DI,
30841 IX86_BUILTIN_GATHER3DIV2DI,
30842 IX86_BUILTIN_SCATTERSIV8SF,
30843 IX86_BUILTIN_SCATTERSIV4SF,
30844 IX86_BUILTIN_SCATTERSIV4DF,
30845 IX86_BUILTIN_SCATTERSIV2DF,
30846 IX86_BUILTIN_SCATTERDIV8SF,
30847 IX86_BUILTIN_SCATTERDIV4SF,
30848 IX86_BUILTIN_SCATTERDIV4DF,
30849 IX86_BUILTIN_SCATTERDIV2DF,
30850 IX86_BUILTIN_SCATTERSIV8SI,
30851 IX86_BUILTIN_SCATTERSIV4SI,
30852 IX86_BUILTIN_SCATTERSIV4DI,
30853 IX86_BUILTIN_SCATTERSIV2DI,
30854 IX86_BUILTIN_SCATTERDIV8SI,
30855 IX86_BUILTIN_SCATTERDIV4SI,
30856 IX86_BUILTIN_SCATTERDIV4DI,
30857 IX86_BUILTIN_SCATTERDIV2DI,
30858 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30859 where all operands are 32-byte or 64-byte wide respectively. */
30860 IX86_BUILTIN_GATHERALTSIV4DF,
30861 IX86_BUILTIN_GATHERALTDIV8SF,
30862 IX86_BUILTIN_GATHERALTSIV4DI,
30863 IX86_BUILTIN_GATHERALTDIV8SI,
30864 IX86_BUILTIN_GATHER3ALTDIV16SF,
30865 IX86_BUILTIN_GATHER3ALTDIV16SI,
30866 IX86_BUILTIN_GATHER3ALTSIV4DF,
30867 IX86_BUILTIN_GATHER3ALTDIV8SF,
30868 IX86_BUILTIN_GATHER3ALTSIV4DI,
30869 IX86_BUILTIN_GATHER3ALTDIV8SI,
30870 IX86_BUILTIN_GATHER3ALTSIV8DF,
30871 IX86_BUILTIN_GATHER3ALTSIV8DI,
30872 IX86_BUILTIN_GATHER3DIV16SF,
30873 IX86_BUILTIN_GATHER3DIV16SI,
30874 IX86_BUILTIN_GATHER3DIV8DF,
30875 IX86_BUILTIN_GATHER3DIV8DI,
30876 IX86_BUILTIN_GATHER3SIV16SF,
30877 IX86_BUILTIN_GATHER3SIV16SI,
30878 IX86_BUILTIN_GATHER3SIV8DF,
30879 IX86_BUILTIN_GATHER3SIV8DI,
30880 IX86_BUILTIN_SCATTERALTSIV8DF,
30881 IX86_BUILTIN_SCATTERALTDIV16SF,
30882 IX86_BUILTIN_SCATTERALTSIV8DI,
30883 IX86_BUILTIN_SCATTERALTDIV16SI,
30884 IX86_BUILTIN_SCATTERDIV16SF,
30885 IX86_BUILTIN_SCATTERDIV16SI,
30886 IX86_BUILTIN_SCATTERDIV8DF,
30887 IX86_BUILTIN_SCATTERDIV8DI,
30888 IX86_BUILTIN_SCATTERSIV16SF,
30889 IX86_BUILTIN_SCATTERSIV16SI,
30890 IX86_BUILTIN_SCATTERSIV8DF,
30891 IX86_BUILTIN_SCATTERSIV8DI,
30892 IX86_BUILTIN_GATHERPFQPD,
30893 IX86_BUILTIN_GATHERPFDPS,
30894 IX86_BUILTIN_GATHERPFDPD,
30895 IX86_BUILTIN_GATHERPFQPS,
30896 IX86_BUILTIN_SCATTERPFDPD,
30897 IX86_BUILTIN_SCATTERPFDPS,
30898 IX86_BUILTIN_SCATTERPFQPD,
30899 IX86_BUILTIN_SCATTERPFQPS,
30900 IX86_BUILTIN_CLWB,
30901 IX86_BUILTIN_CLFLUSHOPT,
30902 IX86_BUILTIN_INFQ,
30903 IX86_BUILTIN_HUGE_VALQ,
30904 IX86_BUILTIN_NANQ,
30905 IX86_BUILTIN_NANSQ,
30906 IX86_BUILTIN_XABORT,
30907 IX86_BUILTIN_ADDCARRYX32,
30908 IX86_BUILTIN_ADDCARRYX64,
30909 IX86_BUILTIN_SBB32,
30910 IX86_BUILTIN_SBB64,
30911 IX86_BUILTIN_RDRAND16_STEP,
30912 IX86_BUILTIN_RDRAND32_STEP,
30913 IX86_BUILTIN_RDRAND64_STEP,
30914 IX86_BUILTIN_RDSEED16_STEP,
30915 IX86_BUILTIN_RDSEED32_STEP,
30916 IX86_BUILTIN_RDSEED64_STEP,
30917 IX86_BUILTIN_MONITORX,
30918 IX86_BUILTIN_MWAITX,
30919 IX86_BUILTIN_CFSTRING,
30920 IX86_BUILTIN_CPU_INIT,
30921 IX86_BUILTIN_CPU_IS,
30922 IX86_BUILTIN_CPU_SUPPORTS,
30923 IX86_BUILTIN_READ_FLAGS,
30924 IX86_BUILTIN_WRITE_FLAGS,
30925
30926 /* All the remaining builtins are tracked in bdesc_* arrays in
30927 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30928 this point. */
30929 #define BDESC(mask, icode, name, code, comparison, flag) \
30930 code,
30931 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30932 code, \
30933 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30934 #define BDESC_END(kind, next_kind)
30935
30936 #include "i386-builtin.def"
30937
30938 #undef BDESC
30939 #undef BDESC_FIRST
30940 #undef BDESC_END
30941
30942 IX86_BUILTIN_MAX,
30943
30944 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30945
30946 /* Now just the aliases for bdesc_* start/end. */
30947 #define BDESC(mask, icode, name, code, comparison, flag)
30948 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30949 #define BDESC_END(kind, next_kind) \
30950 IX86_BUILTIN__BDESC_##kind##_LAST \
30951 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30952
30953 #include "i386-builtin.def"
30954
30955 #undef BDESC
30956 #undef BDESC_FIRST
30957 #undef BDESC_END
30958
30959 /* Just to make sure there is no comma after the last enumerator. */
30960 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30961 };
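/* For illustration (a hypothetical descriptor, not copied from
   i386-builtin.def): for an entry of the form

       BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing,
              "__builtin_ia32_example", IX86_BUILTIN_EXAMPLE, UNKNOWN, 0)

   the first #include above contributes only the enumerator
   IX86_BUILTIN_EXAMPLE (plus an IX86_BUILTIN__BDESC_<kind>_FIRST alias for
   the first entry of each table), and the second #include emits the
   matching IX86_BUILTIN__BDESC_<kind>_LAST aliases so the bdesc_* tables
   can be indexed directly.  */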
30962
30963 /* Table for the ix86 builtin decls. */
30964 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30965
30966 /* Table of all of the builtin functions that are possible with different ISAs
30967 but are waiting to be built until a function is declared to use that
30968 ISA. */
30969 struct builtin_isa {
30970 const char *name; /* function name */
30971 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30972 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30973 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30974 bool const_p; /* true if the declaration is constant */
30975 bool leaf_p; /* true if the declaration has leaf attribute */
30976 bool nothrow_p; /* true if the declaration has nothrow attribute */
30977 bool set_and_not_built_p; /* true if recorded but the decl is not yet built */
30978 };
30979
30980 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30981
30982 /* Bits that can still enable any inclusion of a builtin. */
30983 static HOST_WIDE_INT deferred_isa_values = 0;
30984 static HOST_WIDE_INT deferred_isa_values2 = 0;
30985
30986 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30987 of isa_flags to use in the ix86_builtins_isa array. Store the
30988 function decl in the ix86_builtins array. Return the function decl, or
30989 NULL_TREE if the builtin was not added.
30990
30991 If the front end has a special hook for builtin functions, delay adding
30992 builtin functions that aren't in the current ISA until the ISA is changed
30993 with function specific optimization. Doing so can save about 300K for the
30994 default compiler. When the builtin is expanded, check at that time whether
30995 it is valid.
30996
30997 If the front end doesn't have a special hook, record all builtins, even
30998 those for instruction sets outside the current ISA, in case the user uses
30999 function specific options for a different ISA, so that we don't get scope
31000 errors if a builtin is added in the middle of a function scope. */
31001
31002 static inline tree
31003 def_builtin (HOST_WIDE_INT mask, const char *name,
31004 enum ix86_builtin_func_type tcode,
31005 enum ix86_builtins code)
31006 {
31007 tree decl = NULL_TREE;
31008
31009 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
31010 {
31011 ix86_builtins_isa[(int) code].isa = mask;
31012
31013 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
31014 where any set bit means the built-in is enabled, this bit must be *and-ed*
31015 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
31016 means that *both* cpuid bits must be set for the built-in to be available.
31017 Handle this here. */
31018 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31019 mask &= ~OPTION_MASK_ISA_AVX512VL;
31020
31021 mask &= ~OPTION_MASK_ISA_64BIT;
31022 if (mask == 0
31023 || (mask & ix86_isa_flags) != 0
31024 || (lang_hooks.builtin_function
31025 == lang_hooks.builtin_function_ext_scope))
31026
31027 {
31028 tree type = ix86_get_builtin_func_type (tcode);
31029 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31030 NULL, NULL_TREE);
31031 ix86_builtins[(int) code] = decl;
31032 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31033 }
31034 else
31035 {
31036 /* Just record the MASK; a builtin with set_and_not_built_p == true
31037 can still be added later when its ISA becomes enabled. */
31038 deferred_isa_values |= mask;
31039 ix86_builtins[(int) code] = NULL_TREE;
31040 ix86_builtins_isa[(int) code].tcode = tcode;
31041 ix86_builtins_isa[(int) code].name = name;
31042 ix86_builtins_isa[(int) code].leaf_p = false;
31043 ix86_builtins_isa[(int) code].nothrow_p = false;
31044 ix86_builtins_isa[(int) code].const_p = false;
31045 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31046 }
31047 }
31048
31049 return decl;
31050 }
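/* For illustration (a sketch using a registration that appears below):

       def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
                    VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);

   builds the decl immediately when SSE2 is already in ix86_isa_flags;
   otherwise only the mask, name and type are recorded (set_and_not_built_p)
   and ix86_add_new_builtins creates the decl later, e.g. when a
   target("sse2") function enables the ISA.  */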
31051
31052 /* Like def_builtin, but also marks the function decl "const". */
31053
31054 static inline tree
31055 def_builtin_const (HOST_WIDE_INT mask, const char *name,
31056 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31057 {
31058 tree decl = def_builtin (mask, name, tcode, code);
31059 if (decl)
31060 TREE_READONLY (decl) = 1;
31061 else
31062 ix86_builtins_isa[(int) code].const_p = true;
31063
31064 return decl;
31065 }
31066
31067 /* Like def_builtin, but for additional isa2 flags. */
31068
31069 static inline tree
31070 def_builtin2 (HOST_WIDE_INT mask, const char *name,
31071 enum ix86_builtin_func_type tcode,
31072 enum ix86_builtins code)
31073 {
31074 tree decl = NULL_TREE;
31075
31076 ix86_builtins_isa[(int) code].isa2 = mask;
31077
31078 if (mask == 0
31079 || (mask & ix86_isa_flags2) != 0
31080 || (lang_hooks.builtin_function
31081 == lang_hooks.builtin_function_ext_scope))
31082
31083 {
31084 tree type = ix86_get_builtin_func_type (tcode);
31085 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31086 NULL, NULL_TREE);
31087 ix86_builtins[(int) code] = decl;
31088 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31089 }
31090 else
31091 {
31092 /* Just record the MASK; a builtin with set_and_not_built_p == true
31093 can still be added later when its ISA becomes enabled. */
31094 deferred_isa_values2 |= mask;
31095 ix86_builtins[(int) code] = NULL_TREE;
31096 ix86_builtins_isa[(int) code].tcode = tcode;
31097 ix86_builtins_isa[(int) code].name = name;
31098 ix86_builtins_isa[(int) code].leaf_p = false;
31099 ix86_builtins_isa[(int) code].nothrow_p = false;
31100 ix86_builtins_isa[(int) code].const_p = false;
31101 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31102 }
31103
31104 return decl;
31105 }
31106
31107 /* Like def_builtin, but also marks the function decl "const". */
31108
31109 static inline tree
31110 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31111 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31112 {
31113 tree decl = def_builtin2 (mask, name, tcode, code);
31114 if (decl)
31115 TREE_READONLY (decl) = 1;
31116 else
31117 ix86_builtins_isa[(int) code].const_p = true;
31118
31119 return decl;
31120 }
31121
31122 /* Add any new builtin functions for a given ISA that may not have been
31123 declared. This saves a bit of space compared to adding all of the
31124 declarations to the tree, even if we didn't use them. */
31125
31126 static void
31127 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31128 {
31129 if ((isa & deferred_isa_values) == 0
31130 && (isa2 & deferred_isa_values2) == 0)
31131 return;
31132
31133 /* Bits in ISA value can be removed from potential isa values. */
31134 deferred_isa_values &= ~isa;
31135 deferred_isa_values2 &= ~isa2;
31136
31137 int i;
31138 tree saved_current_target_pragma = current_target_pragma;
31139 current_target_pragma = NULL_TREE;
31140
31141 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31142 {
31143 if (((ix86_builtins_isa[i].isa & isa) != 0
31144 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31145 && ix86_builtins_isa[i].set_and_not_built_p)
31146 {
31147 tree decl, type;
31148
31149 /* Don't define the builtin again. */
31150 ix86_builtins_isa[i].set_and_not_built_p = false;
31151
31152 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31153 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31154 type, i, BUILT_IN_MD, NULL,
31155 NULL_TREE);
31156
31157 ix86_builtins[i] = decl;
31158 if (ix86_builtins_isa[i].const_p)
31159 TREE_READONLY (decl) = 1;
31160 if (ix86_builtins_isa[i].leaf_p)
31161 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31162 NULL_TREE);
31163 if (ix86_builtins_isa[i].nothrow_p)
31164 TREE_NOTHROW (decl) = 1;
31165 }
31166 }
31167
31168 current_target_pragma = saved_current_target_pragma;
31169 }
31170
31171 /* Bits for builtin_description.flag. */
31172
31173 /* Set when we don't support the comparison natively, and should
31174 swap_comparison in order to support it. */
31175 #define BUILTIN_DESC_SWAP_OPERANDS 1
31176
31177 struct builtin_description
31178 {
31179 const HOST_WIDE_INT mask;
31180 const enum insn_code icode;
31181 const char *const name;
31182 const enum ix86_builtins code;
31183 const enum rtx_code comparison;
31184 const int flag;
31185 };
31186
31187 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31188 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31189 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31190 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31191 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31192 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31193 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31194 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31195 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31196 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31197 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31198 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31199 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31200 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31201 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31202 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31203 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31204 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31205 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31206 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31207 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31208 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31209 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31210 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31211 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31212 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31213 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31214 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31215 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31216 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31217 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31218 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31219 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31220 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31221 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31222 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31223 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31224 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31225 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31226 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31227 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31228 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31229 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31230 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31231 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31232 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31233 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31234 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31235 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31236 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31237 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31238 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31239
31240 #define BDESC(mask, icode, name, code, comparison, flag) \
31241 { mask, icode, name, code, comparison, flag },
31242 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31243 static const struct builtin_description bdesc_##kind[] = \
31244 { \
31245 BDESC (mask, icode, name, code, comparison, flag)
31246 #define BDESC_END(kind, next_kind) \
31247 };
31248
31249 #include "i386-builtin.def"
31250
31251 #undef BDESC
31252 #undef BDESC_FIRST
31253 #undef BDESC_END
31254 \f
31255 /* TM vector builtins. */
31256
31257 /* Reuse the existing x86-specific `struct builtin_description' because
31258 we're lazy. Add casts to make them fit. */
31259 static const struct builtin_description bdesc_tm[] =
31260 {
31261 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31262 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31263 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31264 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31265 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31266 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31267 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31268
31269 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31270 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31271 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31272 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31273 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31274 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31275 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31276
31277 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31278 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31279 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31280 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31281 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31282 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31283 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31284
31285 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31286 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31287 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31288 };
31289
31290 /* Initialize the transactional memory vector load/store builtins. */
31291
31292 static void
31293 ix86_init_tm_builtins (void)
31294 {
31295 enum ix86_builtin_func_type ftype;
31296 const struct builtin_description *d;
31297 size_t i;
31298 tree decl;
31299 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31300 tree attrs_log, attrs_type_log;
31301
31302 if (!flag_tm)
31303 return;
31304
31305 /* If there are no builtins defined, we must be compiling in a
31306 language without trans-mem support. */
31307 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31308 return;
31309
31310 /* Use whatever attributes a normal TM load has. */
31311 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31312 attrs_load = DECL_ATTRIBUTES (decl);
31313 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31314 /* Use whatever attributes a normal TM store has. */
31315 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31316 attrs_store = DECL_ATTRIBUTES (decl);
31317 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31318 /* Use whatever attributes a normal TM log has. */
31319 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31320 attrs_log = DECL_ATTRIBUTES (decl);
31321 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31322
31323 for (i = 0, d = bdesc_tm;
31324 i < ARRAY_SIZE (bdesc_tm);
31325 i++, d++)
31326 {
31327 if ((d->mask & ix86_isa_flags) != 0
31328 || (lang_hooks.builtin_function
31329 == lang_hooks.builtin_function_ext_scope))
31330 {
31331 tree type, attrs, attrs_type;
31332 enum built_in_function code = (enum built_in_function) d->code;
31333
31334 ftype = (enum ix86_builtin_func_type) d->flag;
31335 type = ix86_get_builtin_func_type (ftype);
31336
31337 if (BUILTIN_TM_LOAD_P (code))
31338 {
31339 attrs = attrs_load;
31340 attrs_type = attrs_type_load;
31341 }
31342 else if (BUILTIN_TM_STORE_P (code))
31343 {
31344 attrs = attrs_store;
31345 attrs_type = attrs_type_store;
31346 }
31347 else
31348 {
31349 attrs = attrs_log;
31350 attrs_type = attrs_type_log;
31351 }
31352 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31353 /* The builtin without the prefix for
31354 calling it directly. */
31355 d->name + strlen ("__builtin_"),
31356 attrs);
31357 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31358 set the TYPE_ATTRIBUTES. */
31359 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31360
31361 set_builtin_decl (code, decl, false);
31362 }
31363 }
31364 }
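/* For illustration (a sketch of the loop above applied to the first
   bdesc_tm entry): "__builtin__ITM_WM64" is registered with the attributes
   of a normal TM store, made callable directly as "_ITM_WM64"
   (d->name + strlen ("__builtin_")), and bound to BUILT_IN_TM_STORE_M64
   through set_builtin_decl.  */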
31365
31366 /* Macros for verification of enum ix86_builtins order. */
31367 #define BDESC_VERIFY(x, y, z) \
31368 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31369 #define BDESC_VERIFYS(x, y, z) \
31370 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31371
31372 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31373 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31374 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31375 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31376 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31377 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31378 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31379 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31380 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31381 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31382 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31383 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31384 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31385 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31386 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31387 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31388 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31389 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31390 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31391 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31392
31393 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31394 in the current target ISA to allow the user to compile particular modules
31395 with different target specific options that differ from the command line
31396 options. */
31397 static void
31398 ix86_init_mmx_sse_builtins (void)
31399 {
31400 const struct builtin_description * d;
31401 enum ix86_builtin_func_type ftype;
31402 size_t i;
31403
31404 /* Add all special builtins with variable number of operands. */
31405 for (i = 0, d = bdesc_special_args;
31406 i < ARRAY_SIZE (bdesc_special_args);
31407 i++, d++)
31408 {
31409 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31410 if (d->name == 0)
31411 continue;
31412
31413 ftype = (enum ix86_builtin_func_type) d->flag;
31414 def_builtin (d->mask, d->name, ftype, d->code);
31415 }
31416 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31417 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31418 ARRAY_SIZE (bdesc_special_args) - 1);
31419
31420 /* Add all builtins with variable number of operands. */
31421 for (i = 0, d = bdesc_args;
31422 i < ARRAY_SIZE (bdesc_args);
31423 i++, d++)
31424 {
31425 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31426 if (d->name == 0)
31427 continue;
31428
31429 ftype = (enum ix86_builtin_func_type) d->flag;
31430 def_builtin_const (d->mask, d->name, ftype, d->code);
31431 }
31432 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31433 IX86_BUILTIN__BDESC_ARGS_FIRST,
31434 ARRAY_SIZE (bdesc_args) - 1);
31435
31436 /* Add all builtins with variable number of operands that use the isa2 flags. */
31437 for (i = 0, d = bdesc_args2;
31438 i < ARRAY_SIZE (bdesc_args2);
31439 i++, d++)
31440 {
31441 if (d->name == 0)
31442 continue;
31443
31444 ftype = (enum ix86_builtin_func_type) d->flag;
31445 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31446 }
31447
31448 /* Add all builtins with rounding. */
31449 for (i = 0, d = bdesc_round_args;
31450 i < ARRAY_SIZE (bdesc_round_args);
31451 i++, d++)
31452 {
31453 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31454 if (d->name == 0)
31455 continue;
31456
31457 ftype = (enum ix86_builtin_func_type) d->flag;
31458 def_builtin_const (d->mask, d->name, ftype, d->code);
31459 }
31460 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31461 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31462 ARRAY_SIZE (bdesc_round_args) - 1);
31463
31464 /* pcmpestr[im] insns. */
31465 for (i = 0, d = bdesc_pcmpestr;
31466 i < ARRAY_SIZE (bdesc_pcmpestr);
31467 i++, d++)
31468 {
31469 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31470 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31471 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31472 else
31473 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31474 def_builtin_const (d->mask, d->name, ftype, d->code);
31475 }
31476 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31477 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31478 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31479
31480 /* pcmpistr[im] insns. */
31481 for (i = 0, d = bdesc_pcmpistr;
31482 i < ARRAY_SIZE (bdesc_pcmpistr);
31483 i++, d++)
31484 {
31485 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31486 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31487 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31488 else
31489 ftype = INT_FTYPE_V16QI_V16QI_INT;
31490 def_builtin_const (d->mask, d->name, ftype, d->code);
31491 }
31492 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31493 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31494 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31495
31496 /* comi/ucomi insns. */
31497 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31498 {
31499 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31500 if (d->mask == OPTION_MASK_ISA_SSE2)
31501 ftype = INT_FTYPE_V2DF_V2DF;
31502 else
31503 ftype = INT_FTYPE_V4SF_V4SF;
31504 def_builtin_const (d->mask, d->name, ftype, d->code);
31505 }
31506 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31507 IX86_BUILTIN__BDESC_COMI_FIRST,
31508 ARRAY_SIZE (bdesc_comi) - 1);
31509
31510 /* SSE */
31511 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31512 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31513 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31514 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31515
31516 /* SSE or 3DNow!A */
31517 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31518 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31519 IX86_BUILTIN_MASKMOVQ);
31520
31521 /* SSE2 */
31522 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31523 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31524
31525 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31526 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31527 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31528 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31529
31530 /* SSE3. */
31531 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31532 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31533 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31534 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31535
31536 /* AES */
31537 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31538 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31539 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31540 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31541 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31542 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31543 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31544 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31545 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31546 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31547 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31548 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31549
31550 /* PCLMUL */
31551 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31552 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31553
31554 /* RDRND */
31555 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31556 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31557 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31558 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31559 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31560 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31561 IX86_BUILTIN_RDRAND64_STEP);
31562
31563 /* AVX2 */
31564 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31565 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31566 IX86_BUILTIN_GATHERSIV2DF);
31567
31568 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31569 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31570 IX86_BUILTIN_GATHERSIV4DF);
31571
31572 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31573 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31574 IX86_BUILTIN_GATHERDIV2DF);
31575
31576 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31577 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31578 IX86_BUILTIN_GATHERDIV4DF);
31579
31580 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31581 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31582 IX86_BUILTIN_GATHERSIV4SF);
31583
31584 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31585 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31586 IX86_BUILTIN_GATHERSIV8SF);
31587
31588 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31589 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31590 IX86_BUILTIN_GATHERDIV4SF);
31591
31592 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31593 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31594 IX86_BUILTIN_GATHERDIV8SF);
31595
31596 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31597 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31598 IX86_BUILTIN_GATHERSIV2DI);
31599
31600 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31601 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31602 IX86_BUILTIN_GATHERSIV4DI);
31603
31604 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31605 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31606 IX86_BUILTIN_GATHERDIV2DI);
31607
31608 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31609 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31610 IX86_BUILTIN_GATHERDIV4DI);
31611
31612 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31613 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31614 IX86_BUILTIN_GATHERSIV4SI);
31615
31616 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31617 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31618 IX86_BUILTIN_GATHERSIV8SI);
31619
31620 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31621 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31622 IX86_BUILTIN_GATHERDIV4SI);
31623
31624 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31625 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31626 IX86_BUILTIN_GATHERDIV8SI);
31627
31628 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31629 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31630 IX86_BUILTIN_GATHERALTSIV4DF);
31631
31632 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31633 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31634 IX86_BUILTIN_GATHERALTDIV8SF);
31635
31636 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31637 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31638 IX86_BUILTIN_GATHERALTSIV4DI);
31639
31640 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31641 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31642 IX86_BUILTIN_GATHERALTDIV8SI);
31643
31644 /* AVX512F */
31645 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31646 V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
31647 IX86_BUILTIN_GATHER3SIV16SF);
31648
31649 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31650 V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
31651 IX86_BUILTIN_GATHER3SIV8DF);
31652
31653 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31654 V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
31655 IX86_BUILTIN_GATHER3DIV16SF);
31656
31657 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31658 V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
31659 IX86_BUILTIN_GATHER3DIV8DF);
31660
31661 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31662 V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
31663 IX86_BUILTIN_GATHER3SIV16SI);
31664
31665 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31666 V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
31667 IX86_BUILTIN_GATHER3SIV8DI);
31668
31669 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31670 V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
31671 IX86_BUILTIN_GATHER3DIV16SI);
31672
31673 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31674 V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
31675 IX86_BUILTIN_GATHER3DIV8DI);
31676
31677 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31678 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31679 IX86_BUILTIN_GATHER3ALTSIV8DF);
31680
31681 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31682 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31683 IX86_BUILTIN_GATHER3ALTDIV16SF);
31684
31685 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31686 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31687 IX86_BUILTIN_GATHER3ALTSIV8DI);
31688
31689 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31690 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31691 IX86_BUILTIN_GATHER3ALTDIV16SI);
31692
31693 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31694 VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
31695 IX86_BUILTIN_SCATTERSIV16SF);
31696
31697 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31698 VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
31699 IX86_BUILTIN_SCATTERSIV8DF);
31700
31701 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31702 VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
31703 IX86_BUILTIN_SCATTERDIV16SF);
31704
31705 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31706 VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
31707 IX86_BUILTIN_SCATTERDIV8DF);
31708
31709 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31710 VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
31711 IX86_BUILTIN_SCATTERSIV16SI);
31712
31713 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31714 VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
31715 IX86_BUILTIN_SCATTERSIV8DI);
31716
31717 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31718 VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
31719 IX86_BUILTIN_SCATTERDIV16SI);
31720
31721 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31722 VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
31723 IX86_BUILTIN_SCATTERDIV8DI);
31724
31725 /* AVX512VL */
31726 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31727 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_QI_INT,
31728 IX86_BUILTIN_GATHER3SIV2DF);
31729
31730 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31731 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_QI_INT,
31732 IX86_BUILTIN_GATHER3SIV4DF);
31733
31734 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31735 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_QI_INT,
31736 IX86_BUILTIN_GATHER3DIV2DF);
31737
31738 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31739 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_QI_INT,
31740 IX86_BUILTIN_GATHER3DIV4DF);
31741
31742 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31743 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_QI_INT,
31744 IX86_BUILTIN_GATHER3SIV4SF);
31745
31746 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31747 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_QI_INT,
31748 IX86_BUILTIN_GATHER3SIV8SF);
31749
31750 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31751 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_QI_INT,
31752 IX86_BUILTIN_GATHER3DIV4SF);
31753
31754 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31755 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_QI_INT,
31756 IX86_BUILTIN_GATHER3DIV8SF);
31757
31758 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31759 V2DI_FTYPE_V2DI_PCINT64_V4SI_QI_INT,
31760 IX86_BUILTIN_GATHER3SIV2DI);
31761
31762 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31763 V4DI_FTYPE_V4DI_PCINT64_V4SI_QI_INT,
31764 IX86_BUILTIN_GATHER3SIV4DI);
31765
31766 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31767 V2DI_FTYPE_V2DI_PCINT64_V2DI_QI_INT,
31768 IX86_BUILTIN_GATHER3DIV2DI);
31769
31770 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31771 V4DI_FTYPE_V4DI_PCINT64_V4DI_QI_INT,
31772 IX86_BUILTIN_GATHER3DIV4DI);
31773
31774 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31775 V4SI_FTYPE_V4SI_PCINT_V4SI_QI_INT,
31776 IX86_BUILTIN_GATHER3SIV4SI);
31777
31778 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31779 V8SI_FTYPE_V8SI_PCINT_V8SI_QI_INT,
31780 IX86_BUILTIN_GATHER3SIV8SI);
31781
31782 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31783 V4SI_FTYPE_V4SI_PCINT_V2DI_QI_INT,
31784 IX86_BUILTIN_GATHER3DIV4SI);
31785
31786 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31787 V4SI_FTYPE_V4SI_PCINT_V4DI_QI_INT,
31788 IX86_BUILTIN_GATHER3DIV8SI);
31789
31790 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31791 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31792 IX86_BUILTIN_GATHER3ALTSIV4DF);
31793
31794 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31795 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31796 IX86_BUILTIN_GATHER3ALTDIV8SF);
31797
31798 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31799 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31800 IX86_BUILTIN_GATHER3ALTSIV4DI);
31801
31802 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31803 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31804 IX86_BUILTIN_GATHER3ALTDIV8SI);
31805
31806 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31807 VOID_FTYPE_PFLOAT_QI_V8SI_V8SF_INT,
31808 IX86_BUILTIN_SCATTERSIV8SF);
31809
31810 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31811 VOID_FTYPE_PFLOAT_QI_V4SI_V4SF_INT,
31812 IX86_BUILTIN_SCATTERSIV4SF);
31813
31814 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31815 VOID_FTYPE_PDOUBLE_QI_V4SI_V4DF_INT,
31816 IX86_BUILTIN_SCATTERSIV4DF);
31817
31818 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31819 VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
31820 IX86_BUILTIN_SCATTERSIV2DF);
31821
31822 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31823 VOID_FTYPE_PFLOAT_QI_V4DI_V4SF_INT,
31824 IX86_BUILTIN_SCATTERDIV8SF);
31825
31826 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31827 VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
31828 IX86_BUILTIN_SCATTERDIV4SF);
31829
31830 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31831 VOID_FTYPE_PDOUBLE_QI_V4DI_V4DF_INT,
31832 IX86_BUILTIN_SCATTERDIV4DF);
31833
31834 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31835 VOID_FTYPE_PDOUBLE_QI_V2DI_V2DF_INT,
31836 IX86_BUILTIN_SCATTERDIV2DF);
31837
31838 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31839 VOID_FTYPE_PINT_QI_V8SI_V8SI_INT,
31840 IX86_BUILTIN_SCATTERSIV8SI);
31841
31842 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31843 VOID_FTYPE_PINT_QI_V4SI_V4SI_INT,
31844 IX86_BUILTIN_SCATTERSIV4SI);
31845
31846 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31847 VOID_FTYPE_PLONGLONG_QI_V4SI_V4DI_INT,
31848 IX86_BUILTIN_SCATTERSIV4DI);
31849
31850 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31851 VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
31852 IX86_BUILTIN_SCATTERSIV2DI);
31853
31854 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31855 VOID_FTYPE_PINT_QI_V4DI_V4SI_INT,
31856 IX86_BUILTIN_SCATTERDIV8SI);
31857
31858 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31859 VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
31860 IX86_BUILTIN_SCATTERDIV4SI);
31861
31862 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31863 VOID_FTYPE_PLONGLONG_QI_V4DI_V4DI_INT,
31864 IX86_BUILTIN_SCATTERDIV4DI);
31865
31866 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31867 VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
31868 IX86_BUILTIN_SCATTERDIV2DI);
31869 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31870 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31871 IX86_BUILTIN_SCATTERALTSIV8DF);
31872
31873 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31874 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31875 IX86_BUILTIN_SCATTERALTDIV16SF);
31876
31877 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31878 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31879 IX86_BUILTIN_SCATTERALTSIV8DI);
31880
31881 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31882 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31883 IX86_BUILTIN_SCATTERALTDIV16SI);
31884
31885 /* AVX512PF */
31886 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31887 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31888 IX86_BUILTIN_GATHERPFDPD);
31889 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31890 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31891 IX86_BUILTIN_GATHERPFDPS);
31892 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31893 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31894 IX86_BUILTIN_GATHERPFQPD);
31895 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31896 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31897 IX86_BUILTIN_GATHERPFQPS);
31898 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31899 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31900 IX86_BUILTIN_SCATTERPFDPD);
31901 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31902 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31903 IX86_BUILTIN_SCATTERPFDPS);
31904 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31905 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31906 IX86_BUILTIN_SCATTERPFQPD);
31907 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31908 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31909 IX86_BUILTIN_SCATTERPFQPS);
31910
31911 /* SHA */
31912 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31913 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31914 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31915 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31916 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31917 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31918 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31919 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31920 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31921 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31922 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31923 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31924 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31925 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31926
31927 /* RTM. */
31928 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31929 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31930
31931 /* MMX access to the vec_init patterns. */
31932 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31933 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31934
31935 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31936 V4HI_FTYPE_HI_HI_HI_HI,
31937 IX86_BUILTIN_VEC_INIT_V4HI);
31938
31939 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31940 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31941 IX86_BUILTIN_VEC_INIT_V8QI);
31942
31943 /* Access to the vec_extract patterns. */
31944 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31945 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31946 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31947 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31948 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31949 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31950 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31951 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31952 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31953 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31954
31955 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31956 "__builtin_ia32_vec_ext_v4hi",
31957 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31958
31959 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31960 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31961
31962 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31963 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31964
31965 /* Access to the vec_set patterns. */
31966 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31967 "__builtin_ia32_vec_set_v2di",
31968 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31969
31970 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31971 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31972
31973 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31974 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31975
31976 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31977 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31978
31979 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31980 "__builtin_ia32_vec_set_v4hi",
31981 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31982
31983 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31984 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31985
31986 /* RDSEED */
31987 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31988 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31989 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31990 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31991 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31992 "__builtin_ia32_rdseed_di_step",
31993 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31994
31995 /* ADCX */
31996 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31997 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31998 def_builtin (OPTION_MASK_ISA_64BIT,
31999 "__builtin_ia32_addcarryx_u64",
32000 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32001 IX86_BUILTIN_ADDCARRYX64);
32002
32003 /* SBB */
32004 def_builtin (0, "__builtin_ia32_sbb_u32",
32005 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
32006 def_builtin (OPTION_MASK_ISA_64BIT,
32007 "__builtin_ia32_sbb_u64",
32008 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32009 IX86_BUILTIN_SBB64);
32010
32011 /* Read/write FLAGS. */
32012 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
32013 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32014 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
32015 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32016 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
32017 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
32018 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
32019 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
32020
32021 /* CLFLUSHOPT. */
32022 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
32023 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
32024
32025 /* CLWB. */
32026 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
32027 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
32028
32029 /* MONITORX and MWAITX. */
32030 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
32031 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
32032 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
32033 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
32034
32035 /* CLZERO. */
32036 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
32037 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
32038
32039 /* Add FMA4 multi-arg instructions. */
32040 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32041 {
32042 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
32043 if (d->name == 0)
32044 continue;
32045
32046 ftype = (enum ix86_builtin_func_type) d->flag;
32047 def_builtin_const (d->mask, d->name, ftype, d->code);
32048 }
32049 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
32050 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32051 ARRAY_SIZE (bdesc_multi_arg) - 1);
32052 }
32053
32054 static void
32055 ix86_init_mpx_builtins ()
32056 {
32057 const struct builtin_description * d;
32058 enum ix86_builtin_func_type ftype;
32059 tree decl;
32060 size_t i;
32061
32062 for (i = 0, d = bdesc_mpx;
32063 i < ARRAY_SIZE (bdesc_mpx);
32064 i++, d++)
32065 {
32066 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32067 if (d->name == 0)
32068 continue;
32069
32070 ftype = (enum ix86_builtin_func_type) d->flag;
32071 decl = def_builtin (d->mask, d->name, ftype, d->code);
32072
32073 /* If the leaf and nothrow flags are not set for MPX builtins,
32074 abnormal edges may follow their calls when setjmp is present
32075 in the function. Since there may be many MPX builtin calls,
32076 this causes lots of useless edges and enormous PHI nodes.
32077 To avoid this we mark MPX builtins as leaf and
32078 nothrow. */
32079 if (decl)
32080 {
32081 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32082 NULL_TREE);
32083 TREE_NOTHROW (decl) = 1;
32084 }
32085 else
32086 {
32087 ix86_builtins_isa[(int)d->code].leaf_p = true;
32088 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32089 }
32090 }
32091 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32092 IX86_BUILTIN__BDESC_MPX_FIRST,
32093 ARRAY_SIZE (bdesc_mpx) - 1);
32094
32095 for (i = 0, d = bdesc_mpx_const;
32096 i < ARRAY_SIZE (bdesc_mpx_const);
32097 i++, d++)
32098 {
32099 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32100 if (d->name == 0)
32101 continue;
32102
32103 ftype = (enum ix86_builtin_func_type) d->flag;
32104 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
32105
32106 if (decl)
32107 {
32108 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32109 NULL_TREE);
32110 TREE_NOTHROW (decl) = 1;
32111 }
32112 else
32113 {
32114 ix86_builtins_isa[(int)d->code].leaf_p = true;
32115 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32116 }
32117 }
32118 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32119 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32120 ARRAY_SIZE (bdesc_mpx_const) - 1);
32121 }
32122 #undef BDESC_VERIFY
32123 #undef BDESC_VERIFYS
32124
32125 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32126 to return a pointer to VERSION_DECL if the outcome of the expression
32127 formed by PREDICATE_CHAIN is true. This function will be called during
32128 version dispatch to decide which function version to execute. It returns
32129 the basic block at the end, to which more conditions can be added. */
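
/* As an illustrative sketch (not the exact GIMPLE that is built), each
   call conceptually appends to NEW_BB something like:

       cond_1 = predicate_1 (arg_1);     e.g. __builtin_cpu_is ("core2")
       cond_2 = predicate_2 (arg_2);
       and_expr = MIN_EXPR <cond_1, cond_2>;
       if (and_expr > 0)
         return (void *) &version_decl;

   and returns the fall-through block where the next version's condition
   can be emitted.  A NULL PREDICATE_CHAIN emits an unconditional return
   and is used for the default version.  */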
32130
32131 static basic_block
32132 add_condition_to_bb (tree function_decl, tree version_decl,
32133 tree predicate_chain, basic_block new_bb)
32134 {
32135 gimple *return_stmt;
32136 tree convert_expr, result_var;
32137 gimple *convert_stmt;
32138 gimple *call_cond_stmt;
32139 gimple *if_else_stmt;
32140
32141 basic_block bb1, bb2, bb3;
32142 edge e12, e23;
32143
32144 tree cond_var, and_expr_var = NULL_TREE;
32145 gimple_seq gseq;
32146
32147 tree predicate_decl, predicate_arg;
32148
32149 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32150
32151 gcc_assert (new_bb != NULL);
32152 gseq = bb_seq (new_bb);
32153
32154
32155 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32156 build_fold_addr_expr (version_decl));
32157 result_var = create_tmp_var (ptr_type_node);
32158 convert_stmt = gimple_build_assign (result_var, convert_expr);
32159 return_stmt = gimple_build_return (result_var);
32160
32161 if (predicate_chain == NULL_TREE)
32162 {
32163 gimple_seq_add_stmt (&gseq, convert_stmt);
32164 gimple_seq_add_stmt (&gseq, return_stmt);
32165 set_bb_seq (new_bb, gseq);
32166 gimple_set_bb (convert_stmt, new_bb);
32167 gimple_set_bb (return_stmt, new_bb);
32168 pop_cfun ();
32169 return new_bb;
32170 }
32171
32172 while (predicate_chain != NULL)
32173 {
32174 cond_var = create_tmp_var (integer_type_node);
32175 predicate_decl = TREE_PURPOSE (predicate_chain);
32176 predicate_arg = TREE_VALUE (predicate_chain);
32177 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32178 gimple_call_set_lhs (call_cond_stmt, cond_var);
32179
32180 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32181 gimple_set_bb (call_cond_stmt, new_bb);
32182 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32183
32184 predicate_chain = TREE_CHAIN (predicate_chain);
32185
32186 if (and_expr_var == NULL)
32187 and_expr_var = cond_var;
32188 else
32189 {
32190 gimple *assign_stmt;
32191 /* Use MIN_EXPR to check if any integer is zero:
32192 and_expr_var = min_expr <cond_var, and_expr_var> */
32193 assign_stmt = gimple_build_assign (and_expr_var,
32194 build2 (MIN_EXPR, integer_type_node,
32195 cond_var, and_expr_var));
32196
32197 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32198 gimple_set_bb (assign_stmt, new_bb);
32199 gimple_seq_add_stmt (&gseq, assign_stmt);
32200 }
32201 }
32202
32203 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32204 integer_zero_node,
32205 NULL_TREE, NULL_TREE);
32206 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32207 gimple_set_bb (if_else_stmt, new_bb);
32208 gimple_seq_add_stmt (&gseq, if_else_stmt);
32209
32210 gimple_seq_add_stmt (&gseq, convert_stmt);
32211 gimple_seq_add_stmt (&gseq, return_stmt);
32212 set_bb_seq (new_bb, gseq);
32213
32214 bb1 = new_bb;
32215 e12 = split_block (bb1, if_else_stmt);
32216 bb2 = e12->dest;
32217 e12->flags &= ~EDGE_FALLTHRU;
32218 e12->flags |= EDGE_TRUE_VALUE;
32219
32220 e23 = split_block (bb2, return_stmt);
32221
32222 gimple_set_bb (convert_stmt, bb2);
32223 gimple_set_bb (return_stmt, bb2);
32224
32225 bb3 = e23->dest;
32226 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32227
32228 remove_edge (e23);
32229 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32230
32231 pop_cfun ();
32232
32233 return bb3;
32234 }
32235
32236 /* This parses the attribute arguments to target in DECL and determines
32237 the right builtin to use to match the platform specification.
32238 It returns the priority value for this version decl. If PREDICATE_LIST
32239 is not NULL, it stores the list of cpu features that need to be checked
32240 before dispatching this function. */
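
/* For example (a sketch of the intended behaviour), a version declared
   with __attribute__ ((target ("arch=core2"))) gets priority P_PROC_SSSE3
   and, if PREDICATE_LIST is requested, a predicate chain equivalent to
   __builtin_cpu_is ("core2"); a version with target ("avx2") gets
   priority P_AVX2 and a predicate equivalent to
   __builtin_cpu_supports ("avx2").  The "default" version always gets
   priority zero.  */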
32241
32242 static unsigned int
32243 get_builtin_code_for_version (tree decl, tree *predicate_list)
32244 {
32245 tree attrs;
32246 struct cl_target_option cur_target;
32247 tree target_node;
32248 struct cl_target_option *new_target;
32249 const char *arg_str = NULL;
32250 const char *attrs_str = NULL;
32251 char *tok_str = NULL;
32252 char *token;
32253
32254 /* Priority of i386 features, greater value is higher priority. This is
32255 used to decide the order in which function dispatch must happen. For
32256 instance, a version specialized for SSE4.2 should be checked for dispatch
32257 before a version for SSE3, as SSE4.2 implies SSE3. */
32258 enum feature_priority
32259 {
32260 P_ZERO = 0,
32261 P_MMX,
32262 P_SSE,
32263 P_SSE2,
32264 P_SSE3,
32265 P_SSSE3,
32266 P_PROC_SSSE3,
32267 P_SSE4_A,
32268 P_PROC_SSE4_A,
32269 P_SSE4_1,
32270 P_SSE4_2,
32271 P_PROC_SSE4_2,
32272 P_POPCNT,
32273 P_AES,
32274 P_PCLMUL,
32275 P_AVX,
32276 P_PROC_AVX,
32277 P_BMI,
32278 P_PROC_BMI,
32279 P_FMA4,
32280 P_XOP,
32281 P_PROC_XOP,
32282 P_FMA,
32283 P_PROC_FMA,
32284 P_BMI2,
32285 P_AVX2,
32286 P_PROC_AVX2,
32287 P_AVX512F,
32288 P_PROC_AVX512F
32289 };
32290
32291 enum feature_priority priority = P_ZERO;
32292
32293 /* These are the target attribute strings for which a dispatcher is
32294 available, from fold_builtin_cpu. */
32295
32296 static struct _feature_list
32297 {
32298 const char *const name;
32299 const enum feature_priority priority;
32300 }
32301 const feature_list[] =
32302 {
32303 {"mmx", P_MMX},
32304 {"sse", P_SSE},
32305 {"sse2", P_SSE2},
32306 {"sse3", P_SSE3},
32307 {"sse4a", P_SSE4_A},
32308 {"ssse3", P_SSSE3},
32309 {"sse4.1", P_SSE4_1},
32310 {"sse4.2", P_SSE4_2},
32311 {"popcnt", P_POPCNT},
32312 {"aes", P_AES},
32313 {"pclmul", P_PCLMUL},
32314 {"avx", P_AVX},
32315 {"bmi", P_BMI},
32316 {"fma4", P_FMA4},
32317 {"xop", P_XOP},
32318 {"fma", P_FMA},
32319 {"bmi2", P_BMI2},
32320 {"avx2", P_AVX2},
32321 {"avx512f", P_AVX512F}
32322 };
32323
32324
32325 static unsigned int NUM_FEATURES
32326 = sizeof (feature_list) / sizeof (struct _feature_list);
32327
32328 unsigned int i;
32329
32330 tree predicate_chain = NULL_TREE;
32331 tree predicate_decl, predicate_arg;
32332
32333 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32334 gcc_assert (attrs != NULL);
32335
32336 attrs = TREE_VALUE (TREE_VALUE (attrs));
32337
32338 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32339 attrs_str = TREE_STRING_POINTER (attrs);
32340
32341 /* Return priority zero for default function. */
32342 if (strcmp (attrs_str, "default") == 0)
32343 return 0;
32344
32345 /* Handle arch= if specified. For priority, set it to be 1 more than
32346 the best instruction set the processor can handle. For instance, if
32347 there is a version for atom and a version for ssse3 (the highest ISA
32348 priority for atom), the atom version must be checked for dispatch
32349 before the ssse3 version. */
32350 if (strstr (attrs_str, "arch=") != NULL)
32351 {
32352 cl_target_option_save (&cur_target, &global_options);
32353 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32354 &global_options_set);
32355
32356 gcc_assert (target_node);
32357 new_target = TREE_TARGET_OPTION (target_node);
32358 gcc_assert (new_target);
32359
32360 if (new_target->arch_specified && new_target->arch > 0)
32361 {
32362 switch (new_target->arch)
32363 {
32364 case PROCESSOR_CORE2:
32365 arg_str = "core2";
32366 priority = P_PROC_SSSE3;
32367 break;
32368 case PROCESSOR_NEHALEM:
32369 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32370 arg_str = "westmere";
32371 else
32372 /* We translate "arch=corei7" and "arch=nehalem" to
32373 "corei7" so that it will be mapped to M_INTEL_COREI7
32374 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32375 arg_str = "corei7";
32376 priority = P_PROC_SSE4_2;
32377 break;
32378 case PROCESSOR_SANDYBRIDGE:
32379 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32380 arg_str = "ivybridge";
32381 else
32382 arg_str = "sandybridge";
32383 priority = P_PROC_AVX;
32384 break;
32385 case PROCESSOR_HASWELL:
32386 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32387 arg_str = "skylake-avx512";
32388 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32389 arg_str = "skylake";
32390 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32391 arg_str = "broadwell";
32392 else
32393 arg_str = "haswell";
32394 priority = P_PROC_AVX2;
32395 break;
32396 case PROCESSOR_BONNELL:
32397 arg_str = "bonnell";
32398 priority = P_PROC_SSSE3;
32399 break;
32400 case PROCESSOR_KNL:
32401 arg_str = "knl";
32402 priority = P_PROC_AVX512F;
32403 break;
32404 case PROCESSOR_SILVERMONT:
32405 arg_str = "silvermont";
32406 priority = P_PROC_SSE4_2;
32407 break;
32408 case PROCESSOR_AMDFAM10:
32409 arg_str = "amdfam10h";
32410 priority = P_PROC_SSE4_A;
32411 break;
32412 case PROCESSOR_BTVER1:
32413 arg_str = "btver1";
32414 priority = P_PROC_SSE4_A;
32415 break;
32416 case PROCESSOR_BTVER2:
32417 arg_str = "btver2";
32418 priority = P_PROC_BMI;
32419 break;
32420 case PROCESSOR_BDVER1:
32421 arg_str = "bdver1";
32422 priority = P_PROC_XOP;
32423 break;
32424 case PROCESSOR_BDVER2:
32425 arg_str = "bdver2";
32426 priority = P_PROC_FMA;
32427 break;
32428 case PROCESSOR_BDVER3:
32429 arg_str = "bdver3";
32430 priority = P_PROC_FMA;
32431 break;
32432 case PROCESSOR_BDVER4:
32433 arg_str = "bdver4";
32434 priority = P_PROC_AVX2;
32435 break;
32436 case PROCESSOR_ZNVER1:
32437 arg_str = "znver1";
32438 priority = P_PROC_AVX2;
32439 break;
32440 }
32441 }
32442
32443 cl_target_option_restore (&global_options, &cur_target);
32444
32445 if (predicate_list && arg_str == NULL)
32446 {
32447 error_at (DECL_SOURCE_LOCATION (decl),
32448 "No dispatcher found for the versioning attributes");
32449 return 0;
32450 }
32451
32452 if (predicate_list)
32453 {
32454 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32455 /* For a C string literal the length includes the trailing NULL. */
32456 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32457 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32458 predicate_chain);
32459 }
32460 }
32461
32462 /* Process feature name. */
32463 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32464 strcpy (tok_str, attrs_str);
32465 token = strtok (tok_str, ",");
32466 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32467
32468 while (token != NULL)
32469 {
32470 /* Do not process "arch=" */
32471 if (strncmp (token, "arch=", 5) == 0)
32472 {
32473 token = strtok (NULL, ",");
32474 continue;
32475 }
32476 for (i = 0; i < NUM_FEATURES; ++i)
32477 {
32478 if (strcmp (token, feature_list[i].name) == 0)
32479 {
32480 if (predicate_list)
32481 {
32482 predicate_arg = build_string_literal (
32483 strlen (feature_list[i].name) + 1,
32484 feature_list[i].name);
32485 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32486 predicate_chain);
32487 }
32488 /* Find the maximum priority feature. */
32489 if (feature_list[i].priority > priority)
32490 priority = feature_list[i].priority;
32491
32492 break;
32493 }
32494 }
32495 if (predicate_list && i == NUM_FEATURES)
32496 {
32497 error_at (DECL_SOURCE_LOCATION (decl),
32498 "No dispatcher found for %s", token);
32499 return 0;
32500 }
32501 token = strtok (NULL, ",");
32502 }
32503 free (tok_str);
32504
32505 if (predicate_list && predicate_chain == NULL_TREE)
32506 {
32507 error_at (DECL_SOURCE_LOCATION (decl),
32508 "No dispatcher found for the versioning attributes : %s",
32509 attrs_str);
32510 return 0;
32511 }
32512 else if (predicate_list)
32513 {
32514 predicate_chain = nreverse (predicate_chain);
32515 *predicate_list = predicate_chain;
32516 }
32517
32518 return priority;
32519 }
32520
32521 /* This compares the priority of target features in function DECL1
32522 and DECL2. It returns positive value if DECL1 is higher priority,
32523 negative value if DECL2 is higher priority and 0 if they are the
32524 same. */
32525
32526 static int
32527 ix86_compare_version_priority (tree decl1, tree decl2)
32528 {
32529 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32530 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32531
32532 return (int)priority1 - (int)priority2;
32533 }
32534
32535 /* V1 and V2 point to function versions with different priorities
32536 based on the target ISA. This function compares their priorities. */
32537
32538 static int
32539 feature_compare (const void *v1, const void *v2)
32540 {
32541 typedef struct _function_version_info
32542 {
32543 tree version_decl;
32544 tree predicate_chain;
32545 unsigned int dispatch_priority;
32546 } function_version_info;
32547
32548 const function_version_info c1 = *(const function_version_info *)v1;
32549 const function_version_info c2 = *(const function_version_info *)v2;
32550 return (c2.dispatch_priority - c1.dispatch_priority);
32551 }
32552
32553 /* This function generates the dispatch function for
32554 multi-versioned functions. DISPATCH_DECL is the function which will
32555 contain the dispatch logic. FNDECLS are the function choices for
32556 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32557 in DISPATCH_DECL in which the dispatch code is generated. */
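
/* Schematically (a simplified sketch; names are illustrative), with the
   versions sorted by descending ISA priority the generated body is:

       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2")) return &foo.avx2;
       if (__builtin_cpu_supports ("sse4.2")) return &foo.sse4.2;
       return &foo.default_version;

   The actual conditions come from each version's predicate chain, built
   by get_builtin_code_for_version above.  */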
32558
32559 static int
32560 dispatch_function_versions (tree dispatch_decl,
32561 void *fndecls_p,
32562 basic_block *empty_bb)
32563 {
32564 tree default_decl;
32565 gimple *ifunc_cpu_init_stmt;
32566 gimple_seq gseq;
32567 int ix;
32568 tree ele;
32569 vec<tree> *fndecls;
32570 unsigned int num_versions = 0;
32571 unsigned int actual_versions = 0;
32572 unsigned int i;
32573
32574 struct _function_version_info
32575 {
32576 tree version_decl;
32577 tree predicate_chain;
32578 unsigned int dispatch_priority;
32579 } *function_version_info;
32580
32581 gcc_assert (dispatch_decl != NULL
32582 && fndecls_p != NULL
32583 && empty_bb != NULL);
32584
32585 /* fndecls_p is actually a vector. */
32586 fndecls = static_cast<vec<tree> *> (fndecls_p);
32587
32588 /* At least one more version other than the default. */
32589 num_versions = fndecls->length ();
32590 gcc_assert (num_versions >= 2);
32591
32592 function_version_info = (struct _function_version_info *)
32593 XNEWVEC (struct _function_version_info, (num_versions - 1));
32594
32595 /* The first version in the vector is the default decl. */
32596 default_decl = (*fndecls)[0];
32597
32598 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32599
32600 gseq = bb_seq (*empty_bb);
32601 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32602 constructors, so explicitly call __builtin_cpu_init here. */
32603 ifunc_cpu_init_stmt = gimple_build_call_vec (
32604 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32605 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32606 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32607 set_bb_seq (*empty_bb, gseq);
32608
32609 pop_cfun ();
32610
32611
32612 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32613 {
32614 tree version_decl = ele;
32615 tree predicate_chain = NULL_TREE;
32616 unsigned int priority;
32617 /* Get attribute string, parse it and find the right predicate decl.
32618 The predicate function could be a lengthy combination of many
32619 features, like arch-type and various isa-variants. */
32620 priority = get_builtin_code_for_version (version_decl,
32621 &predicate_chain);
32622
32623 if (predicate_chain == NULL_TREE)
32624 continue;
32625
32626 function_version_info [actual_versions].version_decl = version_decl;
32627 function_version_info [actual_versions].predicate_chain
32628 = predicate_chain;
32629 function_version_info [actual_versions].dispatch_priority = priority;
32630 actual_versions++;
32631 }
32632
32633 /* Sort the versions according to descending order of dispatch priority. The
32634 priority is based on the ISA. This is not a perfect solution. There
32635 could still be ambiguity. If more than one function version is suitable
32636 to execute, which one should be dispatched? In the future, allow the user
32637 to specify a dispatch priority next to the version. */
32638 qsort (function_version_info, actual_versions,
32639 sizeof (struct _function_version_info), feature_compare);
32640
32641 for (i = 0; i < actual_versions; ++i)
32642 *empty_bb = add_condition_to_bb (dispatch_decl,
32643 function_version_info[i].version_decl,
32644 function_version_info[i].predicate_chain,
32645 *empty_bb);
32646
32647 /* Dispatch the default version at the end. */
32648 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32649 NULL, *empty_bb);
32650
32651 free (function_version_info);
32652 return 0;
32653 }
32654
32655 /* Comparator function to be used in qsort routine to sort attribute
32656 specification strings to "target". */
32657
32658 static int
32659 attr_strcmp (const void *v1, const void *v2)
32660 {
32661 const char *c1 = *(char *const*)v1;
32662 const char *c2 = *(char *const*)v2;
32663 return strcmp (c1, c2);
32664 }
32665
32666 /* ARGLIST is the argument to target attribute. This function tokenizes
32667 the comma separated arguments, sorts them and returns a string which
32668 is a unique identifier for the comma separated arguments. It also
32669 replaces non-identifier characters "=,-" with "_". */
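
/* For example (illustrative), the attribute arguments
   "arch=haswell,avx2" are rewritten to "arch_haswell,avx2", tokenized
   into { "arch_haswell", "avx2" }, sorted, and joined to give
   "arch_haswell_avx2".  */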
32670
32671 static char *
32672 sorted_attr_string (tree arglist)
32673 {
32674 tree arg;
32675 size_t str_len_sum = 0;
32676 char **args = NULL;
32677 char *attr_str, *ret_str;
32678 char *attr = NULL;
32679 unsigned int argnum = 1;
32680 unsigned int i;
32681
32682 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32683 {
32684 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32685 size_t len = strlen (str);
32686 str_len_sum += len + 1;
32687 if (arg != arglist)
32688 argnum++;
32689 for (i = 0; i < strlen (str); i++)
32690 if (str[i] == ',')
32691 argnum++;
32692 }
32693
32694 attr_str = XNEWVEC (char, str_len_sum);
32695 str_len_sum = 0;
32696 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32697 {
32698 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32699 size_t len = strlen (str);
32700 memcpy (attr_str + str_len_sum, str, len);
32701 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32702 str_len_sum += len + 1;
32703 }
32704
32705 /* Replace "=,-" with "_". */
32706 for (i = 0; i < strlen (attr_str); i++)
32707 if (attr_str[i] == '=' || attr_str[i]== '-')
32708 attr_str[i] = '_';
32709
32710 if (argnum == 1)
32711 return attr_str;
32712
32713 args = XNEWVEC (char *, argnum);
32714
32715 i = 0;
32716 attr = strtok (attr_str, ",");
32717 while (attr != NULL)
32718 {
32719 args[i] = attr;
32720 i++;
32721 attr = strtok (NULL, ",");
32722 }
32723
32724 qsort (args, argnum, sizeof (char *), attr_strcmp);
32725
32726 ret_str = XNEWVEC (char, str_len_sum);
32727 str_len_sum = 0;
32728 for (i = 0; i < argnum; i++)
32729 {
32730 size_t len = strlen (args[i]);
32731 memcpy (ret_str + str_len_sum, args[i], len);
32732 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32733 str_len_sum += len + 1;
32734 }
32735
32736 XDELETEVEC (args);
32737 XDELETEVEC (attr_str);
32738 return ret_str;
32739 }
32740
32741 /* This function changes the assembler name for functions that are
32742 versions. If DECL is a function version and has a "target"
32743 attribute, it appends the attribute string to its assembler name. */
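
/* For instance (a sketch), a version of foo declared with
   __attribute__ ((target ("avx2"))) has its assembler name rewritten
   from "foo" to "foo.avx2", while the "default" version keeps its
   original assembler name.  */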
32744
32745 static tree
32746 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32747 {
32748 tree version_attr;
32749 const char *orig_name, *version_string;
32750 char *attr_str, *assembler_name;
32751
32752 if (DECL_DECLARED_INLINE_P (decl)
32753 && lookup_attribute ("gnu_inline",
32754 DECL_ATTRIBUTES (decl)))
32755 error_at (DECL_SOURCE_LOCATION (decl),
32756 "Function versions cannot be marked as gnu_inline,"
32757 " bodies have to be generated");
32758
32759 if (DECL_VIRTUAL_P (decl)
32760 || DECL_VINDEX (decl))
32761 sorry ("Virtual function multiversioning not supported");
32762
32763 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32764
32765 /* target attribute string cannot be NULL. */
32766 gcc_assert (version_attr != NULL_TREE);
32767
32768 orig_name = IDENTIFIER_POINTER (id);
32769 version_string
32770 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32771
32772 if (strcmp (version_string, "default") == 0)
32773 return id;
32774
32775 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32776 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32777
32778 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32779
32780 /* Allow assembler name to be modified if already set. */
32781 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32782 SET_DECL_RTL (decl, NULL);
32783
32784 tree ret = get_identifier (assembler_name);
32785 XDELETEVEC (attr_str);
32786 XDELETEVEC (assembler_name);
32787 return ret;
32788 }
32789
32790 /* This function returns true if FN1 and FN2 are versions of the same function,
32791 that is, the target strings of the function decls are different. This assumes
32792 that FN1 and FN2 have the same signature. */
32793
32794 static bool
32795 ix86_function_versions (tree fn1, tree fn2)
32796 {
32797 tree attr1, attr2;
32798 char *target1, *target2;
32799 bool result;
32800
32801 if (TREE_CODE (fn1) != FUNCTION_DECL
32802 || TREE_CODE (fn2) != FUNCTION_DECL)
32803 return false;
32804
32805 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32806 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32807
32808 /* At least one function decl should have the target attribute specified. */
32809 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32810 return false;
32811
32812 /* Diagnose missing target attribute if one of the decls is already
32813 multi-versioned. */
32814 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32815 {
32816 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32817 {
32818 if (attr2 != NULL_TREE)
32819 {
32820 std::swap (fn1, fn2);
32821 attr1 = attr2;
32822 }
32823 error_at (DECL_SOURCE_LOCATION (fn2),
32824 "missing %<target%> attribute for multi-versioned %D",
32825 fn2);
32826 inform (DECL_SOURCE_LOCATION (fn1),
32827 "previous declaration of %D", fn1);
32828 /* Prevent diagnosing of the same error multiple times. */
32829 DECL_ATTRIBUTES (fn2)
32830 = tree_cons (get_identifier ("target"),
32831 copy_node (TREE_VALUE (attr1)),
32832 DECL_ATTRIBUTES (fn2));
32833 }
32834 return false;
32835 }
32836
32837 target1 = sorted_attr_string (TREE_VALUE (attr1));
32838 target2 = sorted_attr_string (TREE_VALUE (attr2));
32839
32840 /* The sorted target strings must be different for fn1 and fn2
32841 to be versions. */
32842 if (strcmp (target1, target2) == 0)
32843 result = false;
32844 else
32845 result = true;
32846
32847 XDELETEVEC (target1);
32848 XDELETEVEC (target2);
32849
32850 return result;
32851 }
32852
32853 static tree
32854 ix86_mangle_decl_assembler_name (tree decl, tree id)
32855 {
32856 /* For function version, add the target suffix to the assembler name. */
32857 if (TREE_CODE (decl) == FUNCTION_DECL
32858 && DECL_FUNCTION_VERSIONED (decl))
32859 id = ix86_mangle_function_version_assembler_name (decl, id);
32860 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32861 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32862 #endif
32863
32864 return id;
32865 }
32866
32867 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32868 is true, append the full path name of the source file. */
32869
32870 static char *
32871 make_name (tree decl, const char *suffix, bool make_unique)
32872 {
32873 char *global_var_name;
32874 int name_len;
32875 const char *name;
32876 const char *unique_name = NULL;
32877
32878 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32879
32880 /* Get a unique name that can be used globally without any chances
32881 of collision at link time. */
32882 if (make_unique)
32883 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32884
32885 name_len = strlen (name) + strlen (suffix) + 2;
32886
32887 if (make_unique)
32888 name_len += strlen (unique_name) + 1;
32889 global_var_name = XNEWVEC (char, name_len);
32890
32891 /* Use '.' to concatenate names as it is demangler friendly. */
32892 if (make_unique)
32893 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32894 suffix);
32895 else
32896 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32897
32898 return global_var_name;
32899 }
32900
32901 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32902
32903 /* Make a dispatcher declaration for the multi-versioned function DECL.
32904 Calls to DECL function will be replaced with calls to the dispatcher
32905 by the front-end. Return the decl created. */
32906
32907 static tree
32908 make_dispatcher_decl (const tree decl)
32909 {
32910 tree func_decl;
32911 char *func_name;
32912 tree fn_type, func_type;
32913 bool is_uniq = false;
32914
32915 if (TREE_PUBLIC (decl) == 0)
32916 is_uniq = true;
32917
32918 func_name = make_name (decl, "ifunc", is_uniq);
32919
32920 fn_type = TREE_TYPE (decl);
32921 func_type = build_function_type (TREE_TYPE (fn_type),
32922 TYPE_ARG_TYPES (fn_type));
32923
32924 func_decl = build_fn_decl (func_name, func_type);
32925 XDELETEVEC (func_name);
32926 TREE_USED (func_decl) = 1;
32927 DECL_CONTEXT (func_decl) = NULL_TREE;
32928 DECL_INITIAL (func_decl) = error_mark_node;
32929 DECL_ARTIFICIAL (func_decl) = 1;
32930 /* Mark this function as external; the resolver will flip it again
32931 if it gets generated. */
32932 DECL_EXTERNAL (func_decl) = 1;
32933 /* IFUNCs have to be externally visible. */
32934 TREE_PUBLIC (func_decl) = 1;
32935
32936 return func_decl;
32937 }
32938
32939 #endif
32940
32941 /* Return true if DECL is multi-versioned and is the default function,
32942 that is, it is not tagged with target-specific optimization. */
32943
32944 static bool
32945 is_function_default_version (const tree decl)
32946 {
32947 if (TREE_CODE (decl) != FUNCTION_DECL
32948 || !DECL_FUNCTION_VERSIONED (decl))
32949 return false;
32950 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32951 gcc_assert (attr);
32952 attr = TREE_VALUE (TREE_VALUE (attr));
32953 return (TREE_CODE (attr) == STRING_CST
32954 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
32955 }
32956
32957 /* Make a dispatcher declaration for the multi-versioned function DECL.
32958 Calls to DECL function will be replaced with calls to the dispatcher
32959 by the front-end. Returns the decl of the dispatcher function. */
32960
32961 static tree
32962 ix86_get_function_versions_dispatcher (void *decl)
32963 {
32964 tree fn = (tree) decl;
32965 struct cgraph_node *node = NULL;
32966 struct cgraph_node *default_node = NULL;
32967 struct cgraph_function_version_info *node_v = NULL;
32968 struct cgraph_function_version_info *first_v = NULL;
32969
32970 tree dispatch_decl = NULL;
32971
32972 struct cgraph_function_version_info *default_version_info = NULL;
32973
32974 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32975
32976 node = cgraph_node::get (fn);
32977 gcc_assert (node != NULL);
32978
32979 node_v = node->function_version ();
32980 gcc_assert (node_v != NULL);
32981
32982 if (node_v->dispatcher_resolver != NULL)
32983 return node_v->dispatcher_resolver;
32984
32985 /* Find the default version and make it the first node. */
32986 first_v = node_v;
32987 /* Go to the beginning of the chain. */
32988 while (first_v->prev != NULL)
32989 first_v = first_v->prev;
32990 default_version_info = first_v;
32991 while (default_version_info != NULL)
32992 {
32993 if (is_function_default_version
32994 (default_version_info->this_node->decl))
32995 break;
32996 default_version_info = default_version_info->next;
32997 }
32998
32999 /* If there is no default node, just return NULL. */
33000 if (default_version_info == NULL)
33001 return NULL;
33002
33003 /* Make default info the first node. */
33004 if (first_v != default_version_info)
33005 {
33006 default_version_info->prev->next = default_version_info->next;
33007 if (default_version_info->next)
33008 default_version_info->next->prev = default_version_info->prev;
33009 first_v->prev = default_version_info;
33010 default_version_info->next = first_v;
33011 default_version_info->prev = NULL;
33012 }
33013
33014 default_node = default_version_info->this_node;
33015
33016 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33017 if (targetm.has_ifunc_p ())
33018 {
33019 struct cgraph_function_version_info *it_v = NULL;
33020 struct cgraph_node *dispatcher_node = NULL;
33021 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33022
33023 /* Right now, the dispatching is done via ifunc. */
33024 dispatch_decl = make_dispatcher_decl (default_node->decl);
33025
33026 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33027 gcc_assert (dispatcher_node != NULL);
33028 dispatcher_node->dispatcher_function = 1;
33029 dispatcher_version_info
33030 = dispatcher_node->insert_new_function_version ();
33031 dispatcher_version_info->next = default_version_info;
33032 dispatcher_node->definition = 1;
33033
33034 /* Set the dispatcher for all the versions. */
33035 it_v = default_version_info;
33036 while (it_v != NULL)
33037 {
33038 it_v->dispatcher_resolver = dispatch_decl;
33039 it_v = it_v->next;
33040 }
33041 }
33042 else
33043 #endif
33044 {
33045 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33046 "multiversioning needs ifunc which is not supported "
33047 "on this target");
33048 }
33049
33050 return dispatch_decl;
33051 }
33052
33053 /* Make the resolver function decl to dispatch the versions of
33054 a multi-versioned function, DEFAULT_DECL. Create an
33055 empty basic block in the resolver and store the pointer in
33056 EMPTY_BB. Return the decl of the resolver function. */
33057
33058 static tree
33059 make_resolver_func (const tree default_decl,
33060 const tree dispatch_decl,
33061 basic_block *empty_bb)
33062 {
33063 char *resolver_name;
33064 tree decl, type, decl_name, t;
33065 bool is_uniq = false;
33066
33067 /* IFUNCs have to be globally visible. So, if the default_decl is
33068 not, then the name of the IFUNC should be made unique. */
33069 if (TREE_PUBLIC (default_decl) == 0)
33070 is_uniq = true;
33071
33072 /* Append the filename to the resolver function if the versions are
33073 not externally visible. This is because the resolver function has
33074 to be externally visible for the loader to find it. So, appending
33075 the filename will prevent conflicts with a resolver function from
33076 another module which is based on the same version name. */
33077 resolver_name = make_name (default_decl, "resolver", is_uniq);
33078
33079 /* The resolver function should return a (void *). */
33080 type = build_function_type_list (ptr_type_node, NULL_TREE);
33081
33082 decl = build_fn_decl (resolver_name, type);
33083 decl_name = get_identifier (resolver_name);
33084 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33085
33086 DECL_NAME (decl) = decl_name;
33087 TREE_USED (decl) = 1;
33088 DECL_ARTIFICIAL (decl) = 1;
33089 DECL_IGNORED_P (decl) = 0;
33090 /* IFUNC resolvers have to be externally visible. */
33091 TREE_PUBLIC (decl) = 1;
33092 DECL_UNINLINABLE (decl) = 1;
33093
33094 /* Resolver is not external, body is generated. */
33095 DECL_EXTERNAL (decl) = 0;
33096 DECL_EXTERNAL (dispatch_decl) = 0;
33097
33098 DECL_CONTEXT (decl) = NULL_TREE;
33099 DECL_INITIAL (decl) = make_node (BLOCK);
33100 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33101
33102 if (DECL_COMDAT_GROUP (default_decl)
33103 || TREE_PUBLIC (default_decl))
33104 {
33105 /* In this case, each translation unit with a call to this
33106 versioned function will put out a resolver. Ensure it
33107 is comdat to keep just one copy. */
33108 DECL_COMDAT (decl) = 1;
33109 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33110 }
33111 /* Build result decl and add to function_decl. */
33112 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33113 DECL_ARTIFICIAL (t) = 1;
33114 DECL_IGNORED_P (t) = 1;
33115 DECL_RESULT (decl) = t;
33116
33117 gimplify_function_tree (decl);
33118 push_cfun (DECL_STRUCT_FUNCTION (decl));
33119 *empty_bb = init_lowered_empty_function (decl, false, 0);
33120
33121 cgraph_node::add_new_function (decl, true);
33122 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33123
33124 pop_cfun ();
33125
33126 gcc_assert (dispatch_decl != NULL);
33127 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33128 DECL_ATTRIBUTES (dispatch_decl)
33129 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33130
33131 /* Create the alias for dispatch to resolver here. */
33132 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33133 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33134 XDELETEVEC (resolver_name);
33135 return decl;
33136 }
33137
33138 /* Generate the dispatching code body to dispatch multi-versioned function
33139 DECL. The target hook is called to process the "target" attributes and
33140 provide the code to dispatch the right function at run-time. NODE points
33141 to the dispatcher decl whose body will be created. */
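
/* A minimal user-level sketch of what this machinery supports (the
   function name and ISA below are only illustrative):

       __attribute__ ((target ("default"))) int foo (void) { return 0; }
       __attribute__ ((target ("avx2")))    int foo (void) { return 2; }

   Calls to foo are routed through an IFUNC; its resolver, built here,
   calls __builtin_cpu_init and returns the address of the most suitable
   version for the host CPU.  */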
33142
33143 static tree
33144 ix86_generate_version_dispatcher_body (void *node_p)
33145 {
33146 tree resolver_decl;
33147 basic_block empty_bb;
33148 tree default_ver_decl;
33149 struct cgraph_node *versn;
33150 struct cgraph_node *node;
33151
33152 struct cgraph_function_version_info *node_version_info = NULL;
33153 struct cgraph_function_version_info *versn_info = NULL;
33154
33155 node = (cgraph_node *)node_p;
33156
33157 node_version_info = node->function_version ();
33158 gcc_assert (node->dispatcher_function
33159 && node_version_info != NULL);
33160
33161 if (node_version_info->dispatcher_resolver)
33162 return node_version_info->dispatcher_resolver;
33163
33164 /* The first version in the chain corresponds to the default version. */
33165 default_ver_decl = node_version_info->next->this_node->decl;
33166
33167 /* node is going to be an alias, so remove the finalized bit. */
33168 node->definition = false;
33169
33170 resolver_decl = make_resolver_func (default_ver_decl,
33171 node->decl, &empty_bb);
33172
33173 node_version_info->dispatcher_resolver = resolver_decl;
33174
33175 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33176
33177 auto_vec<tree, 2> fn_ver_vec;
33178
33179 for (versn_info = node_version_info->next; versn_info;
33180 versn_info = versn_info->next)
33181 {
33182 versn = versn_info->this_node;
33183 /* Check for virtual functions here again, as by this time it should
33184 have been determined if this function needs a vtable index or
33185 not. This happens for methods in derived classes that override
33186 virtual methods in base classes but are not explicitly marked as
33187 virtual. */
33188 if (DECL_VINDEX (versn->decl))
33189 sorry ("Virtual function multiversioning not supported");
33190
33191 fn_ver_vec.safe_push (versn->decl);
33192 }
33193
33194 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
33195 cgraph_edge::rebuild_edges ();
33196 pop_cfun ();
33197 return resolver_decl;
33198 }
33199 /* This builds the processor_model struct type defined in
33200 libgcc/config/i386/cpuinfo.c. */
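
/* A sketch of the corresponding C declaration (see
   libgcc/config/i386/cpuinfo.c for the authoritative definition):

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };  */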
33201
33202 static tree
33203 build_processor_model_struct (void)
33204 {
33205 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
33206 "__cpu_features"};
33207 tree field = NULL_TREE, field_chain = NULL_TREE;
33208 int i;
33209 tree type = make_node (RECORD_TYPE);
33210
33211 /* The first 3 fields are unsigned int. */
33212 for (i = 0; i < 3; ++i)
33213 {
33214 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33215 get_identifier (field_name[i]), unsigned_type_node);
33216 if (field_chain != NULL_TREE)
33217 DECL_CHAIN (field) = field_chain;
33218 field_chain = field;
33219 }
33220
33221 /* The last field is an array of unsigned integers of size one. */
33222 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33223 get_identifier (field_name[3]),
33224 build_array_type (unsigned_type_node,
33225 build_index_type (size_one_node)));
33226 if (field_chain != NULL_TREE)
33227 DECL_CHAIN (field) = field_chain;
33228 field_chain = field;
33229
33230 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33231 return type;
33232 }
33233
33234 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33235
33236 static tree
33237 make_var_decl (tree type, const char *name)
33238 {
33239 tree new_decl;
33240
33241 new_decl = build_decl (UNKNOWN_LOCATION,
33242 VAR_DECL,
33243 get_identifier (name),
33244 type);
33245
33246 DECL_EXTERNAL (new_decl) = 1;
33247 TREE_STATIC (new_decl) = 1;
33248 TREE_PUBLIC (new_decl) = 1;
33249 DECL_INITIAL (new_decl) = 0;
33250 DECL_ARTIFICIAL (new_decl) = 0;
33251 DECL_PRESERVE_P (new_decl) = 1;
33252
33253 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33254 assemble_variable (new_decl, 0, 0, 0);
33255
33256 return new_decl;
33257 }
33258
33259 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33260 into an integer defined in libgcc/config/i386/cpuinfo.c */
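
/* For example (a sketch of the folded form), __builtin_cpu_is ("amd")
   becomes a test of __cpu_model.__cpu_vendor, __builtin_cpu_is
   ("haswell") a test of __cpu_model.__cpu_subtype, and
   __builtin_cpu_supports ("avx2") becomes roughly

       __cpu_model.__cpu_features[0] & (1 << F_AVX2)

   converted back to an integer-typed expression.  */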
33261
33262 static tree
33263 fold_builtin_cpu (tree fndecl, tree *args)
33264 {
33265 unsigned int i;
33266 enum ix86_builtins fn_code = (enum ix86_builtins)
33267 DECL_FUNCTION_CODE (fndecl);
33268 tree param_string_cst = NULL;
33269
33270 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33271 enum processor_features
33272 {
33273 F_CMOV = 0,
33274 F_MMX,
33275 F_POPCNT,
33276 F_SSE,
33277 F_SSE2,
33278 F_SSE3,
33279 F_SSSE3,
33280 F_SSE4_1,
33281 F_SSE4_2,
33282 F_AVX,
33283 F_AVX2,
33284 F_SSE4_A,
33285 F_FMA4,
33286 F_XOP,
33287 F_FMA,
33288 F_AVX512F,
33289 F_BMI,
33290 F_BMI2,
33291 F_AES,
33292 F_PCLMUL,
33293 F_AVX512VL,
33294 F_AVX512BW,
33295 F_AVX512DQ,
33296 F_AVX512CD,
33297 F_AVX512ER,
33298 F_AVX512PF,
33299 F_AVX512VBMI,
33300 F_AVX512IFMA,
33301 F_AVX5124VNNIW,
33302 F_AVX5124FMAPS,
33303 F_MAX
33304 };
33305
33306 /* These are the values for vendor types and cpu types and subtypes
33307 in cpuinfo.c. Cpu types and subtypes should have the corresponding
33308 start value subtracted. */
33309 enum processor_model
33310 {
33311 M_INTEL = 1,
33312 M_AMD,
33313 M_CPU_TYPE_START,
33314 M_INTEL_BONNELL,
33315 M_INTEL_CORE2,
33316 M_INTEL_COREI7,
33317 M_AMDFAM10H,
33318 M_AMDFAM15H,
33319 M_INTEL_SILVERMONT,
33320 M_INTEL_KNL,
33321 M_AMD_BTVER1,
33322 M_AMD_BTVER2,
33323 M_CPU_SUBTYPE_START,
33324 M_INTEL_COREI7_NEHALEM,
33325 M_INTEL_COREI7_WESTMERE,
33326 M_INTEL_COREI7_SANDYBRIDGE,
33327 M_AMDFAM10H_BARCELONA,
33328 M_AMDFAM10H_SHANGHAI,
33329 M_AMDFAM10H_ISTANBUL,
33330 M_AMDFAM15H_BDVER1,
33331 M_AMDFAM15H_BDVER2,
33332 M_AMDFAM15H_BDVER3,
33333 M_AMDFAM15H_BDVER4,
33334 M_AMDFAM17H_ZNVER1,
33335 M_INTEL_COREI7_IVYBRIDGE,
33336 M_INTEL_COREI7_HASWELL,
33337 M_INTEL_COREI7_BROADWELL,
33338 M_INTEL_COREI7_SKYLAKE,
33339 M_INTEL_COREI7_SKYLAKE_AVX512
33340 };
33341
33342 static struct _arch_names_table
33343 {
33344 const char *const name;
33345 const enum processor_model model;
33346 }
33347 const arch_names_table[] =
33348 {
33349 {"amd", M_AMD},
33350 {"intel", M_INTEL},
33351 {"atom", M_INTEL_BONNELL},
33352 {"slm", M_INTEL_SILVERMONT},
33353 {"core2", M_INTEL_CORE2},
33354 {"corei7", M_INTEL_COREI7},
33355 {"nehalem", M_INTEL_COREI7_NEHALEM},
33356 {"westmere", M_INTEL_COREI7_WESTMERE},
33357 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33358 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33359 {"haswell", M_INTEL_COREI7_HASWELL},
33360 {"broadwell", M_INTEL_COREI7_BROADWELL},
33361 {"skylake", M_INTEL_COREI7_SKYLAKE},
33362 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33363 {"bonnell", M_INTEL_BONNELL},
33364 {"silvermont", M_INTEL_SILVERMONT},
33365 {"knl", M_INTEL_KNL},
33366 {"amdfam10h", M_AMDFAM10H},
33367 {"barcelona", M_AMDFAM10H_BARCELONA},
33368 {"shanghai", M_AMDFAM10H_SHANGHAI},
33369 {"istanbul", M_AMDFAM10H_ISTANBUL},
33370 {"btver1", M_AMD_BTVER1},
33371 {"amdfam15h", M_AMDFAM15H},
33372 {"bdver1", M_AMDFAM15H_BDVER1},
33373 {"bdver2", M_AMDFAM15H_BDVER2},
33374 {"bdver3", M_AMDFAM15H_BDVER3},
33375 {"bdver4", M_AMDFAM15H_BDVER4},
33376 {"btver2", M_AMD_BTVER2},
33377 {"znver1", M_AMDFAM17H_ZNVER1},
33378 };
33379
33380 static struct _isa_names_table
33381 {
33382 const char *const name;
33383 const enum processor_features feature;
33384 }
33385 const isa_names_table[] =
33386 {
33387 {"cmov", F_CMOV},
33388 {"mmx", F_MMX},
33389 {"popcnt", F_POPCNT},
33390 {"sse", F_SSE},
33391 {"sse2", F_SSE2},
33392 {"sse3", F_SSE3},
33393 {"ssse3", F_SSSE3},
33394 {"sse4a", F_SSE4_A},
33395 {"sse4.1", F_SSE4_1},
33396 {"sse4.2", F_SSE4_2},
33397 {"avx", F_AVX},
33398 {"fma4", F_FMA4},
33399 {"xop", F_XOP},
33400 {"fma", F_FMA},
33401 {"avx2", F_AVX2},
33402 {"avx512f", F_AVX512F},
33403 {"bmi", F_BMI},
33404 {"bmi2", F_BMI2},
33405 {"aes", F_AES},
33406 {"pclmul", F_PCLMUL},
33407 {"avx512vl",F_AVX512VL},
33408 {"avx512bw",F_AVX512BW},
33409 {"avx512dq",F_AVX512DQ},
33410 {"avx512cd",F_AVX512CD},
33411 {"avx512er",F_AVX512ER},
33412 {"avx512pf",F_AVX512PF},
33413 {"avx512vbmi",F_AVX512VBMI},
33414 {"avx512ifma",F_AVX512IFMA},
33415 {"avx5124vnniw",F_AVX5124VNNIW},
33416 {"avx5124fmaps",F_AVX5124FMAPS},
33417 };
33418
33419 tree __processor_model_type = build_processor_model_struct ();
33420 tree __cpu_model_var = make_var_decl (__processor_model_type,
33421 "__cpu_model");
33422
33423
33424 varpool_node::add (__cpu_model_var);
33425
33426 gcc_assert ((args != NULL) && (*args != NULL));
33427
33428 param_string_cst = *args;
33429 while (param_string_cst
33430 && TREE_CODE (param_string_cst) != STRING_CST)
33431 {
33432 /* *args must be an expr that can contain other EXPRs leading to a
33433 STRING_CST. */
33434 if (!EXPR_P (param_string_cst))
33435 {
33436 error ("Parameter to builtin must be a string constant or literal");
33437 return integer_zero_node;
33438 }
33439 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33440 }
33441
33442 gcc_assert (param_string_cst);
33443
33444 if (fn_code == IX86_BUILTIN_CPU_IS)
33445 {
33446 tree ref;
33447 tree field;
33448 tree final;
33449
33450 unsigned int field_val = 0;
33451 unsigned int NUM_ARCH_NAMES
33452 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33453
33454 for (i = 0; i < NUM_ARCH_NAMES; i++)
33455 if (strcmp (arch_names_table[i].name,
33456 TREE_STRING_POINTER (param_string_cst)) == 0)
33457 break;
33458
33459 if (i == NUM_ARCH_NAMES)
33460 {
33461 error ("Parameter to builtin not valid: %s",
33462 TREE_STRING_POINTER (param_string_cst));
33463 return integer_zero_node;
33464 }
33465
33466 field = TYPE_FIELDS (__processor_model_type);
33467 field_val = arch_names_table[i].model;
33468
33469 /* CPU types are stored in the next field. */
33470 if (field_val > M_CPU_TYPE_START
33471 && field_val < M_CPU_SUBTYPE_START)
33472 {
33473 field = DECL_CHAIN (field);
33474 field_val -= M_CPU_TYPE_START;
33475 }
33476
33477 /* CPU subtypes are stored in the next field. */
33478 if (field_val > M_CPU_SUBTYPE_START)
33479 {
33480 field = DECL_CHAIN (DECL_CHAIN (field));
33481 field_val -= M_CPU_SUBTYPE_START;
33482 }
33483
33484 /* Get the appropriate field in __cpu_model. */
33485 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33486 field, NULL_TREE);
33487
33488 /* Check the value. */
33489 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33490 build_int_cstu (unsigned_type_node, field_val));
33491 return build1 (CONVERT_EXPR, integer_type_node, final);
33492 }
33493 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33494 {
33495 tree ref;
33496 tree array_elt;
33497 tree field;
33498 tree final;
33499
33500 unsigned int field_val = 0;
33501 unsigned int NUM_ISA_NAMES
33502 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33503
33504 for (i = 0; i < NUM_ISA_NAMES; i++)
33505 if (strcmp (isa_names_table[i].name,
33506 TREE_STRING_POINTER (param_string_cst)) == 0)
33507 break;
33508
33509 if (i == NUM_ISA_NAMES)
33510 {
33511 error ("Parameter to builtin not valid: %s",
33512 TREE_STRING_POINTER (param_string_cst));
33513 return integer_zero_node;
33514 }
33515
33516 field = TYPE_FIELDS (__processor_model_type);
33517 /* Get the last field, which is __cpu_features. */
33518 while (DECL_CHAIN (field))
33519 field = DECL_CHAIN (field);
33520
33521 /* Get the appropriate field: __cpu_model.__cpu_features */
33522 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33523 field, NULL_TREE);
33524
33525 /* Access the 0th element of __cpu_features array. */
33526 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33527 integer_zero_node, NULL_TREE, NULL_TREE);
33528
33529 field_val = (1 << isa_names_table[i].feature);
33530 /* Return __cpu_model.__cpu_features[0] & field_val */
33531 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33532 build_int_cstu (unsigned_type_node, field_val));
33533 return build1 (CONVERT_EXPR, integer_type_node, final);
33534 }
33535 gcc_unreachable ();
33536 }
33537
33538 static tree
33539 ix86_fold_builtin (tree fndecl, int n_args,
33540 tree *args, bool ignore ATTRIBUTE_UNUSED)
33541 {
33542 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33543 {
33544 enum ix86_builtins fn_code = (enum ix86_builtins)
33545 DECL_FUNCTION_CODE (fndecl);
33546 switch (fn_code)
33547 {
33548 case IX86_BUILTIN_CPU_IS:
33549 case IX86_BUILTIN_CPU_SUPPORTS:
33550 gcc_assert (n_args == 1);
33551 return fold_builtin_cpu (fndecl, args);
33552
33553 case IX86_BUILTIN_NANQ:
33554 case IX86_BUILTIN_NANSQ:
33555 {
33556 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33557 const char *str = c_getstr (*args);
33558 int quiet = fn_code == IX86_BUILTIN_NANQ;
33559 REAL_VALUE_TYPE real;
33560
33561 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33562 return build_real (type, real);
33563 return NULL_TREE;
33564 }
33565
33566 case IX86_BUILTIN_INFQ:
33567 case IX86_BUILTIN_HUGE_VALQ:
33568 {
33569 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33570 REAL_VALUE_TYPE inf;
33571 real_inf (&inf);
33572 return build_real (type, inf);
33573 }
33574
33575 case IX86_BUILTIN_TZCNT16:
33576 case IX86_BUILTIN_CTZS:
33577 case IX86_BUILTIN_TZCNT32:
33578 case IX86_BUILTIN_TZCNT64:
33579 gcc_assert (n_args == 1);
33580 if (TREE_CODE (args[0]) == INTEGER_CST)
33581 {
33582 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33583 tree arg = args[0];
33584 if (fn_code == IX86_BUILTIN_TZCNT16
33585 || fn_code == IX86_BUILTIN_CTZS)
33586 arg = fold_convert (short_unsigned_type_node, arg);
33587 if (integer_zerop (arg))
33588 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33589 else
33590 return fold_const_call (CFN_CTZ, type, arg);
33591 }
33592 break;
33593
33594 case IX86_BUILTIN_LZCNT16:
33595 case IX86_BUILTIN_CLZS:
33596 case IX86_BUILTIN_LZCNT32:
33597 case IX86_BUILTIN_LZCNT64:
33598 gcc_assert (n_args == 1);
33599 if (TREE_CODE (args[0]) == INTEGER_CST)
33600 {
33601 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33602 tree arg = args[0];
33603 if (fn_code == IX86_BUILTIN_LZCNT16
33604 || fn_code == IX86_BUILTIN_CLZS)
33605 arg = fold_convert (short_unsigned_type_node, arg);
33606 if (integer_zerop (arg))
33607 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33608 else
33609 return fold_const_call (CFN_CLZ, type, arg);
33610 }
33611 break;
33612
33613 case IX86_BUILTIN_BEXTR32:
33614 case IX86_BUILTIN_BEXTR64:
33615 case IX86_BUILTIN_BEXTRI32:
33616 case IX86_BUILTIN_BEXTRI64:
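/* BEXTR extracts LEN bits of the first operand starting at bit START,
   where START is the low byte and LEN the next byte of the second
   operand; an out-of-range START or a zero LEN yields zero.  */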
33617 gcc_assert (n_args == 2);
33618 if (tree_fits_uhwi_p (args[1]))
33619 {
33620 unsigned HOST_WIDE_INT res = 0;
33621 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33622 unsigned int start = tree_to_uhwi (args[1]);
33623 unsigned int len = (start & 0xff00) >> 8;
33624 start &= 0xff;
33625 if (start >= prec || len == 0)
33626 res = 0;
33627 else if (!tree_fits_uhwi_p (args[0]))
33628 break;
33629 else
33630 res = tree_to_uhwi (args[0]) >> start;
33631 if (len > prec)
33632 len = prec;
33633 if (len < HOST_BITS_PER_WIDE_INT)
33634 res &= (HOST_WIDE_INT_1U << len) - 1;
33635 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33636 }
33637 break;
33638
33639 case IX86_BUILTIN_BZHI32:
33640 case IX86_BUILTIN_BZHI64:
33641 gcc_assert (n_args == 2);
33642 if (tree_fits_uhwi_p (args[1]))
33643 {
33644 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33645 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33646 return args[0];
33647 if (!tree_fits_uhwi_p (args[0]))
33648 break;
33649 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33650 res &= ~(HOST_WIDE_INT_M1U << idx);
33651 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33652 }
33653 break;
33654
33655 case IX86_BUILTIN_PDEP32:
33656 case IX86_BUILTIN_PDEP64:
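/* PDEP deposits successive low bits of the first operand into the bit
   positions where the mask (second operand) is set, leaving all other
   result bits zero.  */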
33657 gcc_assert (n_args == 2);
33658 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33659 {
33660 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33661 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33662 unsigned HOST_WIDE_INT res = 0;
33663 unsigned HOST_WIDE_INT m, k = 1;
33664 for (m = 1; m; m <<= 1)
33665 if ((mask & m) != 0)
33666 {
33667 if ((src & k) != 0)
33668 res |= m;
33669 k <<= 1;
33670 }
33671 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33672 }
33673 break;
33674
33675 case IX86_BUILTIN_PEXT32:
33676 case IX86_BUILTIN_PEXT64:
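/* PEXT extracts the bits of the first operand selected by the mask
   (second operand) and packs them into the low bits of the result.  */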
33677 gcc_assert (n_args == 2);
33678 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33679 {
33680 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33681 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33682 unsigned HOST_WIDE_INT res = 0;
33683 unsigned HOST_WIDE_INT m, k = 1;
33684 for (m = 1; m; m <<= 1)
33685 if ((mask & m) != 0)
33686 {
33687 if ((src & m) != 0)
33688 res |= k;
33689 k <<= 1;
33690 }
33691 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33692 }
33693 break;
33694
33695 default:
33696 break;
33697 }
33698 }
33699
33700 #ifdef SUBTARGET_FOLD_BUILTIN
33701 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33702 #endif
33703
33704 return NULL_TREE;
33705 }
33706
33707 /* Fold an MD builtin in GIMPLE (ix86_fold_builtin handles folding
33708 into a constant). */
33709
33710 bool
33711 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33712 {
33713 gimple *stmt = gsi_stmt (*gsi);
33714 tree fndecl = gimple_call_fndecl (stmt);
33715 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33716 int n_args = gimple_call_num_args (stmt);
33717 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33718 tree decl = NULL_TREE;
33719 tree arg0, arg1;
33720
33721 switch (fn_code)
33722 {
33723 case IX86_BUILTIN_TZCNT32:
33724 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33725 goto fold_tzcnt_lzcnt;
33726
33727 case IX86_BUILTIN_TZCNT64:
33728 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33729 goto fold_tzcnt_lzcnt;
33730
33731 case IX86_BUILTIN_LZCNT32:
33732 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33733 goto fold_tzcnt_lzcnt;
33734
33735 case IX86_BUILTIN_LZCNT64:
33736 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33737 goto fold_tzcnt_lzcnt;
33738
33739 fold_tzcnt_lzcnt:
33740 gcc_assert (n_args == 1);
33741 arg0 = gimple_call_arg (stmt, 0);
33742 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33743 {
33744 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33745 /* If arg0 is provably non-zero, optimize into the generic
33746 __builtin_c[tl]z{,ll} functions, which the middle-end handles
33747 better; unlike tzcnt/lzcnt, they are undefined for a zero argument. */
33748 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33749 return false;
33750
33751 location_t loc = gimple_location (stmt);
33752 gimple *g = gimple_build_call (decl, 1, arg0);
33753 gimple_set_location (g, loc);
33754 tree lhs = make_ssa_name (integer_type_node);
33755 gimple_call_set_lhs (g, lhs);
33756 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33757 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33758 gimple_set_location (g, loc);
33759 gsi_replace (gsi, g, false);
33760 return true;
33761 }
33762 break;
33763
33764 case IX86_BUILTIN_BZHI32:
33765 case IX86_BUILTIN_BZHI64:
33766 gcc_assert (n_args == 2);
33767 arg1 = gimple_call_arg (stmt, 1);
33768 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33769 {
33770 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33771 arg0 = gimple_call_arg (stmt, 0);
33772 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33773 break;
33774 location_t loc = gimple_location (stmt);
33775 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33776 gimple_set_location (g, loc);
33777 gsi_replace (gsi, g, false);
33778 return true;
33779 }
33780 break;
33781
33782 case IX86_BUILTIN_PDEP32:
33783 case IX86_BUILTIN_PDEP64:
33784 case IX86_BUILTIN_PEXT32:
33785 case IX86_BUILTIN_PEXT64:
33786 gcc_assert (n_args == 2);
33787 arg1 = gimple_call_arg (stmt, 1);
33788 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33789 {
33790 location_t loc = gimple_location (stmt);
33791 arg0 = gimple_call_arg (stmt, 0);
33792 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33793 gimple_set_location (g, loc);
33794 gsi_replace (gsi, g, false);
33795 return true;
33796 }
33797 break;
33798
33799 default:
33800 break;
33801 }
33802
33803 return false;
33804 }
33805
33806 /* Make builtins to detect cpu type and features supported. NAME is
33807 the builtin name, CODE is the builtin code, FTYPE is the function type
33808 of the builtin, and IS_CONST says whether to mark the decl TREE_READONLY. */
33809
33810 static void
33811 make_cpu_type_builtin (const char* name, int code,
33812 enum ix86_builtin_func_type ftype, bool is_const)
33813 {
33814 tree decl;
33815 tree type;
33816
33817 type = ix86_get_builtin_func_type (ftype);
33818 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33819 NULL, NULL_TREE);
33820 gcc_assert (decl != NULL_TREE);
33821 ix86_builtins[(int) code] = decl;
33822 TREE_READONLY (decl) = is_const;
33823 }
33824
33825 /* Make builtins to get CPU type and features supported. The created
33826 builtins are:
33827
33828 __builtin_cpu_init (), to detect cpu type and features,
33829 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33830 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33831 */
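
/* For illustration, user code typically uses them as, e.g.:

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       use_avx2_path ();

   where use_avx2_path stands in for the caller's own AVX2 code path.  */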
33832
33833 static void
33834 ix86_init_platform_type_builtins (void)
33835 {
33836 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33837 INT_FTYPE_VOID, false);
33838 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33839 INT_FTYPE_PCCHAR, true);
33840 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33841 INT_FTYPE_PCCHAR, true);
33842 }
33843
33844 /* Internal helper for ix86_init_builtins. */
33845
33846 static void
33847 ix86_init_builtins_va_builtins_abi (void)
33848 {
33849 tree ms_va_ref, sysv_va_ref;
33850 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33851 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33852 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33853 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33854
33855 if (!TARGET_64BIT)
33856 return;
33857 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33858 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33859 ms_va_ref = build_reference_type (ms_va_list_type_node);
33860 sysv_va_ref =
33861 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33862
33863 fnvoid_va_end_ms =
33864 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33865 fnvoid_va_start_ms =
33866 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33867 fnvoid_va_end_sysv =
33868 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33869 fnvoid_va_start_sysv =
33870 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33871 NULL_TREE);
33872 fnvoid_va_copy_ms =
33873 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33874 NULL_TREE);
33875 fnvoid_va_copy_sysv =
33876 build_function_type_list (void_type_node, sysv_va_ref,
33877 sysv_va_ref, NULL_TREE);
33878
33879 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33880 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33881 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33882 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33883 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33884 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33885 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33886 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33887 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33888 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33889 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33890 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33891 }
33892
33893 static void
33894 ix86_init_builtin_types (void)
33895 {
33896 tree float80_type_node, const_string_type_node;
33897
33898 /* The __float80 type. */
33899 float80_type_node = long_double_type_node;
33900 if (TYPE_MODE (float80_type_node) != XFmode)
33901 {
33902 if (float64x_type_node != NULL_TREE
33903 && TYPE_MODE (float64x_type_node) == XFmode)
33904 float80_type_node = float64x_type_node;
33905 else
33906 {
33907 /* The __float80 type. */
33908 float80_type_node = make_node (REAL_TYPE);
33909
33910 TYPE_PRECISION (float80_type_node) = 80;
33911 layout_type (float80_type_node);
33912 }
33913 }
33914 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33915
33916 /* The __float128 type. The node has already been created as
33917 _Float128, so we only need to register the __float128 name for
33918 it. */
33919 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33920
33921 const_string_type_node
33922 = build_pointer_type (build_qualified_type
33923 (char_type_node, TYPE_QUAL_CONST));
33924
33925 /* This macro is built by i386-builtin-types.awk. */
33926 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33927 }
33928
33929 static void
33930 ix86_init_builtins (void)
33931 {
33932 tree ftype, decl;
33933
33934 ix86_init_builtin_types ();
33935
33936 /* Builtins to get CPU type and features. */
33937 ix86_init_platform_type_builtins ();
33938
33939 /* TFmode support builtins. */
33940 def_builtin_const (0, "__builtin_infq",
33941 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33942 def_builtin_const (0, "__builtin_huge_valq",
33943 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33944
33945 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33946 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33947 BUILT_IN_MD, "nanq", NULL_TREE);
33948 TREE_READONLY (decl) = 1;
33949 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33950
33951 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33952 BUILT_IN_MD, "nansq", NULL_TREE);
33953 TREE_READONLY (decl) = 1;
33954 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33955
33956 /* We will expand them into normal calls if SSE isn't available,
33957 since they are used by libgcc. */
33958 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33959 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33960 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33961 TREE_READONLY (decl) = 1;
33962 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33963
33964 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33965 decl = add_builtin_function ("__builtin_copysignq", ftype,
33966 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33967 "__copysigntf3", NULL_TREE);
33968 TREE_READONLY (decl) = 1;
33969 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33970
33971 ix86_init_tm_builtins ();
33972 ix86_init_mmx_sse_builtins ();
33973 ix86_init_mpx_builtins ();
33974
33975 if (TARGET_LP64)
33976 ix86_init_builtins_va_builtins_abi ();
33977
33978 #ifdef SUBTARGET_INIT_BUILTINS
33979 SUBTARGET_INIT_BUILTINS;
33980 #endif
33981 }
33982
33983 /* Return the ix86 builtin for CODE. */
33984
33985 static tree
33986 ix86_builtin_decl (unsigned code, bool)
33987 {
33988 if (code >= IX86_BUILTIN_MAX)
33989 return error_mark_node;
33990
33991 return ix86_builtins[code];
33992 }
33993
33994 /* Errors in the source file can cause expand_expr to return const0_rtx
33995 where we expect a vector. To avoid crashing, use one of the vector
33996 clear instructions. */
33997 static rtx
33998 safe_vector_operand (rtx x, machine_mode mode)
33999 {
34000 if (x == const0_rtx)
34001 x = CONST0_RTX (mode);
34002 return x;
34003 }
34004
34005 /* Fix up modeless constants to fit the required mode. */
34006 static rtx
34007 fixup_modeless_constant (rtx x, machine_mode mode)
34008 {
34009 if (GET_MODE (x) == VOIDmode)
34010 x = convert_to_mode (mode, x, 1);
34011 return x;
34012 }
34013
34014 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34015
34016 static rtx
34017 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34018 {
34019 rtx pat;
34020 tree arg0 = CALL_EXPR_ARG (exp, 0);
34021 tree arg1 = CALL_EXPR_ARG (exp, 1);
34022 rtx op0 = expand_normal (arg0);
34023 rtx op1 = expand_normal (arg1);
34024 machine_mode tmode = insn_data[icode].operand[0].mode;
34025 machine_mode mode0 = insn_data[icode].operand[1].mode;
34026 machine_mode mode1 = insn_data[icode].operand[2].mode;
34027
34028 if (VECTOR_MODE_P (mode0))
34029 op0 = safe_vector_operand (op0, mode0);
34030 if (VECTOR_MODE_P (mode1))
34031 op1 = safe_vector_operand (op1, mode1);
34032
34033 if (optimize || !target
34034 || GET_MODE (target) != tmode
34035 || !insn_data[icode].operand[0].predicate (target, tmode))
34036 target = gen_reg_rtx (tmode);
34037
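  /* If the operand is a plain SImode value but the insn pattern wants a
     TImode operand, load it into the low element of a V4SI register and
     view that register as TImode.  */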
34038 if (GET_MODE (op1) == SImode && mode1 == TImode)
34039 {
34040 rtx x = gen_reg_rtx (V4SImode);
34041 emit_insn (gen_sse2_loadd (x, op1));
34042 op1 = gen_lowpart (TImode, x);
34043 }
34044
34045 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34046 op0 = copy_to_mode_reg (mode0, op0);
34047 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34048 op1 = copy_to_mode_reg (mode1, op1);
34049
34050 pat = GEN_FCN (icode) (target, op0, op1);
34051 if (! pat)
34052 return 0;
34053
34054 emit_insn (pat);
34055
34056 return target;
34057 }
34058
34059 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34060
34061 static rtx
34062 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34063 enum ix86_builtin_func_type m_type,
34064 enum rtx_code sub_code)
34065 {
34066 rtx pat;
34067 int i;
34068 int nargs;
34069 bool comparison_p = false;
34070 bool tf_p = false;
34071 bool last_arg_constant = false;
34072 int num_memory = 0;
34073 struct {
34074 rtx op;
34075 machine_mode mode;
34076 } args[4];
34077
34078 machine_mode tmode = insn_data[icode].operand[0].mode;
34079
34080 switch (m_type)
34081 {
34082 case MULTI_ARG_4_DF2_DI_I:
34083 case MULTI_ARG_4_DF2_DI_I1:
34084 case MULTI_ARG_4_SF2_SI_I:
34085 case MULTI_ARG_4_SF2_SI_I1:
34086 nargs = 4;
34087 last_arg_constant = true;
34088 break;
34089
34090 case MULTI_ARG_3_SF:
34091 case MULTI_ARG_3_DF:
34092 case MULTI_ARG_3_SF2:
34093 case MULTI_ARG_3_DF2:
34094 case MULTI_ARG_3_DI:
34095 case MULTI_ARG_3_SI:
34096 case MULTI_ARG_3_SI_DI:
34097 case MULTI_ARG_3_HI:
34098 case MULTI_ARG_3_HI_SI:
34099 case MULTI_ARG_3_QI:
34100 case MULTI_ARG_3_DI2:
34101 case MULTI_ARG_3_SI2:
34102 case MULTI_ARG_3_HI2:
34103 case MULTI_ARG_3_QI2:
34104 nargs = 3;
34105 break;
34106
34107 case MULTI_ARG_2_SF:
34108 case MULTI_ARG_2_DF:
34109 case MULTI_ARG_2_DI:
34110 case MULTI_ARG_2_SI:
34111 case MULTI_ARG_2_HI:
34112 case MULTI_ARG_2_QI:
34113 nargs = 2;
34114 break;
34115
34116 case MULTI_ARG_2_DI_IMM:
34117 case MULTI_ARG_2_SI_IMM:
34118 case MULTI_ARG_2_HI_IMM:
34119 case MULTI_ARG_2_QI_IMM:
34120 nargs = 2;
34121 last_arg_constant = true;
34122 break;
34123
34124 case MULTI_ARG_1_SF:
34125 case MULTI_ARG_1_DF:
34126 case MULTI_ARG_1_SF2:
34127 case MULTI_ARG_1_DF2:
34128 case MULTI_ARG_1_DI:
34129 case MULTI_ARG_1_SI:
34130 case MULTI_ARG_1_HI:
34131 case MULTI_ARG_1_QI:
34132 case MULTI_ARG_1_SI_DI:
34133 case MULTI_ARG_1_HI_DI:
34134 case MULTI_ARG_1_HI_SI:
34135 case MULTI_ARG_1_QI_DI:
34136 case MULTI_ARG_1_QI_SI:
34137 case MULTI_ARG_1_QI_HI:
34138 nargs = 1;
34139 break;
34140
34141 case MULTI_ARG_2_DI_CMP:
34142 case MULTI_ARG_2_SI_CMP:
34143 case MULTI_ARG_2_HI_CMP:
34144 case MULTI_ARG_2_QI_CMP:
34145 nargs = 2;
34146 comparison_p = true;
34147 break;
34148
34149 case MULTI_ARG_2_SF_TF:
34150 case MULTI_ARG_2_DF_TF:
34151 case MULTI_ARG_2_DI_TF:
34152 case MULTI_ARG_2_SI_TF:
34153 case MULTI_ARG_2_HI_TF:
34154 case MULTI_ARG_2_QI_TF:
34155 nargs = 2;
34156 tf_p = true;
34157 break;
34158
34159 default:
34160 gcc_unreachable ();
34161 }
34162
34163 if (optimize || !target
34164 || GET_MODE (target) != tmode
34165 || !insn_data[icode].operand[0].predicate (target, tmode))
34166 target = gen_reg_rtx (tmode);
34167
34168 gcc_assert (nargs <= 4);
34169
34170 for (i = 0; i < nargs; i++)
34171 {
34172 tree arg = CALL_EXPR_ARG (exp, i);
34173 rtx op = expand_normal (arg);
34174 int adjust = (comparison_p) ? 1 : 0;
34175 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34176
34177 if (last_arg_constant && i == nargs - 1)
34178 {
34179 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34180 {
34181 enum insn_code new_icode = icode;
34182 switch (icode)
34183 {
34184 case CODE_FOR_xop_vpermil2v2df3:
34185 case CODE_FOR_xop_vpermil2v4sf3:
34186 case CODE_FOR_xop_vpermil2v4df3:
34187 case CODE_FOR_xop_vpermil2v8sf3:
34188 error ("the last argument must be a 2-bit immediate");
34189 return gen_reg_rtx (tmode);
34190 case CODE_FOR_xop_rotlv2di3:
34191 new_icode = CODE_FOR_rotlv2di3;
34192 goto xop_rotl;
34193 case CODE_FOR_xop_rotlv4si3:
34194 new_icode = CODE_FOR_rotlv4si3;
34195 goto xop_rotl;
34196 case CODE_FOR_xop_rotlv8hi3:
34197 new_icode = CODE_FOR_rotlv8hi3;
34198 goto xop_rotl;
34199 case CODE_FOR_xop_rotlv16qi3:
34200 new_icode = CODE_FOR_rotlv16qi3;
34201 xop_rotl:
34202 if (CONST_INT_P (op))
34203 {
34204 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34205 op = GEN_INT (INTVAL (op) & mask);
34206 gcc_checking_assert
34207 (insn_data[icode].operand[i + 1].predicate (op, mode));
34208 }
34209 else
34210 {
34211 gcc_checking_assert
34212 (nargs == 2
34213 && insn_data[new_icode].operand[0].mode == tmode
34214 && insn_data[new_icode].operand[1].mode == tmode
34215 && insn_data[new_icode].operand[2].mode == mode
34216 && insn_data[new_icode].operand[0].predicate
34217 == insn_data[icode].operand[0].predicate
34218 && insn_data[new_icode].operand[1].predicate
34219 == insn_data[icode].operand[1].predicate);
34220 icode = new_icode;
34221 goto non_constant;
34222 }
34223 break;
34224 default:
34225 gcc_unreachable ();
34226 }
34227 }
34228 }
34229 else
34230 {
34231 non_constant:
34232 if (VECTOR_MODE_P (mode))
34233 op = safe_vector_operand (op, mode);
34234
34235 /* If we aren't optimizing, only allow one memory operand to be
34236 generated. */
34237 if (memory_operand (op, mode))
34238 num_memory++;
34239
34240 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34241
34242 if (optimize
34243 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34244 || num_memory > 1)
34245 op = force_reg (mode, op);
34246 }
34247
34248 args[i].op = op;
34249 args[i].mode = mode;
34250 }
34251
34252 switch (nargs)
34253 {
34254 case 1:
34255 pat = GEN_FCN (icode) (target, args[0].op);
34256 break;
34257
34258 case 2:
34259 if (tf_p)
34260 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34261 GEN_INT ((int)sub_code));
34262 else if (! comparison_p)
34263 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34264 else
34265 {
34266 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34267 args[0].op,
34268 args[1].op);
34269
34270 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34271 }
34272 break;
34273
34274 case 3:
34275 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34276 break;
34277
34278 case 4:
34279 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34280 break;
34281
34282 default:
34283 gcc_unreachable ();
34284 }
34285
34286 if (! pat)
34287 return 0;
34288
34289 emit_insn (pat);
34290 return target;
34291 }
34292
34293 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34294 insns with vec_merge. */
34295
34296 static rtx
34297 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34298 rtx target)
34299 {
34300 rtx pat;
34301 tree arg0 = CALL_EXPR_ARG (exp, 0);
34302 rtx op1, op0 = expand_normal (arg0);
34303 machine_mode tmode = insn_data[icode].operand[0].mode;
34304 machine_mode mode0 = insn_data[icode].operand[1].mode;
34305
34306 if (optimize || !target
34307 || GET_MODE (target) != tmode
34308 || !insn_data[icode].operand[0].predicate (target, tmode))
34309 target = gen_reg_rtx (tmode);
34310
34311 if (VECTOR_MODE_P (mode0))
34312 op0 = safe_vector_operand (op0, mode0);
34313
34314 if ((optimize && !register_operand (op0, mode0))
34315 || !insn_data[icode].operand[1].predicate (op0, mode0))
34316 op0 = copy_to_mode_reg (mode0, op0);
34317
34318 op1 = op0;
34319 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34320 op1 = copy_to_mode_reg (mode0, op1);
34321
34322 pat = GEN_FCN (icode) (target, op0, op1);
34323 if (! pat)
34324 return 0;
34325 emit_insn (pat);
34326 return target;
34327 }
34328
34329 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34330
34331 static rtx
34332 ix86_expand_sse_compare (const struct builtin_description *d,
34333 tree exp, rtx target, bool swap)
34334 {
34335 rtx pat;
34336 tree arg0 = CALL_EXPR_ARG (exp, 0);
34337 tree arg1 = CALL_EXPR_ARG (exp, 1);
34338 rtx op0 = expand_normal (arg0);
34339 rtx op1 = expand_normal (arg1);
34340 rtx op2;
34341 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34342 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34343 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34344 enum rtx_code comparison = d->comparison;
34345
34346 if (VECTOR_MODE_P (mode0))
34347 op0 = safe_vector_operand (op0, mode0);
34348 if (VECTOR_MODE_P (mode1))
34349 op1 = safe_vector_operand (op1, mode1);
34350
34351 /* Swap operands if we have a comparison that isn't available in
34352 hardware. */
34353 if (swap)
34354 std::swap (op0, op1);
34355
34356 if (optimize || !target
34357 || GET_MODE (target) != tmode
34358 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34359 target = gen_reg_rtx (tmode);
34360
34361 if ((optimize && !register_operand (op0, mode0))
34362 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34363 op0 = copy_to_mode_reg (mode0, op0);
34364 if ((optimize && !register_operand (op1, mode1))
34365 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34366 op1 = copy_to_mode_reg (mode1, op1);
34367
34368 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34369 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34370 if (! pat)
34371 return 0;
34372 emit_insn (pat);
34373 return target;
34374 }
34375
34376 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34377
34378 static rtx
34379 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34380 rtx target)
34381 {
34382 rtx pat;
34383 tree arg0 = CALL_EXPR_ARG (exp, 0);
34384 tree arg1 = CALL_EXPR_ARG (exp, 1);
34385 rtx op0 = expand_normal (arg0);
34386 rtx op1 = expand_normal (arg1);
34387 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34388 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34389 enum rtx_code comparison = d->comparison;
34390
34391 if (VECTOR_MODE_P (mode0))
34392 op0 = safe_vector_operand (op0, mode0);
34393 if (VECTOR_MODE_P (mode1))
34394 op1 = safe_vector_operand (op1, mode1);
34395
34396 /* Swap operands if we have a comparison that isn't available in
34397 hardware. */
34398 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34399 std::swap (op0, op1);
34400
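  /* The result is built as the low QImode part of a zeroed SImode
     register: the flags comparison below writes only that byte, so
     returning SUBREG_REG yields a zero-extended 0/1 value.  */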
34401 target = gen_reg_rtx (SImode);
34402 emit_move_insn (target, const0_rtx);
34403 target = gen_rtx_SUBREG (QImode, target, 0);
34404
34405 if ((optimize && !register_operand (op0, mode0))
34406 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34407 op0 = copy_to_mode_reg (mode0, op0);
34408 if ((optimize && !register_operand (op1, mode1))
34409 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34410 op1 = copy_to_mode_reg (mode1, op1);
34411
34412 pat = GEN_FCN (d->icode) (op0, op1);
34413 if (! pat)
34414 return 0;
34415 emit_insn (pat);
34416 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34417 gen_rtx_fmt_ee (comparison, QImode,
34418 SET_DEST (pat),
34419 const0_rtx)));
34420
34421 return SUBREG_REG (target);
34422 }
34423
34424 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34425
34426 static rtx
34427 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34428 rtx target)
34429 {
34430 rtx pat;
34431 tree arg0 = CALL_EXPR_ARG (exp, 0);
34432 rtx op1, op0 = expand_normal (arg0);
34433 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34434 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34435
34436 if (optimize || target == 0
34437 || GET_MODE (target) != tmode
34438 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34439 target = gen_reg_rtx (tmode);
34440
34441 if (VECTOR_MODE_P (mode0))
34442 op0 = safe_vector_operand (op0, mode0);
34443
34444 if ((optimize && !register_operand (op0, mode0))
34445 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34446 op0 = copy_to_mode_reg (mode0, op0);
34447
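  /* For the round builtins the builtin_description's comparison field
     carries the rounding-mode immediate rather than an rtx_code.  */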
34448 op1 = GEN_INT (d->comparison);
34449
34450 pat = GEN_FCN (d->icode) (target, op0, op1);
34451 if (! pat)
34452 return 0;
34453 emit_insn (pat);
34454 return target;
34455 }
34456
34457 static rtx
34458 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34459 tree exp, rtx target)
34460 {
34461 rtx pat;
34462 tree arg0 = CALL_EXPR_ARG (exp, 0);
34463 tree arg1 = CALL_EXPR_ARG (exp, 1);
34464 rtx op0 = expand_normal (arg0);
34465 rtx op1 = expand_normal (arg1);
34466 rtx op2;
34467 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34468 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34469 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34470
34471 if (optimize || target == 0
34472 || GET_MODE (target) != tmode
34473 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34474 target = gen_reg_rtx (tmode);
34475
34476 op0 = safe_vector_operand (op0, mode0);
34477 op1 = safe_vector_operand (op1, mode1);
34478
34479 if ((optimize && !register_operand (op0, mode0))
34480 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34481 op0 = copy_to_mode_reg (mode0, op0);
34482 if ((optimize && !register_operand (op1, mode1))
34483 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34484 op1 = copy_to_mode_reg (mode1, op1);
34485
34486 op2 = GEN_INT (d->comparison);
34487
34488 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34489 if (! pat)
34490 return 0;
34491 emit_insn (pat);
34492 return target;
34493 }
34494
34495 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34496
34497 static rtx
34498 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34499 rtx target)
34500 {
34501 rtx pat;
34502 tree arg0 = CALL_EXPR_ARG (exp, 0);
34503 tree arg1 = CALL_EXPR_ARG (exp, 1);
34504 rtx op0 = expand_normal (arg0);
34505 rtx op1 = expand_normal (arg1);
34506 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34507 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34508 enum rtx_code comparison = d->comparison;
34509
34510 if (VECTOR_MODE_P (mode0))
34511 op0 = safe_vector_operand (op0, mode0);
34512 if (VECTOR_MODE_P (mode1))
34513 op1 = safe_vector_operand (op1, mode1);
34514
34515 target = gen_reg_rtx (SImode);
34516 emit_move_insn (target, const0_rtx);
34517 target = gen_rtx_SUBREG (QImode, target, 0);
34518
34519 if ((optimize && !register_operand (op0, mode0))
34520 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34521 op0 = copy_to_mode_reg (mode0, op0);
34522 if ((optimize && !register_operand (op1, mode1))
34523 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34524 op1 = copy_to_mode_reg (mode1, op1);
34525
34526 pat = GEN_FCN (d->icode) (op0, op1);
34527 if (! pat)
34528 return 0;
34529 emit_insn (pat);
34530 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34531 gen_rtx_fmt_ee (comparison, QImode,
34532 SET_DEST (pat),
34533 const0_rtx)));
34534
34535 return SUBREG_REG (target);
34536 }
34537
34538 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34539
34540 static rtx
34541 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34542 tree exp, rtx target)
34543 {
34544 rtx pat;
34545 tree arg0 = CALL_EXPR_ARG (exp, 0);
34546 tree arg1 = CALL_EXPR_ARG (exp, 1);
34547 tree arg2 = CALL_EXPR_ARG (exp, 2);
34548 tree arg3 = CALL_EXPR_ARG (exp, 3);
34549 tree arg4 = CALL_EXPR_ARG (exp, 4);
34550 rtx scratch0, scratch1;
34551 rtx op0 = expand_normal (arg0);
34552 rtx op1 = expand_normal (arg1);
34553 rtx op2 = expand_normal (arg2);
34554 rtx op3 = expand_normal (arg3);
34555 rtx op4 = expand_normal (arg4);
34556 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34557
34558 tmode0 = insn_data[d->icode].operand[0].mode;
34559 tmode1 = insn_data[d->icode].operand[1].mode;
34560 modev2 = insn_data[d->icode].operand[2].mode;
34561 modei3 = insn_data[d->icode].operand[3].mode;
34562 modev4 = insn_data[d->icode].operand[4].mode;
34563 modei5 = insn_data[d->icode].operand[5].mode;
34564 modeimm = insn_data[d->icode].operand[6].mode;
34565
34566 if (VECTOR_MODE_P (modev2))
34567 op0 = safe_vector_operand (op0, modev2);
34568 if (VECTOR_MODE_P (modev4))
34569 op2 = safe_vector_operand (op2, modev4);
34570
34571 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34572 op0 = copy_to_mode_reg (modev2, op0);
34573 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34574 op1 = copy_to_mode_reg (modei3, op1);
34575 if ((optimize && !register_operand (op2, modev4))
34576 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34577 op2 = copy_to_mode_reg (modev4, op2);
34578 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34579 op3 = copy_to_mode_reg (modei5, op3);
34580
34581 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34582 {
34583 error ("the fifth argument must be an 8-bit immediate");
34584 return const0_rtx;
34585 }
34586
34587 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34588 {
34589 if (optimize || !target
34590 || GET_MODE (target) != tmode0
34591 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34592 target = gen_reg_rtx (tmode0);
34593
34594 scratch1 = gen_reg_rtx (tmode1);
34595
34596 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34597 }
34598 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34599 {
34600 if (optimize || !target
34601 || GET_MODE (target) != tmode1
34602 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34603 target = gen_reg_rtx (tmode1);
34604
34605 scratch0 = gen_reg_rtx (tmode0);
34606
34607 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34608 }
34609 else
34610 {
34611 gcc_assert (d->flag);
34612
34613 scratch0 = gen_reg_rtx (tmode0);
34614 scratch1 = gen_reg_rtx (tmode1);
34615
34616 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34617 }
34618
34619 if (! pat)
34620 return 0;
34621
34622 emit_insn (pat);
34623
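  /* A nonzero d->flag holds the machine mode of the flags register to
     test; the EQ comparison against zero below turns that flag into the
     builtin's int result.  */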
34624 if (d->flag)
34625 {
34626 target = gen_reg_rtx (SImode);
34627 emit_move_insn (target, const0_rtx);
34628 target = gen_rtx_SUBREG (QImode, target, 0);
34629
34630 emit_insn
34631 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34632 gen_rtx_fmt_ee (EQ, QImode,
34633 gen_rtx_REG ((machine_mode) d->flag,
34634 FLAGS_REG),
34635 const0_rtx)));
34636 return SUBREG_REG (target);
34637 }
34638 else
34639 return target;
34640 }
34641
34642
34643 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34644
34645 static rtx
34646 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34647 tree exp, rtx target)
34648 {
34649 rtx pat;
34650 tree arg0 = CALL_EXPR_ARG (exp, 0);
34651 tree arg1 = CALL_EXPR_ARG (exp, 1);
34652 tree arg2 = CALL_EXPR_ARG (exp, 2);
34653 rtx scratch0, scratch1;
34654 rtx op0 = expand_normal (arg0);
34655 rtx op1 = expand_normal (arg1);
34656 rtx op2 = expand_normal (arg2);
34657 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34658
34659 tmode0 = insn_data[d->icode].operand[0].mode;
34660 tmode1 = insn_data[d->icode].operand[1].mode;
34661 modev2 = insn_data[d->icode].operand[2].mode;
34662 modev3 = insn_data[d->icode].operand[3].mode;
34663 modeimm = insn_data[d->icode].operand[4].mode;
34664
34665 if (VECTOR_MODE_P (modev2))
34666 op0 = safe_vector_operand (op0, modev2);
34667 if (VECTOR_MODE_P (modev3))
34668 op1 = safe_vector_operand (op1, modev3);
34669
34670 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34671 op0 = copy_to_mode_reg (modev2, op0);
34672 if ((optimize && !register_operand (op1, modev3))
34673 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34674 op1 = copy_to_mode_reg (modev3, op1);
34675
34676 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34677 {
34678 error ("the third argument must be an 8-bit immediate");
34679 return const0_rtx;
34680 }
34681
34682 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34683 {
34684 if (optimize || !target
34685 || GET_MODE (target) != tmode0
34686 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34687 target = gen_reg_rtx (tmode0);
34688
34689 scratch1 = gen_reg_rtx (tmode1);
34690
34691 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34692 }
34693 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34694 {
34695 if (optimize || !target
34696 || GET_MODE (target) != tmode1
34697 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34698 target = gen_reg_rtx (tmode1);
34699
34700 scratch0 = gen_reg_rtx (tmode0);
34701
34702 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34703 }
34704 else
34705 {
34706 gcc_assert (d->flag);
34707
34708 scratch0 = gen_reg_rtx (tmode0);
34709 scratch1 = gen_reg_rtx (tmode1);
34710
34711 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34712 }
34713
34714 if (! pat)
34715 return 0;
34716
34717 emit_insn (pat);
34718
34719 if (d->flag)
34720 {
34721 target = gen_reg_rtx (SImode);
34722 emit_move_insn (target, const0_rtx);
34723 target = gen_rtx_SUBREG (QImode, target, 0);
34724
34725 emit_insn
34726 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34727 gen_rtx_fmt_ee (EQ, QImode,
34728 gen_rtx_REG ((machine_mode) d->flag,
34729 FLAGS_REG),
34730 const0_rtx)));
34731 return SUBREG_REG (target);
34732 }
34733 else
34734 return target;
34735 }
34736
34737 /* Subroutine of ix86_expand_builtin to take care of insns with
34738 a variable number of operands. */
34739
34740 static rtx
34741 ix86_expand_args_builtin (const struct builtin_description *d,
34742 tree exp, rtx target)
34743 {
34744 rtx pat, real_target;
34745 unsigned int i, nargs;
34746 unsigned int nargs_constant = 0;
34747 unsigned int mask_pos = 0;
34748 int num_memory = 0;
34749 struct
34750 {
34751 rtx op;
34752 machine_mode mode;
34753 } args[6];
34754 bool last_arg_count = false;
34755 enum insn_code icode = d->icode;
34756 const struct insn_data_d *insn_p = &insn_data[icode];
34757 machine_mode tmode = insn_p->operand[0].mode;
34758 machine_mode rmode = VOIDmode;
34759 bool swap = false;
34760 enum rtx_code comparison = d->comparison;
34761
34762 switch ((enum ix86_builtin_func_type) d->flag)
34763 {
34764 case V2DF_FTYPE_V2DF_ROUND:
34765 case V4DF_FTYPE_V4DF_ROUND:
34766 case V8DF_FTYPE_V8DF_ROUND:
34767 case V4SF_FTYPE_V4SF_ROUND:
34768 case V8SF_FTYPE_V8SF_ROUND:
34769 case V16SF_FTYPE_V16SF_ROUND:
34770 case V4SI_FTYPE_V4SF_ROUND:
34771 case V8SI_FTYPE_V8SF_ROUND:
34772 case V16SI_FTYPE_V16SF_ROUND:
34773 return ix86_expand_sse_round (d, exp, target);
34774 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34775 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34776 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34777 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34778 case INT_FTYPE_V8SF_V8SF_PTEST:
34779 case INT_FTYPE_V4DI_V4DI_PTEST:
34780 case INT_FTYPE_V4DF_V4DF_PTEST:
34781 case INT_FTYPE_V4SF_V4SF_PTEST:
34782 case INT_FTYPE_V2DI_V2DI_PTEST:
34783 case INT_FTYPE_V2DF_V2DF_PTEST:
34784 return ix86_expand_sse_ptest (d, exp, target);
34785 case FLOAT128_FTYPE_FLOAT128:
34786 case FLOAT_FTYPE_FLOAT:
34787 case INT_FTYPE_INT:
34788 case UINT_FTYPE_UINT:
34789 case UINT16_FTYPE_UINT16:
34790 case UINT64_FTYPE_INT:
34791 case UINT64_FTYPE_UINT64:
34792 case INT64_FTYPE_INT64:
34793 case INT64_FTYPE_V4SF:
34794 case INT64_FTYPE_V2DF:
34795 case INT_FTYPE_V16QI:
34796 case INT_FTYPE_V8QI:
34797 case INT_FTYPE_V8SF:
34798 case INT_FTYPE_V4DF:
34799 case INT_FTYPE_V4SF:
34800 case INT_FTYPE_V2DF:
34801 case INT_FTYPE_V32QI:
34802 case V16QI_FTYPE_V16QI:
34803 case V8SI_FTYPE_V8SF:
34804 case V8SI_FTYPE_V4SI:
34805 case V8HI_FTYPE_V8HI:
34806 case V8HI_FTYPE_V16QI:
34807 case V8QI_FTYPE_V8QI:
34808 case V8SF_FTYPE_V8SF:
34809 case V8SF_FTYPE_V8SI:
34810 case V8SF_FTYPE_V4SF:
34811 case V8SF_FTYPE_V8HI:
34812 case V4SI_FTYPE_V4SI:
34813 case V4SI_FTYPE_V16QI:
34814 case V4SI_FTYPE_V4SF:
34815 case V4SI_FTYPE_V8SI:
34816 case V4SI_FTYPE_V8HI:
34817 case V4SI_FTYPE_V4DF:
34818 case V4SI_FTYPE_V2DF:
34819 case V4HI_FTYPE_V4HI:
34820 case V4DF_FTYPE_V4DF:
34821 case V4DF_FTYPE_V4SI:
34822 case V4DF_FTYPE_V4SF:
34823 case V4DF_FTYPE_V2DF:
34824 case V4SF_FTYPE_V4SF:
34825 case V4SF_FTYPE_V4SI:
34826 case V4SF_FTYPE_V8SF:
34827 case V4SF_FTYPE_V4DF:
34828 case V4SF_FTYPE_V8HI:
34829 case V4SF_FTYPE_V2DF:
34830 case V2DI_FTYPE_V2DI:
34831 case V2DI_FTYPE_V16QI:
34832 case V2DI_FTYPE_V8HI:
34833 case V2DI_FTYPE_V4SI:
34834 case V2DF_FTYPE_V2DF:
34835 case V2DF_FTYPE_V4SI:
34836 case V2DF_FTYPE_V4DF:
34837 case V2DF_FTYPE_V4SF:
34838 case V2DF_FTYPE_V2SI:
34839 case V2SI_FTYPE_V2SI:
34840 case V2SI_FTYPE_V4SF:
34841 case V2SI_FTYPE_V2SF:
34842 case V2SI_FTYPE_V2DF:
34843 case V2SF_FTYPE_V2SF:
34844 case V2SF_FTYPE_V2SI:
34845 case V32QI_FTYPE_V32QI:
34846 case V32QI_FTYPE_V16QI:
34847 case V16HI_FTYPE_V16HI:
34848 case V16HI_FTYPE_V8HI:
34849 case V8SI_FTYPE_V8SI:
34850 case V16HI_FTYPE_V16QI:
34851 case V8SI_FTYPE_V16QI:
34852 case V4DI_FTYPE_V16QI:
34853 case V8SI_FTYPE_V8HI:
34854 case V4DI_FTYPE_V8HI:
34855 case V4DI_FTYPE_V4SI:
34856 case V4DI_FTYPE_V2DI:
34857 case UQI_FTYPE_UQI:
34858 case UHI_FTYPE_UHI:
34859 case USI_FTYPE_USI:
34860 case USI_FTYPE_UQI:
34861 case USI_FTYPE_UHI:
34862 case UDI_FTYPE_UDI:
34863 case UHI_FTYPE_V16QI:
34864 case USI_FTYPE_V32QI:
34865 case UDI_FTYPE_V64QI:
34866 case V16QI_FTYPE_UHI:
34867 case V32QI_FTYPE_USI:
34868 case V64QI_FTYPE_UDI:
34869 case V8HI_FTYPE_UQI:
34870 case V16HI_FTYPE_UHI:
34871 case V32HI_FTYPE_USI:
34872 case V4SI_FTYPE_UQI:
34873 case V8SI_FTYPE_UQI:
34874 case V4SI_FTYPE_UHI:
34875 case V8SI_FTYPE_UHI:
34876 case UQI_FTYPE_V8HI:
34877 case UHI_FTYPE_V16HI:
34878 case USI_FTYPE_V32HI:
34879 case UQI_FTYPE_V4SI:
34880 case UQI_FTYPE_V8SI:
34881 case UHI_FTYPE_V16SI:
34882 case UQI_FTYPE_V2DI:
34883 case UQI_FTYPE_V4DI:
34884 case UQI_FTYPE_V8DI:
34885 case V16SI_FTYPE_UHI:
34886 case V2DI_FTYPE_UQI:
34887 case V4DI_FTYPE_UQI:
34888 case V16SI_FTYPE_INT:
34889 case V16SF_FTYPE_V8SF:
34890 case V16SI_FTYPE_V8SI:
34891 case V16SF_FTYPE_V4SF:
34892 case V16SI_FTYPE_V4SI:
34893 case V16SI_FTYPE_V16SF:
34894 case V16SF_FTYPE_V16SF:
34895 case V8DI_FTYPE_UQI:
34896 case V8DF_FTYPE_V4DF:
34897 case V8DF_FTYPE_V2DF:
34898 case V8DF_FTYPE_V8DF:
34899 nargs = 1;
34900 break;
34901 case V4SF_FTYPE_V4SF_VEC_MERGE:
34902 case V2DF_FTYPE_V2DF_VEC_MERGE:
34903 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34904 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34905 case V16QI_FTYPE_V16QI_V16QI:
34906 case V16QI_FTYPE_V8HI_V8HI:
34907 case V16SF_FTYPE_V16SF_V16SF:
34908 case V8QI_FTYPE_V8QI_V8QI:
34909 case V8QI_FTYPE_V4HI_V4HI:
34910 case V8HI_FTYPE_V8HI_V8HI:
34911 case V8HI_FTYPE_V16QI_V16QI:
34912 case V8HI_FTYPE_V4SI_V4SI:
34913 case V8SF_FTYPE_V8SF_V8SF:
34914 case V8SF_FTYPE_V8SF_V8SI:
34915 case V8DF_FTYPE_V8DF_V8DF:
34916 case V4SI_FTYPE_V4SI_V4SI:
34917 case V4SI_FTYPE_V8HI_V8HI:
34918 case V4SI_FTYPE_V2DF_V2DF:
34919 case V4HI_FTYPE_V4HI_V4HI:
34920 case V4HI_FTYPE_V8QI_V8QI:
34921 case V4HI_FTYPE_V2SI_V2SI:
34922 case V4DF_FTYPE_V4DF_V4DF:
34923 case V4DF_FTYPE_V4DF_V4DI:
34924 case V4SF_FTYPE_V4SF_V4SF:
34925 case V4SF_FTYPE_V4SF_V4SI:
34926 case V4SF_FTYPE_V4SF_V2SI:
34927 case V4SF_FTYPE_V4SF_V2DF:
34928 case V4SF_FTYPE_V4SF_UINT:
34929 case V4SF_FTYPE_V4SF_DI:
34930 case V4SF_FTYPE_V4SF_SI:
34931 case V2DI_FTYPE_V2DI_V2DI:
34932 case V2DI_FTYPE_V16QI_V16QI:
34933 case V2DI_FTYPE_V4SI_V4SI:
34934 case V2DI_FTYPE_V2DI_V16QI:
34935 case V2SI_FTYPE_V2SI_V2SI:
34936 case V2SI_FTYPE_V4HI_V4HI:
34937 case V2SI_FTYPE_V2SF_V2SF:
34938 case V2DF_FTYPE_V2DF_V2DF:
34939 case V2DF_FTYPE_V2DF_V4SF:
34940 case V2DF_FTYPE_V2DF_V2DI:
34941 case V2DF_FTYPE_V2DF_DI:
34942 case V2DF_FTYPE_V2DF_SI:
34943 case V2DF_FTYPE_V2DF_UINT:
34944 case V2SF_FTYPE_V2SF_V2SF:
34945 case V1DI_FTYPE_V1DI_V1DI:
34946 case V1DI_FTYPE_V8QI_V8QI:
34947 case V1DI_FTYPE_V2SI_V2SI:
34948 case V32QI_FTYPE_V16HI_V16HI:
34949 case V16HI_FTYPE_V8SI_V8SI:
34950 case V32QI_FTYPE_V32QI_V32QI:
34951 case V16HI_FTYPE_V32QI_V32QI:
34952 case V16HI_FTYPE_V16HI_V16HI:
34953 case V8SI_FTYPE_V4DF_V4DF:
34954 case V8SI_FTYPE_V8SI_V8SI:
34955 case V8SI_FTYPE_V16HI_V16HI:
34956 case V4DI_FTYPE_V4DI_V4DI:
34957 case V4DI_FTYPE_V8SI_V8SI:
34958 case V8DI_FTYPE_V64QI_V64QI:
34959 if (comparison == UNKNOWN)
34960 return ix86_expand_binop_builtin (icode, exp, target);
34961 nargs = 2;
34962 break;
34963 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34964 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34965 gcc_assert (comparison != UNKNOWN);
34966 nargs = 2;
34967 swap = true;
34968 break;
34969 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34970 case V16HI_FTYPE_V16HI_SI_COUNT:
34971 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34972 case V8SI_FTYPE_V8SI_SI_COUNT:
34973 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34974 case V4DI_FTYPE_V4DI_INT_COUNT:
34975 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34976 case V8HI_FTYPE_V8HI_SI_COUNT:
34977 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34978 case V4SI_FTYPE_V4SI_SI_COUNT:
34979 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34980 case V4HI_FTYPE_V4HI_SI_COUNT:
34981 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34982 case V2DI_FTYPE_V2DI_SI_COUNT:
34983 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34984 case V2SI_FTYPE_V2SI_SI_COUNT:
34985 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34986 case V1DI_FTYPE_V1DI_SI_COUNT:
34987 nargs = 2;
34988 last_arg_count = true;
34989 break;
34990 case UINT64_FTYPE_UINT64_UINT64:
34991 case UINT_FTYPE_UINT_UINT:
34992 case UINT_FTYPE_UINT_USHORT:
34993 case UINT_FTYPE_UINT_UCHAR:
34994 case UINT16_FTYPE_UINT16_INT:
34995 case UINT8_FTYPE_UINT8_INT:
34996 case UQI_FTYPE_UQI_UQI:
34997 case UHI_FTYPE_UHI_UHI:
34998 case USI_FTYPE_USI_USI:
34999 case UDI_FTYPE_UDI_UDI:
35000 case V16SI_FTYPE_V8DF_V8DF:
35001 nargs = 2;
35002 break;
35003 case V2DI_FTYPE_V2DI_INT_CONVERT:
35004 nargs = 2;
35005 rmode = V1TImode;
35006 nargs_constant = 1;
35007 break;
35008 case V4DI_FTYPE_V4DI_INT_CONVERT:
35009 nargs = 2;
35010 rmode = V2TImode;
35011 nargs_constant = 1;
35012 break;
35013 case V8DI_FTYPE_V8DI_INT_CONVERT:
35014 nargs = 2;
35015 rmode = V4TImode;
35016 nargs_constant = 1;
35017 break;
35018 case V8HI_FTYPE_V8HI_INT:
35019 case V8HI_FTYPE_V8SF_INT:
35020 case V16HI_FTYPE_V16SF_INT:
35021 case V8HI_FTYPE_V4SF_INT:
35022 case V8SF_FTYPE_V8SF_INT:
35023 case V4SF_FTYPE_V16SF_INT:
35024 case V16SF_FTYPE_V16SF_INT:
35025 case V4SI_FTYPE_V4SI_INT:
35026 case V4SI_FTYPE_V8SI_INT:
35027 case V4HI_FTYPE_V4HI_INT:
35028 case V4DF_FTYPE_V4DF_INT:
35029 case V4DF_FTYPE_V8DF_INT:
35030 case V4SF_FTYPE_V4SF_INT:
35031 case V4SF_FTYPE_V8SF_INT:
35032 case V2DI_FTYPE_V2DI_INT:
35033 case V2DF_FTYPE_V2DF_INT:
35034 case V2DF_FTYPE_V4DF_INT:
35035 case V16HI_FTYPE_V16HI_INT:
35036 case V8SI_FTYPE_V8SI_INT:
35037 case V16SI_FTYPE_V16SI_INT:
35038 case V4SI_FTYPE_V16SI_INT:
35039 case V4DI_FTYPE_V4DI_INT:
35040 case V2DI_FTYPE_V4DI_INT:
35041 case V4DI_FTYPE_V8DI_INT:
35042 case QI_FTYPE_V4SF_INT:
35043 case QI_FTYPE_V2DF_INT:
35044 case UQI_FTYPE_UQI_INT:
35045 case UHI_FTYPE_UHI_INT:
35046 case USI_FTYPE_USI_INT:
35047 case UDI_FTYPE_UDI_INT:
35048 nargs = 2;
35049 nargs_constant = 1;
35050 break;
35051 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35052 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35053 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35054 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35055 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35056 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35057 case UHI_FTYPE_V16SI_V16SI_UHI:
35058 case UQI_FTYPE_V8DI_V8DI_UQI:
35059 case V16HI_FTYPE_V16SI_V16HI_UHI:
35060 case V16QI_FTYPE_V16SI_V16QI_UHI:
35061 case V16QI_FTYPE_V8DI_V16QI_UQI:
35062 case V16SF_FTYPE_V16SF_V16SF_UHI:
35063 case V16SF_FTYPE_V4SF_V16SF_UHI:
35064 case V16SI_FTYPE_SI_V16SI_UHI:
35065 case V16SI_FTYPE_V16HI_V16SI_UHI:
35066 case V16SI_FTYPE_V16QI_V16SI_UHI:
35067 case V8SF_FTYPE_V4SF_V8SF_UQI:
35068 case V4DF_FTYPE_V2DF_V4DF_UQI:
35069 case V8SI_FTYPE_V4SI_V8SI_UQI:
35070 case V8SI_FTYPE_SI_V8SI_UQI:
35071 case V4SI_FTYPE_V4SI_V4SI_UQI:
35072 case V4SI_FTYPE_SI_V4SI_UQI:
35073 case V4DI_FTYPE_V2DI_V4DI_UQI:
35074 case V4DI_FTYPE_DI_V4DI_UQI:
35075 case V2DI_FTYPE_V2DI_V2DI_UQI:
35076 case V2DI_FTYPE_DI_V2DI_UQI:
35077 case V64QI_FTYPE_V64QI_V64QI_UDI:
35078 case V64QI_FTYPE_V16QI_V64QI_UDI:
35079 case V64QI_FTYPE_QI_V64QI_UDI:
35080 case V32QI_FTYPE_V32QI_V32QI_USI:
35081 case V32QI_FTYPE_V16QI_V32QI_USI:
35082 case V32QI_FTYPE_QI_V32QI_USI:
35083 case V16QI_FTYPE_V16QI_V16QI_UHI:
35084 case V16QI_FTYPE_QI_V16QI_UHI:
35085 case V32HI_FTYPE_V8HI_V32HI_USI:
35086 case V32HI_FTYPE_HI_V32HI_USI:
35087 case V16HI_FTYPE_V8HI_V16HI_UHI:
35088 case V16HI_FTYPE_HI_V16HI_UHI:
35089 case V8HI_FTYPE_V8HI_V8HI_UQI:
35090 case V8HI_FTYPE_HI_V8HI_UQI:
35091 case V8SF_FTYPE_V8HI_V8SF_UQI:
35092 case V4SF_FTYPE_V8HI_V4SF_UQI:
35093 case V8SI_FTYPE_V8SF_V8SI_UQI:
35094 case V4SI_FTYPE_V4SF_V4SI_UQI:
35095 case V4DI_FTYPE_V4SF_V4DI_UQI:
35096 case V2DI_FTYPE_V4SF_V2DI_UQI:
35097 case V4SF_FTYPE_V4DI_V4SF_UQI:
35098 case V4SF_FTYPE_V2DI_V4SF_UQI:
35099 case V4DF_FTYPE_V4DI_V4DF_UQI:
35100 case V2DF_FTYPE_V2DI_V2DF_UQI:
35101 case V16QI_FTYPE_V8HI_V16QI_UQI:
35102 case V16QI_FTYPE_V16HI_V16QI_UHI:
35103 case V16QI_FTYPE_V4SI_V16QI_UQI:
35104 case V16QI_FTYPE_V8SI_V16QI_UQI:
35105 case V8HI_FTYPE_V4SI_V8HI_UQI:
35106 case V8HI_FTYPE_V8SI_V8HI_UQI:
35107 case V16QI_FTYPE_V2DI_V16QI_UQI:
35108 case V16QI_FTYPE_V4DI_V16QI_UQI:
35109 case V8HI_FTYPE_V2DI_V8HI_UQI:
35110 case V8HI_FTYPE_V4DI_V8HI_UQI:
35111 case V4SI_FTYPE_V2DI_V4SI_UQI:
35112 case V4SI_FTYPE_V4DI_V4SI_UQI:
35113 case V32QI_FTYPE_V32HI_V32QI_USI:
35114 case UHI_FTYPE_V16QI_V16QI_UHI:
35115 case USI_FTYPE_V32QI_V32QI_USI:
35116 case UDI_FTYPE_V64QI_V64QI_UDI:
35117 case UQI_FTYPE_V8HI_V8HI_UQI:
35118 case UHI_FTYPE_V16HI_V16HI_UHI:
35119 case USI_FTYPE_V32HI_V32HI_USI:
35120 case UQI_FTYPE_V4SI_V4SI_UQI:
35121 case UQI_FTYPE_V8SI_V8SI_UQI:
35122 case UQI_FTYPE_V2DI_V2DI_UQI:
35123 case UQI_FTYPE_V4DI_V4DI_UQI:
35124 case V4SF_FTYPE_V2DF_V4SF_UQI:
35125 case V4SF_FTYPE_V4DF_V4SF_UQI:
35126 case V16SI_FTYPE_V16SI_V16SI_UHI:
35127 case V16SI_FTYPE_V4SI_V16SI_UHI:
35128 case V2DI_FTYPE_V4SI_V2DI_UQI:
35129 case V2DI_FTYPE_V8HI_V2DI_UQI:
35130 case V2DI_FTYPE_V16QI_V2DI_UQI:
35131 case V4DI_FTYPE_V4DI_V4DI_UQI:
35132 case V4DI_FTYPE_V4SI_V4DI_UQI:
35133 case V4DI_FTYPE_V8HI_V4DI_UQI:
35134 case V4DI_FTYPE_V16QI_V4DI_UQI:
35135 case V4DI_FTYPE_V4DF_V4DI_UQI:
35136 case V2DI_FTYPE_V2DF_V2DI_UQI:
35137 case V4SI_FTYPE_V4DF_V4SI_UQI:
35138 case V4SI_FTYPE_V2DF_V4SI_UQI:
35139 case V4SI_FTYPE_V8HI_V4SI_UQI:
35140 case V4SI_FTYPE_V16QI_V4SI_UQI:
35141 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35142 case V8DF_FTYPE_V2DF_V8DF_UQI:
35143 case V8DF_FTYPE_V4DF_V8DF_UQI:
35144 case V8DF_FTYPE_V8DF_V8DF_UQI:
35145 case V8SF_FTYPE_V8SF_V8SF_UQI:
35146 case V8SF_FTYPE_V8SI_V8SF_UQI:
35147 case V4DF_FTYPE_V4DF_V4DF_UQI:
35148 case V4SF_FTYPE_V4SF_V4SF_UQI:
35149 case V2DF_FTYPE_V2DF_V2DF_UQI:
35150 case V2DF_FTYPE_V4SF_V2DF_UQI:
35151 case V2DF_FTYPE_V4SI_V2DF_UQI:
35152 case V4SF_FTYPE_V4SI_V4SF_UQI:
35153 case V4DF_FTYPE_V4SF_V4DF_UQI:
35154 case V4DF_FTYPE_V4SI_V4DF_UQI:
35155 case V8SI_FTYPE_V8SI_V8SI_UQI:
35156 case V8SI_FTYPE_V8HI_V8SI_UQI:
35157 case V8SI_FTYPE_V16QI_V8SI_UQI:
35158 case V8DF_FTYPE_V8SI_V8DF_UQI:
35159 case V8DI_FTYPE_DI_V8DI_UQI:
35160 case V16SF_FTYPE_V8SF_V16SF_UHI:
35161 case V16SI_FTYPE_V8SI_V16SI_UHI:
35162 case V16HI_FTYPE_V16HI_V16HI_UHI:
35163 case V8HI_FTYPE_V16QI_V8HI_UQI:
35164 case V16HI_FTYPE_V16QI_V16HI_UHI:
35165 case V32HI_FTYPE_V32HI_V32HI_USI:
35166 case V32HI_FTYPE_V32QI_V32HI_USI:
35167 case V8DI_FTYPE_V16QI_V8DI_UQI:
35168 case V8DI_FTYPE_V2DI_V8DI_UQI:
35169 case V8DI_FTYPE_V4DI_V8DI_UQI:
35170 case V8DI_FTYPE_V8DI_V8DI_UQI:
35171 case V8DI_FTYPE_V8HI_V8DI_UQI:
35172 case V8DI_FTYPE_V8SI_V8DI_UQI:
35173 case V8HI_FTYPE_V8DI_V8HI_UQI:
35174 case V8SI_FTYPE_V8DI_V8SI_UQI:
35175 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35176 nargs = 3;
35177 break;
35178 case V32QI_FTYPE_V32QI_V32QI_INT:
35179 case V16HI_FTYPE_V16HI_V16HI_INT:
35180 case V16QI_FTYPE_V16QI_V16QI_INT:
35181 case V4DI_FTYPE_V4DI_V4DI_INT:
35182 case V8HI_FTYPE_V8HI_V8HI_INT:
35183 case V8SI_FTYPE_V8SI_V8SI_INT:
35184 case V8SI_FTYPE_V8SI_V4SI_INT:
35185 case V8SF_FTYPE_V8SF_V8SF_INT:
35186 case V8SF_FTYPE_V8SF_V4SF_INT:
35187 case V4SI_FTYPE_V4SI_V4SI_INT:
35188 case V4DF_FTYPE_V4DF_V4DF_INT:
35189 case V16SF_FTYPE_V16SF_V16SF_INT:
35190 case V16SF_FTYPE_V16SF_V4SF_INT:
35191 case V16SI_FTYPE_V16SI_V4SI_INT:
35192 case V4DF_FTYPE_V4DF_V2DF_INT:
35193 case V4SF_FTYPE_V4SF_V4SF_INT:
35194 case V2DI_FTYPE_V2DI_V2DI_INT:
35195 case V4DI_FTYPE_V4DI_V2DI_INT:
35196 case V2DF_FTYPE_V2DF_V2DF_INT:
35197 case UQI_FTYPE_V8DI_V8UDI_INT:
35198 case UQI_FTYPE_V8DF_V8DF_INT:
35199 case UQI_FTYPE_V2DF_V2DF_INT:
35200 case UQI_FTYPE_V4SF_V4SF_INT:
35201 case UHI_FTYPE_V16SI_V16SI_INT:
35202 case UHI_FTYPE_V16SF_V16SF_INT:
35203 nargs = 3;
35204 nargs_constant = 1;
35205 break;
35206 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35207 nargs = 3;
35208 rmode = V4DImode;
35209 nargs_constant = 1;
35210 break;
35211 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35212 nargs = 3;
35213 rmode = V2DImode;
35214 nargs_constant = 1;
35215 break;
35216 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35217 nargs = 3;
35218 rmode = DImode;
35219 nargs_constant = 1;
35220 break;
35221 case V2DI_FTYPE_V2DI_UINT_UINT:
35222 nargs = 3;
35223 nargs_constant = 2;
35224 break;
35225 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35226 nargs = 3;
35227 rmode = V8DImode;
35228 nargs_constant = 1;
35229 break;
35230 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35231 nargs = 5;
35232 rmode = V8DImode;
35233 mask_pos = 2;
35234 nargs_constant = 1;
35235 break;
35236 case QI_FTYPE_V8DF_INT_UQI:
35237 case QI_FTYPE_V4DF_INT_UQI:
35238 case QI_FTYPE_V2DF_INT_UQI:
35239 case HI_FTYPE_V16SF_INT_UHI:
35240 case QI_FTYPE_V8SF_INT_UQI:
35241 case QI_FTYPE_V4SF_INT_UQI:
35242 nargs = 3;
35243 mask_pos = 1;
35244 nargs_constant = 1;
35245 break;
35246 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35247 nargs = 5;
35248 rmode = V4DImode;
35249 mask_pos = 2;
35250 nargs_constant = 1;
35251 break;
35252 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35253 nargs = 5;
35254 rmode = V2DImode;
35255 mask_pos = 2;
35256 nargs_constant = 1;
35257 break;
35258 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35259 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35260 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35261 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35262 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35263 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35264 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35265 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35266 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35267 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35268 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35269 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35270 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35271 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35272 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35273 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35274 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35275 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35276 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35277 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35278 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35279 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35280 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35281 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35282 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35283 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35284 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35285 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35286 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35287 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35288 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35289 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35290 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35291 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35292 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35293 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35294 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35295 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35296 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35297 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35298 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35299 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35300 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35301 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35302 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35303 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35304 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35305 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35306 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35307 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35308 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35309 nargs = 4;
35310 break;
35311 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35312 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35313 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35314 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35315 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35316 nargs = 4;
35317 nargs_constant = 1;
35318 break;
35319 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35320 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35321 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35322 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35323 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35324 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35325 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35326 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35327 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35328 case USI_FTYPE_V32QI_V32QI_INT_USI:
35329 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35330 case USI_FTYPE_V32HI_V32HI_INT_USI:
35331 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35332 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35333 nargs = 4;
35334 mask_pos = 1;
35335 nargs_constant = 1;
35336 break;
35337 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35338 nargs = 4;
35339 nargs_constant = 2;
35340 break;
35341 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35342 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35343 nargs = 4;
35344 break;
35345 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35346 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35347 mask_pos = 1;
35348 nargs = 4;
35349 nargs_constant = 1;
35350 break;
35351 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35352 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35353 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35354 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35355 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35356 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35357 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35358 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35359 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35360 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35361 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35362 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35363 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35364 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35365 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35366 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35367 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35368 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35369 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35370 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35371 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35372 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35373 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35374 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35375 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35376 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35377 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35378 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35379 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35380 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35381 nargs = 4;
35382 mask_pos = 2;
35383 nargs_constant = 1;
35384 break;
35385 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35386 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35387 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35388 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35389 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35390 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35391 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35392 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35393 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35394 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35395 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35396 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35397 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35398 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35399 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35400 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35401 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35402 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35403 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35404 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35405 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35406 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35407 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35408 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35409 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35410 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35411 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35412 nargs = 5;
35413 mask_pos = 2;
35414 nargs_constant = 1;
35415 break;
35416 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35417 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35418 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35419 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35420 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35421 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35422 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35423 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35424 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35425 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35426 nargs = 5;
35427 mask_pos = 1;
35428 nargs_constant = 1;
35429 break;
35430
35431 default:
35432 gcc_unreachable ();
35433 }
35434
35435 gcc_assert (nargs <= ARRAY_SIZE (args));
35436
35437 if (comparison != UNKNOWN)
35438 {
35439 gcc_assert (nargs == 2);
35440 return ix86_expand_sse_compare (d, exp, target, swap);
35441 }
35442
35443 if (rmode == VOIDmode || rmode == tmode)
35444 {
35445 if (optimize
35446 || target == 0
35447 || GET_MODE (target) != tmode
35448 || !insn_p->operand[0].predicate (target, tmode))
35449 target = gen_reg_rtx (tmode);
35450 real_target = target;
35451 }
35452 else
35453 {
35454 real_target = gen_reg_rtx (tmode);
35455 target = lowpart_subreg (rmode, real_target, tmode);
35456 }
35457
35458 for (i = 0; i < nargs; i++)
35459 {
35460 tree arg = CALL_EXPR_ARG (exp, i);
35461 rtx op = expand_normal (arg);
35462 machine_mode mode = insn_p->operand[i + 1].mode;
35463 bool match = insn_p->operand[i + 1].predicate (op, mode);
35464
35465 if (last_arg_count && (i + 1) == nargs)
35466 {
35467 /* SIMD shift insns take either an 8-bit immediate or a
35468 register as the count. But the builtin functions take an int
35469 as the count. If the count doesn't match, we put it in a register. */
35470 if (!match)
35471 {
35472 op = lowpart_subreg (SImode, op, GET_MODE (op));
35473 if (!insn_p->operand[i + 1].predicate (op, mode))
35474 op = copy_to_reg (op);
35475 }
35476 }
35477 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35478 (!mask_pos && (nargs - i) <= nargs_constant))
35479 {
35480 if (!match)
35481 switch (icode)
35482 {
35483 case CODE_FOR_avx_vinsertf128v4di:
35484 case CODE_FOR_avx_vextractf128v4di:
35485 error ("the last argument must be a 1-bit immediate");
35486 return const0_rtx;
35487
35488 case CODE_FOR_avx512f_cmpv8di3_mask:
35489 case CODE_FOR_avx512f_cmpv16si3_mask:
35490 case CODE_FOR_avx512f_ucmpv8di3_mask:
35491 case CODE_FOR_avx512f_ucmpv16si3_mask:
35492 case CODE_FOR_avx512vl_cmpv4di3_mask:
35493 case CODE_FOR_avx512vl_cmpv8si3_mask:
35494 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35495 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35496 case CODE_FOR_avx512vl_cmpv2di3_mask:
35497 case CODE_FOR_avx512vl_cmpv4si3_mask:
35498 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35499 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35500 error ("the last argument must be a 3-bit immediate");
35501 return const0_rtx;
35502
35503 case CODE_FOR_sse4_1_roundsd:
35504 case CODE_FOR_sse4_1_roundss:
35505
35506 case CODE_FOR_sse4_1_roundpd:
35507 case CODE_FOR_sse4_1_roundps:
35508 case CODE_FOR_avx_roundpd256:
35509 case CODE_FOR_avx_roundps256:
35510
35511 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35512 case CODE_FOR_sse4_1_roundps_sfix:
35513 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35514 case CODE_FOR_avx_roundps_sfix256:
35515
35516 case CODE_FOR_sse4_1_blendps:
35517 case CODE_FOR_avx_blendpd256:
35518 case CODE_FOR_avx_vpermilv4df:
35519 case CODE_FOR_avx_vpermilv4df_mask:
35520 case CODE_FOR_avx512f_getmantv8df_mask:
35521 case CODE_FOR_avx512f_getmantv16sf_mask:
35522 case CODE_FOR_avx512vl_getmantv8sf_mask:
35523 case CODE_FOR_avx512vl_getmantv4df_mask:
35524 case CODE_FOR_avx512vl_getmantv4sf_mask:
35525 case CODE_FOR_avx512vl_getmantv2df_mask:
35526 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35527 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35528 case CODE_FOR_avx512dq_rangepv4df_mask:
35529 case CODE_FOR_avx512dq_rangepv8sf_mask:
35530 case CODE_FOR_avx512dq_rangepv2df_mask:
35531 case CODE_FOR_avx512dq_rangepv4sf_mask:
35532 case CODE_FOR_avx_shufpd256_mask:
35533 error ("the last argument must be a 4-bit immediate");
35534 return const0_rtx;
35535
35536 case CODE_FOR_sha1rnds4:
35537 case CODE_FOR_sse4_1_blendpd:
35538 case CODE_FOR_avx_vpermilv2df:
35539 case CODE_FOR_avx_vpermilv2df_mask:
35540 case CODE_FOR_xop_vpermil2v2df3:
35541 case CODE_FOR_xop_vpermil2v4sf3:
35542 case CODE_FOR_xop_vpermil2v4df3:
35543 case CODE_FOR_xop_vpermil2v8sf3:
35544 case CODE_FOR_avx512f_vinsertf32x4_mask:
35545 case CODE_FOR_avx512f_vinserti32x4_mask:
35546 case CODE_FOR_avx512f_vextractf32x4_mask:
35547 case CODE_FOR_avx512f_vextracti32x4_mask:
35548 case CODE_FOR_sse2_shufpd:
35549 case CODE_FOR_sse2_shufpd_mask:
35550 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35551 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35552 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35553 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35554 error ("the last argument must be a 2-bit immediate");
35555 return const0_rtx;
35556
35557 case CODE_FOR_avx_vextractf128v4df:
35558 case CODE_FOR_avx_vextractf128v8sf:
35559 case CODE_FOR_avx_vextractf128v8si:
35560 case CODE_FOR_avx_vinsertf128v4df:
35561 case CODE_FOR_avx_vinsertf128v8sf:
35562 case CODE_FOR_avx_vinsertf128v8si:
35563 case CODE_FOR_avx512f_vinsertf64x4_mask:
35564 case CODE_FOR_avx512f_vinserti64x4_mask:
35565 case CODE_FOR_avx512f_vextractf64x4_mask:
35566 case CODE_FOR_avx512f_vextracti64x4_mask:
35567 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35568 case CODE_FOR_avx512dq_vinserti32x8_mask:
35569 case CODE_FOR_avx512vl_vinsertv4df:
35570 case CODE_FOR_avx512vl_vinsertv4di:
35571 case CODE_FOR_avx512vl_vinsertv8sf:
35572 case CODE_FOR_avx512vl_vinsertv8si:
35573 error ("the last argument must be a 1-bit immediate");
35574 return const0_rtx;
35575
35576 case CODE_FOR_avx_vmcmpv2df3:
35577 case CODE_FOR_avx_vmcmpv4sf3:
35578 case CODE_FOR_avx_cmpv2df3:
35579 case CODE_FOR_avx_cmpv4sf3:
35580 case CODE_FOR_avx_cmpv4df3:
35581 case CODE_FOR_avx_cmpv8sf3:
35582 case CODE_FOR_avx512f_cmpv8df3_mask:
35583 case CODE_FOR_avx512f_cmpv16sf3_mask:
35584 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35585 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35586 error ("the last argument must be a 5-bit immediate");
35587 return const0_rtx;
35588
35589 default:
35590 switch (nargs_constant)
35591 {
35592 case 2:
35593 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35594 (!mask_pos && (nargs - i) == nargs_constant))
35595 {
35596 error ("the next to last argument must be an 8-bit immediate");
35597 break;
35598 }
35599 /* FALLTHRU */
35600 case 1:
35601 error ("the last argument must be an 8-bit immediate");
35602 break;
35603 default:
35604 gcc_unreachable ();
35605 }
35606 return const0_rtx;
35607 }
35608 }
35609 else
35610 {
35611 if (VECTOR_MODE_P (mode))
35612 op = safe_vector_operand (op, mode);
35613
35614 /* If we aren't optimizing, only allow one memory operand to
35615 be generated. */
35616 if (memory_operand (op, mode))
35617 num_memory++;
35618
35619 op = fixup_modeless_constant (op, mode);
35620
35621 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35622 {
35623 if (optimize || !match || num_memory > 1)
35624 op = copy_to_mode_reg (mode, op);
35625 }
35626 else
35627 {
35628 op = copy_to_reg (op);
35629 op = lowpart_subreg (mode, op, GET_MODE (op));
35630 }
35631 }
35632
35633 args[i].op = op;
35634 args[i].mode = mode;
35635 }
35636
35637 switch (nargs)
35638 {
35639 case 1:
35640 pat = GEN_FCN (icode) (real_target, args[0].op);
35641 break;
35642 case 2:
35643 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35644 break;
35645 case 3:
35646 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35647 args[2].op);
35648 break;
35649 case 4:
35650 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35651 args[2].op, args[3].op);
35652 break;
35653 case 5:
35654 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35655 args[2].op, args[3].op, args[4].op);
35656 break;
35657 case 6:
35658 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35659 args[2].op, args[3].op, args[4].op,
35660 args[5].op);
35661 break;
35662 default:
35663 gcc_unreachable ();
35664 }
35665
35666 if (! pat)
35667 return 0;
35668
35669 emit_insn (pat);
35670 return target;
35671 }
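
/* As a concrete illustration of the immediate checks above (assuming the
   usual avxintrin.h definitions), _mm_cmp_sd reaches this code through
   __builtin_ia32_cmpsd and CODE_FOR_avx_vmcmpv2df3, so

     __m128d r = _mm_cmp_sd (a, b, _CMP_NLT_US);

   is accepted, while a predicate outside 0..31 (or a non-constant one)
   is rejected with the "5-bit immediate" error emitted above.  */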
35672
35673 /* Transform a pattern of the following layout:
35674 (parallel [
35675 (set A B)
35676 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35677 ])
35678 into:
35679 (set A B)
35680
35681 Or:
35682 (parallel [ A B
35683 ...
35684 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35685 ...
35686 ])
35687 into:
35688 (parallel [ A B ... ]) */
35689
35690 static rtx
35691 ix86_erase_embedded_rounding (rtx pat)
35692 {
35693 if (GET_CODE (pat) == INSN)
35694 pat = PATTERN (pat);
35695
35696 gcc_assert (GET_CODE (pat) == PARALLEL);
35697
35698 if (XVECLEN (pat, 0) == 2)
35699 {
35700 rtx p0 = XVECEXP (pat, 0, 0);
35701 rtx p1 = XVECEXP (pat, 0, 1);
35702
35703 gcc_assert (GET_CODE (p0) == SET
35704 && GET_CODE (p1) == UNSPEC
35705 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35706
35707 return p0;
35708 }
35709 else
35710 {
35711 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35712 int i = 0;
35713 int j = 0;
35714
35715 for (; i < XVECLEN (pat, 0); ++i)
35716 {
35717 rtx elem = XVECEXP (pat, 0, i);
35718 if (GET_CODE (elem) != UNSPEC
35719 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35720 res [j++] = elem;
35721 }
35722
35723 /* No more than 1 occurrence was removed. */
35724 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35725
35726 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35727 }
35728 }
35729
35730 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35731 with rounding. */
35732 static rtx
35733 ix86_expand_sse_comi_round (const struct builtin_description *d,
35734 tree exp, rtx target)
35735 {
35736 rtx pat, set_dst;
35737 tree arg0 = CALL_EXPR_ARG (exp, 0);
35738 tree arg1 = CALL_EXPR_ARG (exp, 1);
35739 tree arg2 = CALL_EXPR_ARG (exp, 2);
35740 tree arg3 = CALL_EXPR_ARG (exp, 3);
35741 rtx op0 = expand_normal (arg0);
35742 rtx op1 = expand_normal (arg1);
35743 rtx op2 = expand_normal (arg2);
35744 rtx op3 = expand_normal (arg3);
35745 enum insn_code icode = d->icode;
35746 const struct insn_data_d *insn_p = &insn_data[icode];
35747 machine_mode mode0 = insn_p->operand[0].mode;
35748 machine_mode mode1 = insn_p->operand[1].mode;
35749 enum rtx_code comparison = UNEQ;
35750 bool need_ucomi = false;
35751
35752 /* See avxintrin.h for values. */
35753 enum rtx_code comi_comparisons[32] =
35754 {
35755 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35756 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35757 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35758 };
35759 bool need_ucomi_values[32] =
35760 {
35761 true, false, false, true, true, false, false, true,
35762 true, false, false, true, true, false, false, true,
35763 false, true, true, false, false, true, true, false,
35764 false, true, true, false, false, true, true, false
35765 };
35766
35767 if (!CONST_INT_P (op2))
35768 {
35769 error ("the third argument must be a comparison constant");
35770 return const0_rtx;
35771 }
35772 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35773 {
35774 error ("incorrect comparison mode");
35775 return const0_rtx;
35776 }
35777
35778 if (!insn_p->operand[2].predicate (op3, SImode))
35779 {
35780 error ("incorrect rounding operand");
35781 return const0_rtx;
35782 }
35783
35784 comparison = comi_comparisons[INTVAL (op2)];
35785 need_ucomi = need_ucomi_values[INTVAL (op2)];
35786
35787 if (VECTOR_MODE_P (mode0))
35788 op0 = safe_vector_operand (op0, mode0);
35789 if (VECTOR_MODE_P (mode1))
35790 op1 = safe_vector_operand (op1, mode1);
35791
35792 target = gen_reg_rtx (SImode);
35793 emit_move_insn (target, const0_rtx);
35794 target = gen_rtx_SUBREG (QImode, target, 0);
35795
35796 if ((optimize && !register_operand (op0, mode0))
35797 || !insn_p->operand[0].predicate (op0, mode0))
35798 op0 = copy_to_mode_reg (mode0, op0);
35799 if ((optimize && !register_operand (op1, mode1))
35800 || !insn_p->operand[1].predicate (op1, mode1))
35801 op1 = copy_to_mode_reg (mode1, op1);
35802
35803 if (need_ucomi)
35804 icode = icode == CODE_FOR_sse_comi_round
35805 ? CODE_FOR_sse_ucomi_round
35806 : CODE_FOR_sse2_ucomi_round;
35807
35808 pat = GEN_FCN (icode) (op0, op1, op3);
35809 if (! pat)
35810 return 0;
35811
35812 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35813 if (INTVAL (op3) == NO_ROUND)
35814 {
35815 pat = ix86_erase_embedded_rounding (pat);
35816 if (! pat)
35817 return 0;
35818
35819 set_dst = SET_DEST (pat);
35820 }
35821 else
35822 {
35823 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35824 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35825 }
35826
35827 emit_insn (pat);
35828 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35829 gen_rtx_fmt_ee (comparison, QImode,
35830 set_dst,
35831 const0_rtx)));
35832
35833 return SUBREG_REG (target);
35834 }
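
/* The INT_FTYPE_V2DF_V2DF_INT_INT builtins handled here back the
   _mm_comi_round_ss/_mm_comi_round_sd intrinsics, e.g. (illustrative):

     int lt = _mm_comi_round_sd (a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC);

   The predicate selects both the comparison code and whether the quiet
   (ucomi) form is used, via the two tables at the top of the function;
   the SAE operand is _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC,
   matching the NO_ROUND/ROUND_SAE comment below.  */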
35835
35836 static rtx
35837 ix86_expand_round_builtin (const struct builtin_description *d,
35838 tree exp, rtx target)
35839 {
35840 rtx pat;
35841 unsigned int i, nargs;
35842 struct
35843 {
35844 rtx op;
35845 machine_mode mode;
35846 } args[6];
35847 enum insn_code icode = d->icode;
35848 const struct insn_data_d *insn_p = &insn_data[icode];
35849 machine_mode tmode = insn_p->operand[0].mode;
35850 unsigned int nargs_constant = 0;
35851 unsigned int redundant_embed_rnd = 0;
35852
35853 switch ((enum ix86_builtin_func_type) d->flag)
35854 {
35855 case UINT64_FTYPE_V2DF_INT:
35856 case UINT64_FTYPE_V4SF_INT:
35857 case UINT_FTYPE_V2DF_INT:
35858 case UINT_FTYPE_V4SF_INT:
35859 case INT64_FTYPE_V2DF_INT:
35860 case INT64_FTYPE_V4SF_INT:
35861 case INT_FTYPE_V2DF_INT:
35862 case INT_FTYPE_V4SF_INT:
35863 nargs = 2;
35864 break;
35865 case V4SF_FTYPE_V4SF_UINT_INT:
35866 case V4SF_FTYPE_V4SF_UINT64_INT:
35867 case V2DF_FTYPE_V2DF_UINT64_INT:
35868 case V4SF_FTYPE_V4SF_INT_INT:
35869 case V4SF_FTYPE_V4SF_INT64_INT:
35870 case V2DF_FTYPE_V2DF_INT64_INT:
35871 case V4SF_FTYPE_V4SF_V4SF_INT:
35872 case V2DF_FTYPE_V2DF_V2DF_INT:
35873 case V4SF_FTYPE_V4SF_V2DF_INT:
35874 case V2DF_FTYPE_V2DF_V4SF_INT:
35875 nargs = 3;
35876 break;
35877 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35878 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35879 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35880 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35881 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35882 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35883 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35884 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35885 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35886 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35887 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35888 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35889 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35890 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35891 nargs = 4;
35892 break;
35893 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35894 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35895 nargs_constant = 2;
35896 nargs = 4;
35897 break;
35898 case INT_FTYPE_V4SF_V4SF_INT_INT:
35899 case INT_FTYPE_V2DF_V2DF_INT_INT:
35900 return ix86_expand_sse_comi_round (d, exp, target);
35901 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35902 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35903 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35904 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35905 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35906 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35907 nargs = 5;
35908 break;
35909 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35910 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35911 nargs_constant = 4;
35912 nargs = 5;
35913 break;
35914 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35915 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35916 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35917 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35918 nargs_constant = 3;
35919 nargs = 5;
35920 break;
35921 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35922 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35923 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35924 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35925 nargs = 6;
35926 nargs_constant = 4;
35927 break;
35928 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35929 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35930 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35931 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35932 nargs = 6;
35933 nargs_constant = 3;
35934 break;
35935 default:
35936 gcc_unreachable ();
35937 }
35938 gcc_assert (nargs <= ARRAY_SIZE (args));
35939
35940 if (optimize
35941 || target == 0
35942 || GET_MODE (target) != tmode
35943 || !insn_p->operand[0].predicate (target, tmode))
35944 target = gen_reg_rtx (tmode);
35945
35946 for (i = 0; i < nargs; i++)
35947 {
35948 tree arg = CALL_EXPR_ARG (exp, i);
35949 rtx op = expand_normal (arg);
35950 machine_mode mode = insn_p->operand[i + 1].mode;
35951 bool match = insn_p->operand[i + 1].predicate (op, mode);
35952
35953 if (i == nargs - nargs_constant)
35954 {
35955 if (!match)
35956 {
35957 switch (icode)
35958 {
35959 case CODE_FOR_avx512f_getmantv8df_mask_round:
35960 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35961 case CODE_FOR_avx512f_vgetmantv2df_round:
35962 case CODE_FOR_avx512f_vgetmantv4sf_round:
35963 error ("the immediate argument must be a 4-bit immediate");
35964 return const0_rtx;
35965 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35966 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35967 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35968 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35969 error ("the immediate argument must be a 5-bit immediate");
35970 return const0_rtx;
35971 default:
35972 error ("the immediate argument must be an 8-bit immediate");
35973 return const0_rtx;
35974 }
35975 }
35976 }
35977 else if (i == nargs - 1)
35978 {
35979 if (!insn_p->operand[nargs].predicate (op, SImode))
35980 {
35981 error ("incorrect rounding operand");
35982 return const0_rtx;
35983 }
35984
35985 /* If there is no rounding, use the normal version of the pattern. */
35986 if (INTVAL (op) == NO_ROUND)
35987 redundant_embed_rnd = 1;
35988 }
35989 else
35990 {
35991 if (VECTOR_MODE_P (mode))
35992 op = safe_vector_operand (op, mode);
35993
35994 op = fixup_modeless_constant (op, mode);
35995
35996 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35997 {
35998 if (optimize || !match)
35999 op = copy_to_mode_reg (mode, op);
36000 }
36001 else
36002 {
36003 op = copy_to_reg (op);
36004 op = lowpart_subreg (mode, op, GET_MODE (op));
36005 }
36006 }
36007
36008 args[i].op = op;
36009 args[i].mode = mode;
36010 }
36011
36012 switch (nargs)
36013 {
36014 case 1:
36015 pat = GEN_FCN (icode) (target, args[0].op);
36016 break;
36017 case 2:
36018 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36019 break;
36020 case 3:
36021 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36022 args[2].op);
36023 break;
36024 case 4:
36025 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36026 args[2].op, args[3].op);
36027 break;
36028 case 5:
36029 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36030 args[2].op, args[3].op, args[4].op);
36031 break;
36032 case 6:
36033 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36034 args[2].op, args[3].op, args[4].op,
36035 args[5].op);
36036 break;
36037 default:
36038 gcc_unreachable ();
36039 }
36040
36041 if (!pat)
36042 return 0;
36043
36044 if (redundant_embed_rnd)
36045 pat = ix86_erase_embedded_rounding (pat);
36046
36047 emit_insn (pat);
36048 return target;
36049 }
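
/* Illustrative use from the AVX-512 intrinsics: an explicit rounding mode
   such as

     __m512d r = _mm512_add_round_pd (x, y, _MM_FROUND_TO_NEAREST_INT
                                            | _MM_FROUND_NO_EXC);

   keeps the embedded-rounding operand, while _MM_FROUND_CUR_DIRECTION
   (NO_ROUND) sets redundant_embed_rnd and the rounding UNSPEC is stripped
   again by ix86_erase_embedded_rounding.  */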
36050
36051 /* Subroutine of ix86_expand_builtin to take care of special insns
36052 with variable number of operands. */
36053
36054 static rtx
36055 ix86_expand_special_args_builtin (const struct builtin_description *d,
36056 tree exp, rtx target)
36057 {
36058 tree arg;
36059 rtx pat, op;
36060 unsigned int i, nargs, arg_adjust, memory;
36061 bool aligned_mem = false;
36062 struct
36063 {
36064 rtx op;
36065 machine_mode mode;
36066 } args[3];
36067 enum insn_code icode = d->icode;
36068 bool last_arg_constant = false;
36069 const struct insn_data_d *insn_p = &insn_data[icode];
36070 machine_mode tmode = insn_p->operand[0].mode;
36071 enum { load, store } klass;
36072
36073 switch ((enum ix86_builtin_func_type) d->flag)
36074 {
36075 case VOID_FTYPE_VOID:
36076 emit_insn (GEN_FCN (icode) (target));
36077 return 0;
36078 case VOID_FTYPE_UINT64:
36079 case VOID_FTYPE_UNSIGNED:
36080 nargs = 0;
36081 klass = store;
36082 memory = 0;
36083 break;
36084
36085 case INT_FTYPE_VOID:
36086 case USHORT_FTYPE_VOID:
36087 case UINT64_FTYPE_VOID:
36088 case UNSIGNED_FTYPE_VOID:
36089 nargs = 0;
36090 klass = load;
36091 memory = 0;
36092 break;
36093 case UINT64_FTYPE_PUNSIGNED:
36094 case V2DI_FTYPE_PV2DI:
36095 case V4DI_FTYPE_PV4DI:
36096 case V32QI_FTYPE_PCCHAR:
36097 case V16QI_FTYPE_PCCHAR:
36098 case V8SF_FTYPE_PCV4SF:
36099 case V8SF_FTYPE_PCFLOAT:
36100 case V4SF_FTYPE_PCFLOAT:
36101 case V4DF_FTYPE_PCV2DF:
36102 case V4DF_FTYPE_PCDOUBLE:
36103 case V2DF_FTYPE_PCDOUBLE:
36104 case VOID_FTYPE_PVOID:
36105 case V8DI_FTYPE_PV8DI:
36106 nargs = 1;
36107 klass = load;
36108 memory = 0;
36109 switch (icode)
36110 {
36111 case CODE_FOR_sse4_1_movntdqa:
36112 case CODE_FOR_avx2_movntdqa:
36113 case CODE_FOR_avx512f_movntdqa:
36114 aligned_mem = true;
36115 break;
36116 default:
36117 break;
36118 }
36119 break;
36120 case VOID_FTYPE_PV2SF_V4SF:
36121 case VOID_FTYPE_PV8DI_V8DI:
36122 case VOID_FTYPE_PV4DI_V4DI:
36123 case VOID_FTYPE_PV2DI_V2DI:
36124 case VOID_FTYPE_PCHAR_V32QI:
36125 case VOID_FTYPE_PCHAR_V16QI:
36126 case VOID_FTYPE_PFLOAT_V16SF:
36127 case VOID_FTYPE_PFLOAT_V8SF:
36128 case VOID_FTYPE_PFLOAT_V4SF:
36129 case VOID_FTYPE_PDOUBLE_V8DF:
36130 case VOID_FTYPE_PDOUBLE_V4DF:
36131 case VOID_FTYPE_PDOUBLE_V2DF:
36132 case VOID_FTYPE_PLONGLONG_LONGLONG:
36133 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36134 case VOID_FTYPE_PINT_INT:
36135 nargs = 1;
36136 klass = store;
36137 /* Reserve memory operand for target. */
36138 memory = ARRAY_SIZE (args);
36139 switch (icode)
36140 {
36141 /* These builtins and instructions require the memory
36142 to be properly aligned. */
36143 case CODE_FOR_avx_movntv4di:
36144 case CODE_FOR_sse2_movntv2di:
36145 case CODE_FOR_avx_movntv8sf:
36146 case CODE_FOR_sse_movntv4sf:
36147 case CODE_FOR_sse4a_vmmovntv4sf:
36148 case CODE_FOR_avx_movntv4df:
36149 case CODE_FOR_sse2_movntv2df:
36150 case CODE_FOR_sse4a_vmmovntv2df:
36151 case CODE_FOR_sse2_movntidi:
36152 case CODE_FOR_sse_movntq:
36153 case CODE_FOR_sse2_movntisi:
36154 case CODE_FOR_avx512f_movntv16sf:
36155 case CODE_FOR_avx512f_movntv8df:
36156 case CODE_FOR_avx512f_movntv8di:
36157 aligned_mem = true;
36158 break;
36159 default:
36160 break;
36161 }
36162 break;
36163 case V4SF_FTYPE_V4SF_PCV2SF:
36164 case V2DF_FTYPE_V2DF_PCDOUBLE:
36165 nargs = 2;
36166 klass = load;
36167 memory = 1;
36168 break;
36169 case V8SF_FTYPE_PCV8SF_V8SI:
36170 case V4DF_FTYPE_PCV4DF_V4DI:
36171 case V4SF_FTYPE_PCV4SF_V4SI:
36172 case V2DF_FTYPE_PCV2DF_V2DI:
36173 case V8SI_FTYPE_PCV8SI_V8SI:
36174 case V4DI_FTYPE_PCV4DI_V4DI:
36175 case V4SI_FTYPE_PCV4SI_V4SI:
36176 case V2DI_FTYPE_PCV2DI_V2DI:
36177 nargs = 2;
36178 klass = load;
36179 memory = 0;
36180 break;
36181 case VOID_FTYPE_PV8DF_V8DF_UQI:
36182 case VOID_FTYPE_PV4DF_V4DF_UQI:
36183 case VOID_FTYPE_PV2DF_V2DF_UQI:
36184 case VOID_FTYPE_PV16SF_V16SF_UHI:
36185 case VOID_FTYPE_PV8SF_V8SF_UQI:
36186 case VOID_FTYPE_PV4SF_V4SF_UQI:
36187 case VOID_FTYPE_PV8DI_V8DI_UQI:
36188 case VOID_FTYPE_PV4DI_V4DI_UQI:
36189 case VOID_FTYPE_PV2DI_V2DI_UQI:
36190 case VOID_FTYPE_PV16SI_V16SI_UHI:
36191 case VOID_FTYPE_PV8SI_V8SI_UQI:
36192 case VOID_FTYPE_PV4SI_V4SI_UQI:
36193 switch (icode)
36194 {
36195 /* These builtins and instructions require the memory
36196 to be properly aligned. */
36197 case CODE_FOR_avx512f_storev16sf_mask:
36198 case CODE_FOR_avx512f_storev16si_mask:
36199 case CODE_FOR_avx512f_storev8df_mask:
36200 case CODE_FOR_avx512f_storev8di_mask:
36201 case CODE_FOR_avx512vl_storev8sf_mask:
36202 case CODE_FOR_avx512vl_storev8si_mask:
36203 case CODE_FOR_avx512vl_storev4df_mask:
36204 case CODE_FOR_avx512vl_storev4di_mask:
36205 case CODE_FOR_avx512vl_storev4sf_mask:
36206 case CODE_FOR_avx512vl_storev4si_mask:
36207 case CODE_FOR_avx512vl_storev2df_mask:
36208 case CODE_FOR_avx512vl_storev2di_mask:
36209 aligned_mem = true;
36210 break;
36211 default:
36212 break;
36213 }
36214 /* FALLTHRU */
36215 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36216 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36217 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36218 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36219 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36220 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36221 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36222 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36223 case VOID_FTYPE_PV8SI_V8DI_UQI:
36224 case VOID_FTYPE_PV8HI_V8DI_UQI:
36225 case VOID_FTYPE_PV16HI_V16SI_UHI:
36226 case VOID_FTYPE_PV16QI_V8DI_UQI:
36227 case VOID_FTYPE_PV16QI_V16SI_UHI:
36228 case VOID_FTYPE_PV4SI_V4DI_UQI:
36229 case VOID_FTYPE_PV4SI_V2DI_UQI:
36230 case VOID_FTYPE_PV8HI_V4DI_UQI:
36231 case VOID_FTYPE_PV8HI_V2DI_UQI:
36232 case VOID_FTYPE_PV8HI_V8SI_UQI:
36233 case VOID_FTYPE_PV8HI_V4SI_UQI:
36234 case VOID_FTYPE_PV16QI_V4DI_UQI:
36235 case VOID_FTYPE_PV16QI_V2DI_UQI:
36236 case VOID_FTYPE_PV16QI_V8SI_UQI:
36237 case VOID_FTYPE_PV16QI_V4SI_UQI:
36238 case VOID_FTYPE_PCHAR_V64QI_UDI:
36239 case VOID_FTYPE_PCHAR_V32QI_USI:
36240 case VOID_FTYPE_PCHAR_V16QI_UHI:
36241 case VOID_FTYPE_PSHORT_V32HI_USI:
36242 case VOID_FTYPE_PSHORT_V16HI_UHI:
36243 case VOID_FTYPE_PSHORT_V8HI_UQI:
36244 case VOID_FTYPE_PINT_V16SI_UHI:
36245 case VOID_FTYPE_PINT_V8SI_UQI:
36246 case VOID_FTYPE_PINT_V4SI_UQI:
36247 case VOID_FTYPE_PINT64_V8DI_UQI:
36248 case VOID_FTYPE_PINT64_V4DI_UQI:
36249 case VOID_FTYPE_PINT64_V2DI_UQI:
36250 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36251 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36252 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36253 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36254 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36255 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36256 nargs = 2;
36257 klass = store;
36258 /* Reserve memory operand for target. */
36259 memory = ARRAY_SIZE (args);
36260 break;
36261 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36262 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36263 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36264 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36265 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36266 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36267 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36268 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36269 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36270 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36271 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36272 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36273 switch (icode)
36274 {
36275 /* These builtins and instructions require the memory
36276 to be properly aligned. */
36277 case CODE_FOR_avx512f_loadv16sf_mask:
36278 case CODE_FOR_avx512f_loadv16si_mask:
36279 case CODE_FOR_avx512f_loadv8df_mask:
36280 case CODE_FOR_avx512f_loadv8di_mask:
36281 case CODE_FOR_avx512vl_loadv8sf_mask:
36282 case CODE_FOR_avx512vl_loadv8si_mask:
36283 case CODE_FOR_avx512vl_loadv4df_mask:
36284 case CODE_FOR_avx512vl_loadv4di_mask:
36285 case CODE_FOR_avx512vl_loadv4sf_mask:
36286 case CODE_FOR_avx512vl_loadv4si_mask:
36287 case CODE_FOR_avx512vl_loadv2df_mask:
36288 case CODE_FOR_avx512vl_loadv2di_mask:
36289 case CODE_FOR_avx512bw_loadv64qi_mask:
36290 case CODE_FOR_avx512vl_loadv32qi_mask:
36291 case CODE_FOR_avx512vl_loadv16qi_mask:
36292 case CODE_FOR_avx512bw_loadv32hi_mask:
36293 case CODE_FOR_avx512vl_loadv16hi_mask:
36294 case CODE_FOR_avx512vl_loadv8hi_mask:
36295 aligned_mem = true;
36296 break;
36297 default:
36298 break;
36299 }
/* FALLTHRU */
36300 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36301 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36302 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36303 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36304 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36305 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36306 case V16SI_FTYPE_PCINT_V16SI_UHI:
36307 case V8SI_FTYPE_PCINT_V8SI_UQI:
36308 case V4SI_FTYPE_PCINT_V4SI_UQI:
36309 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36310 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36311 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36312 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36313 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36314 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36315 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36316 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36317 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36318 nargs = 3;
36319 klass = load;
36320 memory = 0;
36321 break;
36322 case VOID_FTYPE_UINT_UINT_UINT:
36323 case VOID_FTYPE_UINT64_UINT_UINT:
36324 case UCHAR_FTYPE_UINT_UINT_UINT:
36325 case UCHAR_FTYPE_UINT64_UINT_UINT:
36326 nargs = 3;
36327 klass = load;
36328 memory = ARRAY_SIZE (args);
36329 last_arg_constant = true;
36330 break;
36331 default:
36332 gcc_unreachable ();
36333 }
36334
36335 gcc_assert (nargs <= ARRAY_SIZE (args));
36336
36337 if (klass == store)
36338 {
36339 arg = CALL_EXPR_ARG (exp, 0);
36340 op = expand_normal (arg);
36341 gcc_assert (target == 0);
36342 if (memory)
36343 {
36344 op = ix86_zero_extend_to_Pmode (op);
36345 target = gen_rtx_MEM (tmode, op);
36346 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36347 on it. Try to improve it using get_pointer_alignment,
36348 and if the special builtin is one that requires strict
36349 mode alignment, also from its GET_MODE_ALIGNMENT.
36350 Failure to do so could lead to ix86_legitimate_combined_insn
36351 rejecting all changes to such insns. */
36352 unsigned int align = get_pointer_alignment (arg);
36353 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36354 align = GET_MODE_ALIGNMENT (tmode);
36355 if (MEM_ALIGN (target) < align)
36356 set_mem_align (target, align);
36357 }
36358 else
36359 target = force_reg (tmode, op);
36360 arg_adjust = 1;
36361 }
36362 else
36363 {
36364 arg_adjust = 0;
36365 if (optimize
36366 || target == 0
36367 || !register_operand (target, tmode)
36368 || GET_MODE (target) != tmode)
36369 target = gen_reg_rtx (tmode);
36370 }
36371
36372 for (i = 0; i < nargs; i++)
36373 {
36374 machine_mode mode = insn_p->operand[i + 1].mode;
36375 bool match;
36376
36377 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36378 op = expand_normal (arg);
36379 match = insn_p->operand[i + 1].predicate (op, mode);
36380
36381 if (last_arg_constant && (i + 1) == nargs)
36382 {
36383 if (!match)
36384 {
36385 if (icode == CODE_FOR_lwp_lwpvalsi3
36386 || icode == CODE_FOR_lwp_lwpinssi3
36387 || icode == CODE_FOR_lwp_lwpvaldi3
36388 || icode == CODE_FOR_lwp_lwpinsdi3)
36389 error ("the last argument must be a 32-bit immediate");
36390 else
36391 error ("the last argument must be an 8-bit immediate");
36392 return const0_rtx;
36393 }
36394 }
36395 else
36396 {
36397 if (i == memory)
36398 {
36399 /* This must be the memory operand. */
36400 op = ix86_zero_extend_to_Pmode (op);
36401 op = gen_rtx_MEM (mode, op);
36402 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36403 on it. Try to improve it using get_pointer_alignment,
36404 and if the special builtin is one that requires strict
36405 mode alignment, also from its GET_MODE_ALIGNMENT.
36406 Failure to do so could lead to ix86_legitimate_combined_insn
36407 rejecting all changes to such insns. */
36408 unsigned int align = get_pointer_alignment (arg);
36409 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36410 align = GET_MODE_ALIGNMENT (mode);
36411 if (MEM_ALIGN (op) < align)
36412 set_mem_align (op, align);
36413 }
36414 else
36415 {
36416 /* This must be a register. */
36417 if (VECTOR_MODE_P (mode))
36418 op = safe_vector_operand (op, mode);
36419
36420 op = fixup_modeless_constant (op, mode);
36421
36422 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36423 op = copy_to_mode_reg (mode, op);
36424 else
36425 {
36426 op = copy_to_reg (op);
36427 op = lowpart_subreg (mode, op, GET_MODE (op));
36428 }
36429 }
36430 }
36431
36432 args[i].op = op;
36433 args[i].mode = mode;
36434 }
36435
36436 switch (nargs)
36437 {
36438 case 0:
36439 pat = GEN_FCN (icode) (target);
36440 break;
36441 case 1:
36442 pat = GEN_FCN (icode) (target, args[0].op);
36443 break;
36444 case 2:
36445 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36446 break;
36447 case 3:
36448 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36449 break;
36450 default:
36451 gcc_unreachable ();
36452 }
36453
36454 if (! pat)
36455 return 0;
36456 emit_insn (pat);
36457 return klass == store ? 0 : target;
36458 }
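
/* A typical "special" builtin handled above is the non-temporal store
   behind _mm_stream_ps from xmmintrin.h, e.g.

     _mm_stream_ps (p, v);

   which expands __builtin_ia32_movntps: a store-class builtin whose
   pointer operand becomes the MEM target, and since CODE_FOR_sse_movntv4sf
   is in the aligned_mem list that MEM gets the full 16-byte mode alignment
   instead of the default BITS_PER_UNIT.  */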
36459
36460 /* Return the integer constant in ARG. Constrain it to be in the range
36461 of the subparts of VEC_TYPE; issue an error if not. */
36462
36463 static int
36464 get_element_number (tree vec_type, tree arg)
36465 {
36466 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36467
36468 if (!tree_fits_uhwi_p (arg)
36469 || (elt = tree_to_uhwi (arg), elt > max))
36470 {
36471 error ("selector must be an integer constant in the range 0..%wi", max);
36472 return 0;
36473 }
36474
36475 return elt;
36476 }
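
/* For instance, a V4SF vector has valid selectors 0..3, so a call like

     __v4sf v = { 1.0f, 2.0f, 3.0f, 4.0f };
     float x = __builtin_ia32_vec_ext_v4sf (v, 5);

   is diagnosed by the error above and element 0 is used instead
   (illustrative; the selector check only happens here at expansion
   time).  */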
36477
36478 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36479 ix86_expand_vector_init. We DO have language-level syntax for this, in
36480 the form of (type){ init-list }. Except that since we can't place emms
36481 instructions from inside the compiler, we can't allow the use of MMX
36482 registers unless the user explicitly asks for it. So we do *not* define
36483 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36484 we have builtins invoked by mmintrin.h that give us license to emit
36485 these sorts of instructions. */
36486
36487 static rtx
36488 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36489 {
36490 machine_mode tmode = TYPE_MODE (type);
36491 machine_mode inner_mode = GET_MODE_INNER (tmode);
36492 int i, n_elt = GET_MODE_NUNITS (tmode);
36493 rtvec v = rtvec_alloc (n_elt);
36494
36495 gcc_assert (VECTOR_MODE_P (tmode));
36496 gcc_assert (call_expr_nargs (exp) == n_elt);
36497
36498 for (i = 0; i < n_elt; ++i)
36499 {
36500 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36501 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36502 }
36503
36504 if (!target || !register_operand (target, tmode))
36505 target = gen_reg_rtx (tmode);
36506
36507 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36508 return target;
36509 }
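
/* mmintrin.h relies on these wrappers for the MMX constructors; e.g.
   (assuming the usual definition) the call

     __m64 v = _mm_set_pi16 (3, 2, 1, 0);

   reaches this function through __builtin_ia32_vec_init_v4hi and is
   expanded by ix86_expand_vector_init rather than by generic vec_init
   patterns, for the reason given in the comment above.  */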
36510
36511 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36512 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36513 had a language-level syntax for referencing vector elements. */
36514
36515 static rtx
36516 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36517 {
36518 machine_mode tmode, mode0;
36519 tree arg0, arg1;
36520 int elt;
36521 rtx op0;
36522
36523 arg0 = CALL_EXPR_ARG (exp, 0);
36524 arg1 = CALL_EXPR_ARG (exp, 1);
36525
36526 op0 = expand_normal (arg0);
36527 elt = get_element_number (TREE_TYPE (arg0), arg1);
36528
36529 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36530 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36531 gcc_assert (VECTOR_MODE_P (mode0));
36532
36533 op0 = force_reg (mode0, op0);
36534
36535 if (optimize || !target || !register_operand (target, tmode))
36536 target = gen_reg_rtx (tmode);
36537
36538 ix86_expand_vector_extract (true, target, op0, elt);
36539
36540 return target;
36541 }
36542
36543 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36544 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36545 a language-level syntax for referencing vector elements. */
36546
36547 static rtx
36548 ix86_expand_vec_set_builtin (tree exp)
36549 {
36550 machine_mode tmode, mode1;
36551 tree arg0, arg1, arg2;
36552 int elt;
36553 rtx op0, op1, target;
36554
36555 arg0 = CALL_EXPR_ARG (exp, 0);
36556 arg1 = CALL_EXPR_ARG (exp, 1);
36557 arg2 = CALL_EXPR_ARG (exp, 2);
36558
36559 tmode = TYPE_MODE (TREE_TYPE (arg0));
36560 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36561 gcc_assert (VECTOR_MODE_P (tmode));
36562
36563 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36564 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36565 elt = get_element_number (TREE_TYPE (arg0), arg2);
36566
36567 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36568 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36569
36570 op0 = force_reg (tmode, op0);
36571 op1 = force_reg (mode1, op1);
36572
36573 /* OP0 is the source of these builtin functions and shouldn't be
36574 modified. Create a copy, use it and return it as target. */
36575 target = gen_reg_rtx (tmode);
36576 emit_move_insn (target, op0);
36577 ix86_expand_vector_set (true, target, op1, elt);
36578
36579 return target;
36580 }
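
/* As an illustration, _mm_insert_epi16 from emmintrin.h reaches this code
   via __builtin_ia32_vec_set_v8hi:

     __m128i r = _mm_insert_epi16 (v, 42, 3);

   The source vector is copied to a fresh register and only element 3 of
   the copy is rewritten, so V itself is left untouched.  */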
36581
36582 /* Emit conditional move of SRC to DST with condition
36583 OP1 CODE OP2. */
36584 static void
36585 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36586 {
36587 rtx t;
36588
36589 if (TARGET_CMOVE)
36590 {
36591 t = ix86_expand_compare (code, op1, op2);
36592 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36593 src, dst)));
36594 }
36595 else
36596 {
36597 rtx_code_label *nomove = gen_label_rtx ();
36598 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36599 const0_rtx, GET_MODE (op1), 1, nomove);
36600 emit_move_insn (dst, src);
36601 emit_label (nomove);
36602 }
36603 }
36604
36605 /* Choose the unsigned max of DST and SRC and put it in DST. */
36606 static void
36607 ix86_emit_move_max (rtx dst, rtx src)
36608 {
36609 ix86_emit_cmove (dst, src, LTU, dst, src);
36610 }
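
/* In C terms the two helpers above implement, for unsigned values,

     if (dst < src)
       dst = src;

   using CMOV when available and a compare-and-branch sequence otherwise.
   The MPX cases below use this to combine bounds; upper bounds are kept
   in one's complement form so that "max" works for both ends.  */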
36611
36612 /* Expand an expression EXP that calls a built-in function,
36613 with result going to TARGET if that's convenient
36614 (and in mode MODE if that's convenient).
36615 SUBTARGET may be used as the target for computing one of EXP's operands.
36616 IGNORE is nonzero if the value is to be ignored. */
36617
36618 static rtx
36619 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36620 machine_mode mode, int ignore)
36621 {
36622 size_t i;
36623 enum insn_code icode;
36624 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36625 tree arg0, arg1, arg2, arg3, arg4;
36626 rtx op0, op1, op2, op3, op4, pat, insn;
36627 machine_mode mode0, mode1, mode2, mode3, mode4;
36628 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36629
36630 /* For CPU builtins that can be folded, fold first and expand the fold. */
36631 switch (fcode)
36632 {
36633 case IX86_BUILTIN_CPU_INIT:
36634 {
36635 /* Make it call __cpu_indicator_init in libgcc. */
36636 tree call_expr, fndecl, type;
36637 type = build_function_type_list (integer_type_node, NULL_TREE);
36638 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36639 call_expr = build_call_expr (fndecl, 0);
36640 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36641 }
36642 case IX86_BUILTIN_CPU_IS:
36643 case IX86_BUILTIN_CPU_SUPPORTS:
36644 {
36645 tree arg0 = CALL_EXPR_ARG (exp, 0);
36646 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36647 gcc_assert (fold_expr != NULL_TREE);
36648 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36649 }
36650 }
36651
36652 /* Determine whether the builtin function is available under the current ISA.
36653 Originally the builtin was not created if it wasn't applicable to the
36654 current ISA based on the command line switches. With function specific
36655 options, we need to check in the context of the function making the call
36656 whether it is supported. */
36657 if ((ix86_builtins_isa[fcode].isa
36658 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
36659 || (ix86_builtins_isa[fcode].isa2
36660 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
36661 {
36662 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
36663 ix86_builtins_isa[fcode].isa2, 0, 0,
36664 NULL, NULL, (enum fpmath_unit) 0,
36665 false);
36666 if (!opts)
36667 error ("%qE needs unknown isa option", fndecl);
36668 else
36669 {
36670 gcc_assert (opts != NULL);
36671 error ("%qE needs isa option %s", fndecl, opts);
36672 free (opts);
36673 }
36674 return expand_call (exp, target, ignore);
36675 }
36676
36677 switch (fcode)
36678 {
36679 case IX86_BUILTIN_BNDMK:
36680 if (!target
36681 || GET_MODE (target) != BNDmode
36682 || !register_operand (target, BNDmode))
36683 target = gen_reg_rtx (BNDmode);
36684
36685 arg0 = CALL_EXPR_ARG (exp, 0);
36686 arg1 = CALL_EXPR_ARG (exp, 1);
36687
36688 op0 = expand_normal (arg0);
36689 op1 = expand_normal (arg1);
36690
36691 if (!register_operand (op0, Pmode))
36692 op0 = ix86_zero_extend_to_Pmode (op0);
36693 if (!register_operand (op1, Pmode))
36694 op1 = ix86_zero_extend_to_Pmode (op1);
36695
36696 /* Builtin arg1 is the size of the block, but instruction op1 should
36697 be (size - 1). */
36698 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36699 NULL_RTX, 1, OPTAB_DIRECT);
36700
36701 emit_insn (BNDmode == BND64mode
36702 ? gen_bnd64_mk (target, op0, op1)
36703 : gen_bnd32_mk (target, op0, op1));
36704 return target;
36705
36706 case IX86_BUILTIN_BNDSTX:
36707 arg0 = CALL_EXPR_ARG (exp, 0);
36708 arg1 = CALL_EXPR_ARG (exp, 1);
36709 arg2 = CALL_EXPR_ARG (exp, 2);
36710
36711 op0 = expand_normal (arg0);
36712 op1 = expand_normal (arg1);
36713 op2 = expand_normal (arg2);
36714
36715 if (!register_operand (op0, Pmode))
36716 op0 = ix86_zero_extend_to_Pmode (op0);
36717 if (!register_operand (op1, BNDmode))
36718 op1 = copy_to_mode_reg (BNDmode, op1);
36719 if (!register_operand (op2, Pmode))
36720 op2 = ix86_zero_extend_to_Pmode (op2);
36721
36722 emit_insn (BNDmode == BND64mode
36723 ? gen_bnd64_stx (op2, op0, op1)
36724 : gen_bnd32_stx (op2, op0, op1));
36725 return 0;
36726
36727 case IX86_BUILTIN_BNDLDX:
36728 if (!target
36729 || GET_MODE (target) != BNDmode
36730 || !register_operand (target, BNDmode))
36731 target = gen_reg_rtx (BNDmode);
36732
36733 arg0 = CALL_EXPR_ARG (exp, 0);
36734 arg1 = CALL_EXPR_ARG (exp, 1);
36735
36736 op0 = expand_normal (arg0);
36737 op1 = expand_normal (arg1);
36738
36739 if (!register_operand (op0, Pmode))
36740 op0 = ix86_zero_extend_to_Pmode (op0);
36741 if (!register_operand (op1, Pmode))
36742 op1 = ix86_zero_extend_to_Pmode (op1);
36743
36744 emit_insn (BNDmode == BND64mode
36745 ? gen_bnd64_ldx (target, op0, op1)
36746 : gen_bnd32_ldx (target, op0, op1));
36747 return target;
36748
36749 case IX86_BUILTIN_BNDCL:
36750 arg0 = CALL_EXPR_ARG (exp, 0);
36751 arg1 = CALL_EXPR_ARG (exp, 1);
36752
36753 op0 = expand_normal (arg0);
36754 op1 = expand_normal (arg1);
36755
36756 if (!register_operand (op0, Pmode))
36757 op0 = ix86_zero_extend_to_Pmode (op0);
36758 if (!register_operand (op1, BNDmode))
36759 op1 = copy_to_mode_reg (BNDmode, op1);
36760
36761 emit_insn (BNDmode == BND64mode
36762 ? gen_bnd64_cl (op1, op0)
36763 : gen_bnd32_cl (op1, op0));
36764 return 0;
36765
36766 case IX86_BUILTIN_BNDCU:
36767 arg0 = CALL_EXPR_ARG (exp, 0);
36768 arg1 = CALL_EXPR_ARG (exp, 1);
36769
36770 op0 = expand_normal (arg0);
36771 op1 = expand_normal (arg1);
36772
36773 if (!register_operand (op0, Pmode))
36774 op0 = ix86_zero_extend_to_Pmode (op0);
36775 if (!register_operand (op1, BNDmode))
36776 op1 = copy_to_mode_reg (BNDmode, op1);
36777
36778 emit_insn (BNDmode == BND64mode
36779 ? gen_bnd64_cu (op1, op0)
36780 : gen_bnd32_cu (op1, op0));
36781 return 0;
36782
36783 case IX86_BUILTIN_BNDRET:
36784 arg0 = CALL_EXPR_ARG (exp, 0);
36785 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36786 target = chkp_get_rtl_bounds (arg0);
36787
36788 /* If no bounds were specified for the returned value,
36789 then use INIT bounds. This usually happens when
36790 some built-in function is expanded. */
36791 if (!target)
36792 {
36793 rtx t1 = gen_reg_rtx (Pmode);
36794 rtx t2 = gen_reg_rtx (Pmode);
36795 target = gen_reg_rtx (BNDmode);
36796 emit_move_insn (t1, const0_rtx);
36797 emit_move_insn (t2, constm1_rtx);
36798 emit_insn (BNDmode == BND64mode
36799 ? gen_bnd64_mk (target, t1, t2)
36800 : gen_bnd32_mk (target, t1, t2));
36801 }
36802
36803 gcc_assert (target && REG_P (target));
36804 return target;
36805
36806 case IX86_BUILTIN_BNDNARROW:
36807 {
36808 rtx m1, m1h1, m1h2, lb, ub, t1;
36809
36810 /* Return value and lb. */
36811 arg0 = CALL_EXPR_ARG (exp, 0);
36812 /* Bounds. */
36813 arg1 = CALL_EXPR_ARG (exp, 1);
36814 /* Size. */
36815 arg2 = CALL_EXPR_ARG (exp, 2);
36816
36817 lb = expand_normal (arg0);
36818 op1 = expand_normal (arg1);
36819 op2 = expand_normal (arg2);
36820
36821 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36822 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36823 NULL_RTX, 1, OPTAB_DIRECT);
36824
36825 /* Add LB to the size and invert to get UB. */
36826 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36827 op2, 1, OPTAB_DIRECT);
36828 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36829
36830 if (!register_operand (lb, Pmode))
36831 lb = ix86_zero_extend_to_Pmode (lb);
36832 if (!register_operand (ub, Pmode))
36833 ub = ix86_zero_extend_to_Pmode (ub);
36834
36835 /* We need to move bounds to memory before any computations. */
36836 if (MEM_P (op1))
36837 m1 = op1;
36838 else
36839 {
36840 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36841 emit_move_insn (m1, op1);
36842 }
36843
36844 /* Generate mem expression to be used for access to LB and UB. */
36845 m1h1 = adjust_address (m1, Pmode, 0);
36846 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36847
36848 t1 = gen_reg_rtx (Pmode);
36849
36850 /* Compute LB. */
36851 emit_move_insn (t1, m1h1);
36852 ix86_emit_move_max (t1, lb);
36853 emit_move_insn (m1h1, t1);
36854
36855 /* Compute UB. Narrowing needs the smaller UB, but UB is stored in 1's
36856 complement form, so the max of the complemented values is used. */
36857 emit_move_insn (t1, m1h2);
36858 ix86_emit_move_max (t1, ub);
36859 emit_move_insn (m1h2, t1);
36860
36861 op2 = gen_reg_rtx (BNDmode);
36862 emit_move_insn (op2, m1);
36863
36864 return chkp_join_splitted_slot (lb, op2);
36865 }
36866
36867 case IX86_BUILTIN_BNDINT:
36868 {
36869 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36870
36871 if (!target
36872 || GET_MODE (target) != BNDmode
36873 || !register_operand (target, BNDmode))
36874 target = gen_reg_rtx (BNDmode);
36875
36876 arg0 = CALL_EXPR_ARG (exp, 0);
36877 arg1 = CALL_EXPR_ARG (exp, 1);
36878
36879 op0 = expand_normal (arg0);
36880 op1 = expand_normal (arg1);
36881
36882 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36883 rh1 = adjust_address (res, Pmode, 0);
36884 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36885
36886 /* Put first bounds to temporaries. */
36887 lb1 = gen_reg_rtx (Pmode);
36888 ub1 = gen_reg_rtx (Pmode);
36889 if (MEM_P (op0))
36890 {
36891 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36892 emit_move_insn (ub1, adjust_address (op0, Pmode,
36893 GET_MODE_SIZE (Pmode)));
36894 }
36895 else
36896 {
36897 emit_move_insn (res, op0);
36898 emit_move_insn (lb1, rh1);
36899 emit_move_insn (ub1, rh2);
36900 }
36901
36902 /* Put second bounds to temporaries. */
36903 lb2 = gen_reg_rtx (Pmode);
36904 ub2 = gen_reg_rtx (Pmode);
36905 if (MEM_P (op1))
36906 {
36907 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36908 emit_move_insn (ub2, adjust_address (op1, Pmode,
36909 GET_MODE_SIZE (Pmode)));
36910 }
36911 else
36912 {
36913 emit_move_insn (res, op1);
36914 emit_move_insn (lb2, rh1);
36915 emit_move_insn (ub2, rh2);
36916 }
36917
36918 /* Compute LB. */
36919 ix86_emit_move_max (lb1, lb2);
36920 emit_move_insn (rh1, lb1);
36921
36922 /* Compute UB. The intersection needs the smaller UB, but UB is stored
36923 in 1's complement form, so the max of the complemented values is used. */
36924 ix86_emit_move_max (ub1, ub2);
36925 emit_move_insn (rh2, ub1);
36926
36927 emit_move_insn (target, res);
36928
36929 return target;
36930 }
36931
36932 case IX86_BUILTIN_SIZEOF:
36933 {
36934 tree name;
36935 rtx symbol;
36936
36937 if (!target
36938 || GET_MODE (target) != Pmode
36939 || !register_operand (target, Pmode))
36940 target = gen_reg_rtx (Pmode);
36941
36942 arg0 = CALL_EXPR_ARG (exp, 0);
36943 gcc_assert (VAR_P (arg0));
36944
36945 name = DECL_ASSEMBLER_NAME (arg0);
36946 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36947
36948 emit_insn (Pmode == SImode
36949 ? gen_move_size_reloc_si (target, symbol)
36950 : gen_move_size_reloc_di (target, symbol));
36951
36952 return target;
36953 }
36954
36955 case IX86_BUILTIN_BNDLOWER:
36956 {
36957 rtx mem, hmem;
36958
36959 if (!target
36960 || GET_MODE (target) != Pmode
36961 || !register_operand (target, Pmode))
36962 target = gen_reg_rtx (Pmode);
36963
36964 arg0 = CALL_EXPR_ARG (exp, 0);
36965 op0 = expand_normal (arg0);
36966
36967 /* We need to move bounds to memory first. */
36968 if (MEM_P (op0))
36969 mem = op0;
36970 else
36971 {
36972 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36973 emit_move_insn (mem, op0);
36974 }
36975
36976 /* Generate mem expression to access LB and load it. */
36977 hmem = adjust_address (mem, Pmode, 0);
36978 emit_move_insn (target, hmem);
36979
36980 return target;
36981 }
36982
36983 case IX86_BUILTIN_BNDUPPER:
36984 {
36985 rtx mem, hmem, res;
36986
36987 if (!target
36988 || GET_MODE (target) != Pmode
36989 || !register_operand (target, Pmode))
36990 target = gen_reg_rtx (Pmode);
36991
36992 arg0 = CALL_EXPR_ARG (exp, 0);
36993 op0 = expand_normal (arg0);
36994
36995 /* We need to move bounds to memory first. */
36996 if (MEM_P (op0))
36997 mem = op0;
36998 else
36999 {
37000 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37001 emit_move_insn (mem, op0);
37002 }
37003
37004 /* Generate mem expression to access UB. */
37005 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37006
37007 /* We need to invert all bits of UB. */
37008 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37009
37010 if (res != target)
37011 emit_move_insn (target, res);
37012
37013 return target;
37014 }
37015
37016 case IX86_BUILTIN_MASKMOVQ:
37017 case IX86_BUILTIN_MASKMOVDQU:
37018 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37019 ? CODE_FOR_mmx_maskmovq
37020 : CODE_FOR_sse2_maskmovdqu);
37021 /* Note the arg order is different from the operand order. */
37022 arg1 = CALL_EXPR_ARG (exp, 0);
37023 arg2 = CALL_EXPR_ARG (exp, 1);
37024 arg0 = CALL_EXPR_ARG (exp, 2);
37025 op0 = expand_normal (arg0);
37026 op1 = expand_normal (arg1);
37027 op2 = expand_normal (arg2);
37028 mode0 = insn_data[icode].operand[0].mode;
37029 mode1 = insn_data[icode].operand[1].mode;
37030 mode2 = insn_data[icode].operand[2].mode;
37031
37032 op0 = ix86_zero_extend_to_Pmode (op0);
37033 op0 = gen_rtx_MEM (mode1, op0);
37034
37035 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37036 op0 = copy_to_mode_reg (mode0, op0);
37037 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37038 op1 = copy_to_mode_reg (mode1, op1);
37039 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37040 op2 = copy_to_mode_reg (mode2, op2);
37041 pat = GEN_FCN (icode) (op0, op1, op2);
37042 if (! pat)
37043 return 0;
37044 emit_insn (pat);
37045 return 0;
37046
37047 case IX86_BUILTIN_LDMXCSR:
37048 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37049 target = assign_386_stack_local (SImode, SLOT_TEMP);
37050 emit_move_insn (target, op0);
37051 emit_insn (gen_sse_ldmxcsr (target));
37052 return 0;
37053
37054 case IX86_BUILTIN_STMXCSR:
37055 target = assign_386_stack_local (SImode, SLOT_TEMP);
37056 emit_insn (gen_sse_stmxcsr (target));
37057 return copy_to_mode_reg (SImode, target);
37058
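/* These two cases back _mm_setcsr and _mm_getcsr from xmmintrin.h, e.g.

     unsigned int csr = _mm_getcsr ();
     _mm_setcsr (csr | 0x8040);

   (0x8040 sets the FTZ and DAZ bits; illustrative only.)  Both go through
   a stack slot because ldmxcsr/stmxcsr only take a memory operand.  */
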
37059 case IX86_BUILTIN_CLFLUSH:
37060 arg0 = CALL_EXPR_ARG (exp, 0);
37061 op0 = expand_normal (arg0);
37062 icode = CODE_FOR_sse2_clflush;
37063 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37064 op0 = ix86_zero_extend_to_Pmode (op0);
37065
37066 emit_insn (gen_sse2_clflush (op0));
37067 return 0;
37068
37069 case IX86_BUILTIN_CLWB:
37070 arg0 = CALL_EXPR_ARG (exp, 0);
37071 op0 = expand_normal (arg0);
37072 icode = CODE_FOR_clwb;
37073 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37074 op0 = ix86_zero_extend_to_Pmode (op0);
37075
37076 emit_insn (gen_clwb (op0));
37077 return 0;
37078
37079 case IX86_BUILTIN_CLFLUSHOPT:
37080 arg0 = CALL_EXPR_ARG (exp, 0);
37081 op0 = expand_normal (arg0);
37082 icode = CODE_FOR_clflushopt;
37083 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37084 op0 = ix86_zero_extend_to_Pmode (op0);
37085
37086 emit_insn (gen_clflushopt (op0));
37087 return 0;
37088
37089 case IX86_BUILTIN_MONITOR:
37090 case IX86_BUILTIN_MONITORX:
37091 arg0 = CALL_EXPR_ARG (exp, 0);
37092 arg1 = CALL_EXPR_ARG (exp, 1);
37093 arg2 = CALL_EXPR_ARG (exp, 2);
37094 op0 = expand_normal (arg0);
37095 op1 = expand_normal (arg1);
37096 op2 = expand_normal (arg2);
37097 if (!REG_P (op0))
37098 op0 = ix86_zero_extend_to_Pmode (op0);
37099 if (!REG_P (op1))
37100 op1 = copy_to_mode_reg (SImode, op1);
37101 if (!REG_P (op2))
37102 op2 = copy_to_mode_reg (SImode, op2);
37103
37104 emit_insn (fcode == IX86_BUILTIN_MONITOR
37105 ? ix86_gen_monitor (op0, op1, op2)
37106 : ix86_gen_monitorx (op0, op1, op2));
37107 return 0;
37108
37109 case IX86_BUILTIN_MWAIT:
37110 arg0 = CALL_EXPR_ARG (exp, 0);
37111 arg1 = CALL_EXPR_ARG (exp, 1);
37112 op0 = expand_normal (arg0);
37113 op1 = expand_normal (arg1);
37114 if (!REG_P (op0))
37115 op0 = copy_to_mode_reg (SImode, op0);
37116 if (!REG_P (op1))
37117 op1 = copy_to_mode_reg (SImode, op1);
37118 emit_insn (gen_sse3_mwait (op0, op1));
37119 return 0;
37120
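/* The corresponding SSE3 intrinsics from pmmintrin.h are, illustratively:

     _mm_monitor (addr, 0, 0);
     _mm_mwait (0, 0);

   The monitored address is zero-extended to Pmode above, while the
   extension and hint operands are simply forced into SImode registers.  */
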
37121 case IX86_BUILTIN_MWAITX:
37122 arg0 = CALL_EXPR_ARG (exp, 0);
37123 arg1 = CALL_EXPR_ARG (exp, 1);
37124 arg2 = CALL_EXPR_ARG (exp, 2);
37125 op0 = expand_normal (arg0);
37126 op1 = expand_normal (arg1);
37127 op2 = expand_normal (arg2);
37128 if (!REG_P (op0))
37129 op0 = copy_to_mode_reg (SImode, op0);
37130 if (!REG_P (op1))
37131 op1 = copy_to_mode_reg (SImode, op1);
37132 if (!REG_P (op2))
37133 op2 = copy_to_mode_reg (SImode, op2);
37134 emit_insn (gen_mwaitx (op0, op1, op2));
37135 return 0;
37136
37137 case IX86_BUILTIN_CLZERO:
37138 arg0 = CALL_EXPR_ARG (exp, 0);
37139 op0 = expand_normal (arg0);
37140 if (!REG_P (op0))
37141 op0 = ix86_zero_extend_to_Pmode (op0);
37142 emit_insn (ix86_gen_clzero (op0));
37143 return 0;
37144
37145 case IX86_BUILTIN_VEC_INIT_V2SI:
37146 case IX86_BUILTIN_VEC_INIT_V4HI:
37147 case IX86_BUILTIN_VEC_INIT_V8QI:
37148 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37149
37150 case IX86_BUILTIN_VEC_EXT_V2DF:
37151 case IX86_BUILTIN_VEC_EXT_V2DI:
37152 case IX86_BUILTIN_VEC_EXT_V4SF:
37153 case IX86_BUILTIN_VEC_EXT_V4SI:
37154 case IX86_BUILTIN_VEC_EXT_V8HI:
37155 case IX86_BUILTIN_VEC_EXT_V2SI:
37156 case IX86_BUILTIN_VEC_EXT_V4HI:
37157 case IX86_BUILTIN_VEC_EXT_V16QI:
37158 return ix86_expand_vec_ext_builtin (exp, target);
37159
37160 case IX86_BUILTIN_VEC_SET_V2DI:
37161 case IX86_BUILTIN_VEC_SET_V4SF:
37162 case IX86_BUILTIN_VEC_SET_V4SI:
37163 case IX86_BUILTIN_VEC_SET_V8HI:
37164 case IX86_BUILTIN_VEC_SET_V4HI:
37165 case IX86_BUILTIN_VEC_SET_V16QI:
37166 return ix86_expand_vec_set_builtin (exp);
37167
37168 case IX86_BUILTIN_NANQ:
37169 case IX86_BUILTIN_NANSQ:
37170 return expand_call (exp, target, ignore);
37171
37172 case IX86_BUILTIN_RDPMC:
37173 case IX86_BUILTIN_RDTSC:
37174 case IX86_BUILTIN_RDTSCP:
37175
37176 op0 = gen_reg_rtx (DImode);
37177 op1 = gen_reg_rtx (DImode);
37178
37179 if (fcode == IX86_BUILTIN_RDPMC)
37180 {
37181 arg0 = CALL_EXPR_ARG (exp, 0);
37182 op2 = expand_normal (arg0);
37183 if (!register_operand (op2, SImode))
37184 op2 = copy_to_mode_reg (SImode, op2);
37185
37186 insn = (TARGET_64BIT
37187 ? gen_rdpmc_rex64 (op0, op1, op2)
37188 : gen_rdpmc (op0, op2));
37189 emit_insn (insn);
37190 }
37191 else if (fcode == IX86_BUILTIN_RDTSC)
37192 {
37193 insn = (TARGET_64BIT
37194 ? gen_rdtsc_rex64 (op0, op1)
37195 : gen_rdtsc (op0));
37196 emit_insn (insn);
37197 }
37198 else
37199 {
37200 op2 = gen_reg_rtx (SImode);
37201
37202 insn = (TARGET_64BIT
37203 ? gen_rdtscp_rex64 (op0, op1, op2)
37204 : gen_rdtscp (op0, op2));
37205 emit_insn (insn);
37206
37207 arg0 = CALL_EXPR_ARG (exp, 0);
37208 op4 = expand_normal (arg0);
37209 if (!address_operand (op4, VOIDmode))
37210 {
37211 op4 = convert_memory_address (Pmode, op4);
37212 op4 = copy_addr_to_reg (op4);
37213 }
37214 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37215 }
37216
37217 if (target == 0)
37218 {
37219 /* mode is VOIDmode if __builtin_rd* has been called
37220 without lhs. */
37221 if (mode == VOIDmode)
37222 return target;
37223 target = gen_reg_rtx (mode);
37224 }
37225
37226 if (TARGET_64BIT)
37227 {
37228 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37229 op1, 1, OPTAB_DIRECT);
37230 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37231 op0, 1, OPTAB_DIRECT);
37232 }
37233
37234 emit_move_insn (target, op0);
37235 return target;
37236
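/* ia32intrin.h maps __rdtsc and __rdtscp onto this path; e.g.

     unsigned long long t0 = __rdtsc ();

   On 64-bit targets the counter is returned in EDX:EAX, which is why the
   high half is shifted left by 32 and IORed into the low half above.  */
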
37237 case IX86_BUILTIN_FXSAVE:
37238 case IX86_BUILTIN_FXRSTOR:
37239 case IX86_BUILTIN_FXSAVE64:
37240 case IX86_BUILTIN_FXRSTOR64:
37241 case IX86_BUILTIN_FNSTENV:
37242 case IX86_BUILTIN_FLDENV:
37243 mode0 = BLKmode;
37244 switch (fcode)
37245 {
37246 case IX86_BUILTIN_FXSAVE:
37247 icode = CODE_FOR_fxsave;
37248 break;
37249 case IX86_BUILTIN_FXRSTOR:
37250 icode = CODE_FOR_fxrstor;
37251 break;
37252 case IX86_BUILTIN_FXSAVE64:
37253 icode = CODE_FOR_fxsave64;
37254 break;
37255 case IX86_BUILTIN_FXRSTOR64:
37256 icode = CODE_FOR_fxrstor64;
37257 break;
37258 case IX86_BUILTIN_FNSTENV:
37259 icode = CODE_FOR_fnstenv;
37260 break;
37261 case IX86_BUILTIN_FLDENV:
37262 icode = CODE_FOR_fldenv;
37263 break;
37264 default:
37265 gcc_unreachable ();
37266 }
37267
37268 arg0 = CALL_EXPR_ARG (exp, 0);
37269 op0 = expand_normal (arg0);
37270
37271 if (!address_operand (op0, VOIDmode))
37272 {
37273 op0 = convert_memory_address (Pmode, op0);
37274 op0 = copy_addr_to_reg (op0);
37275 }
37276 op0 = gen_rtx_MEM (mode0, op0);
37277
37278 pat = GEN_FCN (icode) (op0);
37279 if (pat)
37280 emit_insn (pat);
37281 return 0;
37282
37283 case IX86_BUILTIN_XSAVE:
37284 case IX86_BUILTIN_XRSTOR:
37285 case IX86_BUILTIN_XSAVE64:
37286 case IX86_BUILTIN_XRSTOR64:
37287 case IX86_BUILTIN_XSAVEOPT:
37288 case IX86_BUILTIN_XSAVEOPT64:
37289 case IX86_BUILTIN_XSAVES:
37290 case IX86_BUILTIN_XRSTORS:
37291 case IX86_BUILTIN_XSAVES64:
37292 case IX86_BUILTIN_XRSTORS64:
37293 case IX86_BUILTIN_XSAVEC:
37294 case IX86_BUILTIN_XSAVEC64:
37295 arg0 = CALL_EXPR_ARG (exp, 0);
37296 arg1 = CALL_EXPR_ARG (exp, 1);
37297 op0 = expand_normal (arg0);
37298 op1 = expand_normal (arg1);
37299
37300 if (!address_operand (op0, VOIDmode))
37301 {
37302 op0 = convert_memory_address (Pmode, op0);
37303 op0 = copy_addr_to_reg (op0);
37304 }
37305 op0 = gen_rtx_MEM (BLKmode, op0);
37306
37307 op1 = force_reg (DImode, op1);
37308
37309 if (TARGET_64BIT)
37310 {
37311 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37312 NULL, 1, OPTAB_DIRECT);
37313 switch (fcode)
37314 {
37315 case IX86_BUILTIN_XSAVE:
37316 icode = CODE_FOR_xsave_rex64;
37317 break;
37318 case IX86_BUILTIN_XRSTOR:
37319 icode = CODE_FOR_xrstor_rex64;
37320 break;
37321 case IX86_BUILTIN_XSAVE64:
37322 icode = CODE_FOR_xsave64;
37323 break;
37324 case IX86_BUILTIN_XRSTOR64:
37325 icode = CODE_FOR_xrstor64;
37326 break;
37327 case IX86_BUILTIN_XSAVEOPT:
37328 icode = CODE_FOR_xsaveopt_rex64;
37329 break;
37330 case IX86_BUILTIN_XSAVEOPT64:
37331 icode = CODE_FOR_xsaveopt64;
37332 break;
37333 case IX86_BUILTIN_XSAVES:
37334 icode = CODE_FOR_xsaves_rex64;
37335 break;
37336 case IX86_BUILTIN_XRSTORS:
37337 icode = CODE_FOR_xrstors_rex64;
37338 break;
37339 case IX86_BUILTIN_XSAVES64:
37340 icode = CODE_FOR_xsaves64;
37341 break;
37342 case IX86_BUILTIN_XRSTORS64:
37343 icode = CODE_FOR_xrstors64;
37344 break;
37345 case IX86_BUILTIN_XSAVEC:
37346 icode = CODE_FOR_xsavec_rex64;
37347 break;
37348 case IX86_BUILTIN_XSAVEC64:
37349 icode = CODE_FOR_xsavec64;
37350 break;
37351 default:
37352 gcc_unreachable ();
37353 }
37354
37355 op2 = gen_lowpart (SImode, op2);
37356 op1 = gen_lowpart (SImode, op1);
37357 pat = GEN_FCN (icode) (op0, op1, op2);
37358 }
37359 else
37360 {
37361 switch (fcode)
37362 {
37363 case IX86_BUILTIN_XSAVE:
37364 icode = CODE_FOR_xsave;
37365 break;
37366 case IX86_BUILTIN_XRSTOR:
37367 icode = CODE_FOR_xrstor;
37368 break;
37369 case IX86_BUILTIN_XSAVEOPT:
37370 icode = CODE_FOR_xsaveopt;
37371 break;
37372 case IX86_BUILTIN_XSAVES:
37373 icode = CODE_FOR_xsaves;
37374 break;
37375 case IX86_BUILTIN_XRSTORS:
37376 icode = CODE_FOR_xrstors;
37377 break;
37378 case IX86_BUILTIN_XSAVEC:
37379 icode = CODE_FOR_xsavec;
37380 break;
37381 default:
37382 gcc_unreachable ();
37383 }
37384 pat = GEN_FCN (icode) (op0, op1);
37385 }
37386
37387 if (pat)
37388 emit_insn (pat);
37389 return 0;
37390
37391 case IX86_BUILTIN_LLWPCB:
37392 arg0 = CALL_EXPR_ARG (exp, 0);
37393 op0 = expand_normal (arg0);
37394 icode = CODE_FOR_lwp_llwpcb;
37395 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37396 op0 = ix86_zero_extend_to_Pmode (op0);
37397 emit_insn (gen_lwp_llwpcb (op0));
37398 return 0;
37399
37400 case IX86_BUILTIN_SLWPCB:
37401 icode = CODE_FOR_lwp_slwpcb;
37402 if (!target
37403 || !insn_data[icode].operand[0].predicate (target, Pmode))
37404 target = gen_reg_rtx (Pmode);
37405 emit_insn (gen_lwp_slwpcb (target));
37406 return target;
37407
37408 case IX86_BUILTIN_BEXTRI32:
37409 case IX86_BUILTIN_BEXTRI64:
37410 arg0 = CALL_EXPR_ARG (exp, 0);
37411 arg1 = CALL_EXPR_ARG (exp, 1);
37412 op0 = expand_normal (arg0);
37413 op1 = expand_normal (arg1);
37414 icode = (fcode == IX86_BUILTIN_BEXTRI32
37415 ? CODE_FOR_tbm_bextri_si
37416 : CODE_FOR_tbm_bextri_di);
37417 if (!CONST_INT_P (op1))
37418 {
37419 error ("last argument must be an immediate");
37420 return const0_rtx;
37421 }
37422 else
37423 {
37424 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37425 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37426 op1 = GEN_INT (length);
37427 op2 = GEN_INT (lsb_index);
37428 pat = GEN_FCN (icode) (target, op0, op1, op2);
37429 if (pat)
37430 emit_insn (pat);
37431 return target;
37432 }
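/* Illustrative example for the BEXTRI expansion above: the control
   immediate packs the starting bit position in bits [7:0] and the field
   length in bits [15:8], so a call such as

     unsigned int field = __builtin_ia32_bextri_u32 (x, (8 << 8) | 4);

   extracts 8 bits of X starting at bit 4; the code above splits that
   immediate into the separate length and lsb_index operands expected by
   the tbm_bextri patterns.  */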
37433
37434 case IX86_BUILTIN_RDRAND16_STEP:
37435 icode = CODE_FOR_rdrandhi_1;
37436 mode0 = HImode;
37437 goto rdrand_step;
37438
37439 case IX86_BUILTIN_RDRAND32_STEP:
37440 icode = CODE_FOR_rdrandsi_1;
37441 mode0 = SImode;
37442 goto rdrand_step;
37443
37444 case IX86_BUILTIN_RDRAND64_STEP:
37445 icode = CODE_FOR_rdranddi_1;
37446 mode0 = DImode;
37447
37448 rdrand_step:
37449 op0 = gen_reg_rtx (mode0);
37450 emit_insn (GEN_FCN (icode) (op0));
37451
37452 arg0 = CALL_EXPR_ARG (exp, 0);
37453 op1 = expand_normal (arg0);
37454 if (!address_operand (op1, VOIDmode))
37455 {
37456 op1 = convert_memory_address (Pmode, op1);
37457 op1 = copy_addr_to_reg (op1);
37458 }
37459 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37460
37461 op1 = gen_reg_rtx (SImode);
37462 emit_move_insn (op1, CONST1_RTX (SImode));
37463
37464 /* Emit SImode conditional move. */
37465 if (mode0 == HImode)
37466 {
37467 op2 = gen_reg_rtx (SImode);
37468 emit_insn (gen_zero_extendhisi2 (op2, op0));
37469 }
37470 else if (mode0 == SImode)
37471 op2 = op0;
37472 else
37473 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37474
37475 if (target == 0
37476 || !register_operand (target, SImode))
37477 target = gen_reg_rtx (SImode);
37478
37479 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37480 const0_rtx);
37481 emit_insn (gen_rtx_SET (target,
37482 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37483 return target;
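/* Illustrative sketch of what the rdrand_step expansion implements at the
   source level, e.g. for the 32-bit variant:

     unsigned int val;
     int ok = __builtin_ia32_rdrand32_step (&val);

   The hardware result is stored through the pointer argument, and the
   conditional move keyed on the carry flag yields 1 when the instruction
   succeeded and the hardware-zeroed result, i.e. 0, when it failed.  */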
37484
37485 case IX86_BUILTIN_RDSEED16_STEP:
37486 icode = CODE_FOR_rdseedhi_1;
37487 mode0 = HImode;
37488 goto rdseed_step;
37489
37490 case IX86_BUILTIN_RDSEED32_STEP:
37491 icode = CODE_FOR_rdseedsi_1;
37492 mode0 = SImode;
37493 goto rdseed_step;
37494
37495 case IX86_BUILTIN_RDSEED64_STEP:
37496 icode = CODE_FOR_rdseeddi_1;
37497 mode0 = DImode;
37498
37499 rdseed_step:
37500 op0 = gen_reg_rtx (mode0);
37501 emit_insn (GEN_FCN (icode) (op0));
37502
37503 arg0 = CALL_EXPR_ARG (exp, 0);
37504 op1 = expand_normal (arg0);
37505 if (!address_operand (op1, VOIDmode))
37506 {
37507 op1 = convert_memory_address (Pmode, op1);
37508 op1 = copy_addr_to_reg (op1);
37509 }
37510 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37511
37512 op2 = gen_reg_rtx (QImode);
37513
37514 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37515 const0_rtx);
37516 emit_insn (gen_rtx_SET (op2, pat));
37517
37518 if (target == 0
37519 || !register_operand (target, SImode))
37520 target = gen_reg_rtx (SImode);
37521
37522 emit_insn (gen_zero_extendqisi2 (target, op2));
37523 return target;
37524
37525 case IX86_BUILTIN_SBB32:
37526 icode = CODE_FOR_subborrowsi;
37527 mode0 = SImode;
37528 goto handlecarry;
37529
37530 case IX86_BUILTIN_SBB64:
37531 icode = CODE_FOR_subborrowdi;
37532 mode0 = DImode;
37533 goto handlecarry;
37534
37535 case IX86_BUILTIN_ADDCARRYX32:
37536 icode = CODE_FOR_addcarrysi;
37537 mode0 = SImode;
37538 goto handlecarry;
37539
37540 case IX86_BUILTIN_ADDCARRYX64:
37541 icode = CODE_FOR_addcarrydi;
37542 mode0 = DImode;
37543
37544 handlecarry:
37545 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37546 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37547 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37548 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37549
37550 op1 = expand_normal (arg0);
37551 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37552
37553 op2 = expand_normal (arg1);
37554 if (!register_operand (op2, mode0))
37555 op2 = copy_to_mode_reg (mode0, op2);
37556
37557 op3 = expand_normal (arg2);
37558 if (!register_operand (op3, mode0))
37559 op3 = copy_to_mode_reg (mode0, op3);
37560
37561 op4 = expand_normal (arg3);
37562 if (!address_operand (op4, VOIDmode))
37563 {
37564 op4 = convert_memory_address (Pmode, op4);
37565 op4 = copy_addr_to_reg (op4);
37566 }
37567
37568 /* Generate CF from input operand. */
37569 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37570
37571 /* Generate instruction that consumes CF. */
37572 op0 = gen_reg_rtx (mode0);
37573
37574 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37575 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37576 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37577
37578 /* Return current CF value. */
37579 if (target == 0)
37580 target = gen_reg_rtx (QImode);
37581
37582 PUT_MODE (pat, QImode);
37583 emit_insn (gen_rtx_SET (target, pat));
37584
37585 /* Store the result. */
37586 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37587
37588 return target;
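/* Illustrative sketch for the handlecarry expansion above: the incoming
   carry is materialized by adding -1 to the QImode c_in operand, which
   sets CF exactly when c_in is nonzero, and the adc/sbb-style pattern
   then consumes that CF.  At the source level, e.g.

     unsigned int sum;
     unsigned char c_out = __builtin_ia32_addcarryx_u32 (c_in, a, b, &sum);

   stores a + b + c_in through the pointer and returns the outgoing carry
   as the QImode TARGET.  */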
37589
37590 case IX86_BUILTIN_READ_FLAGS:
37591 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37592
37593 if (optimize
37594 || target == NULL_RTX
37595 || !nonimmediate_operand (target, word_mode)
37596 || GET_MODE (target) != word_mode)
37597 target = gen_reg_rtx (word_mode);
37598
37599 emit_insn (gen_pop (target));
37600 return target;
37601
37602 case IX86_BUILTIN_WRITE_FLAGS:
37603
37604 arg0 = CALL_EXPR_ARG (exp, 0);
37605 op0 = expand_normal (arg0);
37606 if (!general_no_elim_operand (op0, word_mode))
37607 op0 = copy_to_mode_reg (word_mode, op0);
37608
37609 emit_insn (gen_push (op0));
37610 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37611 return 0;
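/* Illustrative note: the READ_FLAGS and WRITE_FLAGS builtins
   (__builtin_ia32_readeflags_u32/_u64 and the corresponding writeeflags
   variants) need no dedicated patterns; they are expanded as a push/pop
   pair through the flags register, roughly

     pushf ; pop target      for READ_FLAGS
     push op0 ; popf         for WRITE_FLAGS  */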
37612
37613 case IX86_BUILTIN_KORTESTC16:
37614 icode = CODE_FOR_kortestchi;
37615 mode0 = HImode;
37616 mode1 = CCCmode;
37617 goto kortest;
37618
37619 case IX86_BUILTIN_KORTESTZ16:
37620 icode = CODE_FOR_kortestzhi;
37621 mode0 = HImode;
37622 mode1 = CCZmode;
37623
37624 kortest:
37625 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37626 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37627 op0 = expand_normal (arg0);
37628 op1 = expand_normal (arg1);
37629
37630 op0 = copy_to_reg (op0);
37631 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37632 op1 = copy_to_reg (op1);
37633 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37634
37635 target = gen_reg_rtx (QImode);
37636 emit_insn (gen_rtx_SET (target, const0_rtx));
37637
37638 /* Emit kortest. */
37639 emit_insn (GEN_FCN (icode) (op0, op1));
37640 /* And use setcc to return result from flags. */
37641 ix86_expand_setcc (target, EQ,
37642 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37643 return target;
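/* Illustrative note on the kortest expansion above: both mask operands are
   copied into HImode registers, the kortest pattern ORs them and sets the
   flags, and setcc then materializes the tested flag into the QImode
   TARGET: CCCmode for KORTESTC (carry set when the OR of the masks is all
   ones) and CCZmode for KORTESTZ (zero flag set when the OR is zero).
   The user-level _mm512_kortestc / _mm512_kortestz intrinsics are built
   on top of these two builtins.  */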
37644
37645 case IX86_BUILTIN_GATHERSIV2DF:
37646 icode = CODE_FOR_avx2_gathersiv2df;
37647 goto gather_gen;
37648 case IX86_BUILTIN_GATHERSIV4DF:
37649 icode = CODE_FOR_avx2_gathersiv4df;
37650 goto gather_gen;
37651 case IX86_BUILTIN_GATHERDIV2DF:
37652 icode = CODE_FOR_avx2_gatherdiv2df;
37653 goto gather_gen;
37654 case IX86_BUILTIN_GATHERDIV4DF:
37655 icode = CODE_FOR_avx2_gatherdiv4df;
37656 goto gather_gen;
37657 case IX86_BUILTIN_GATHERSIV4SF:
37658 icode = CODE_FOR_avx2_gathersiv4sf;
37659 goto gather_gen;
37660 case IX86_BUILTIN_GATHERSIV8SF:
37661 icode = CODE_FOR_avx2_gathersiv8sf;
37662 goto gather_gen;
37663 case IX86_BUILTIN_GATHERDIV4SF:
37664 icode = CODE_FOR_avx2_gatherdiv4sf;
37665 goto gather_gen;
37666 case IX86_BUILTIN_GATHERDIV8SF:
37667 icode = CODE_FOR_avx2_gatherdiv8sf;
37668 goto gather_gen;
37669 case IX86_BUILTIN_GATHERSIV2DI:
37670 icode = CODE_FOR_avx2_gathersiv2di;
37671 goto gather_gen;
37672 case IX86_BUILTIN_GATHERSIV4DI:
37673 icode = CODE_FOR_avx2_gathersiv4di;
37674 goto gather_gen;
37675 case IX86_BUILTIN_GATHERDIV2DI:
37676 icode = CODE_FOR_avx2_gatherdiv2di;
37677 goto gather_gen;
37678 case IX86_BUILTIN_GATHERDIV4DI:
37679 icode = CODE_FOR_avx2_gatherdiv4di;
37680 goto gather_gen;
37681 case IX86_BUILTIN_GATHERSIV4SI:
37682 icode = CODE_FOR_avx2_gathersiv4si;
37683 goto gather_gen;
37684 case IX86_BUILTIN_GATHERSIV8SI:
37685 icode = CODE_FOR_avx2_gathersiv8si;
37686 goto gather_gen;
37687 case IX86_BUILTIN_GATHERDIV4SI:
37688 icode = CODE_FOR_avx2_gatherdiv4si;
37689 goto gather_gen;
37690 case IX86_BUILTIN_GATHERDIV8SI:
37691 icode = CODE_FOR_avx2_gatherdiv8si;
37692 goto gather_gen;
37693 case IX86_BUILTIN_GATHERALTSIV4DF:
37694 icode = CODE_FOR_avx2_gathersiv4df;
37695 goto gather_gen;
37696 case IX86_BUILTIN_GATHERALTDIV8SF:
37697 icode = CODE_FOR_avx2_gatherdiv8sf;
37698 goto gather_gen;
37699 case IX86_BUILTIN_GATHERALTSIV4DI:
37700 icode = CODE_FOR_avx2_gathersiv4di;
37701 goto gather_gen;
37702 case IX86_BUILTIN_GATHERALTDIV8SI:
37703 icode = CODE_FOR_avx2_gatherdiv8si;
37704 goto gather_gen;
37705 case IX86_BUILTIN_GATHER3SIV16SF:
37706 icode = CODE_FOR_avx512f_gathersiv16sf;
37707 goto gather_gen;
37708 case IX86_BUILTIN_GATHER3SIV8DF:
37709 icode = CODE_FOR_avx512f_gathersiv8df;
37710 goto gather_gen;
37711 case IX86_BUILTIN_GATHER3DIV16SF:
37712 icode = CODE_FOR_avx512f_gatherdiv16sf;
37713 goto gather_gen;
37714 case IX86_BUILTIN_GATHER3DIV8DF:
37715 icode = CODE_FOR_avx512f_gatherdiv8df;
37716 goto gather_gen;
37717 case IX86_BUILTIN_GATHER3SIV16SI:
37718 icode = CODE_FOR_avx512f_gathersiv16si;
37719 goto gather_gen;
37720 case IX86_BUILTIN_GATHER3SIV8DI:
37721 icode = CODE_FOR_avx512f_gathersiv8di;
37722 goto gather_gen;
37723 case IX86_BUILTIN_GATHER3DIV16SI:
37724 icode = CODE_FOR_avx512f_gatherdiv16si;
37725 goto gather_gen;
37726 case IX86_BUILTIN_GATHER3DIV8DI:
37727 icode = CODE_FOR_avx512f_gatherdiv8di;
37728 goto gather_gen;
37729 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37730 icode = CODE_FOR_avx512f_gathersiv8df;
37731 goto gather_gen;
37732 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37733 icode = CODE_FOR_avx512f_gatherdiv16sf;
37734 goto gather_gen;
37735 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37736 icode = CODE_FOR_avx512f_gathersiv8di;
37737 goto gather_gen;
37738 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37739 icode = CODE_FOR_avx512f_gatherdiv16si;
37740 goto gather_gen;
37741 case IX86_BUILTIN_GATHER3SIV2DF:
37742 icode = CODE_FOR_avx512vl_gathersiv2df;
37743 goto gather_gen;
37744 case IX86_BUILTIN_GATHER3SIV4DF:
37745 icode = CODE_FOR_avx512vl_gathersiv4df;
37746 goto gather_gen;
37747 case IX86_BUILTIN_GATHER3DIV2DF:
37748 icode = CODE_FOR_avx512vl_gatherdiv2df;
37749 goto gather_gen;
37750 case IX86_BUILTIN_GATHER3DIV4DF:
37751 icode = CODE_FOR_avx512vl_gatherdiv4df;
37752 goto gather_gen;
37753 case IX86_BUILTIN_GATHER3SIV4SF:
37754 icode = CODE_FOR_avx512vl_gathersiv4sf;
37755 goto gather_gen;
37756 case IX86_BUILTIN_GATHER3SIV8SF:
37757 icode = CODE_FOR_avx512vl_gathersiv8sf;
37758 goto gather_gen;
37759 case IX86_BUILTIN_GATHER3DIV4SF:
37760 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37761 goto gather_gen;
37762 case IX86_BUILTIN_GATHER3DIV8SF:
37763 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37764 goto gather_gen;
37765 case IX86_BUILTIN_GATHER3SIV2DI:
37766 icode = CODE_FOR_avx512vl_gathersiv2di;
37767 goto gather_gen;
37768 case IX86_BUILTIN_GATHER3SIV4DI:
37769 icode = CODE_FOR_avx512vl_gathersiv4di;
37770 goto gather_gen;
37771 case IX86_BUILTIN_GATHER3DIV2DI:
37772 icode = CODE_FOR_avx512vl_gatherdiv2di;
37773 goto gather_gen;
37774 case IX86_BUILTIN_GATHER3DIV4DI:
37775 icode = CODE_FOR_avx512vl_gatherdiv4di;
37776 goto gather_gen;
37777 case IX86_BUILTIN_GATHER3SIV4SI:
37778 icode = CODE_FOR_avx512vl_gathersiv4si;
37779 goto gather_gen;
37780 case IX86_BUILTIN_GATHER3SIV8SI:
37781 icode = CODE_FOR_avx512vl_gathersiv8si;
37782 goto gather_gen;
37783 case IX86_BUILTIN_GATHER3DIV4SI:
37784 icode = CODE_FOR_avx512vl_gatherdiv4si;
37785 goto gather_gen;
37786 case IX86_BUILTIN_GATHER3DIV8SI:
37787 icode = CODE_FOR_avx512vl_gatherdiv8si;
37788 goto gather_gen;
37789 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37790 icode = CODE_FOR_avx512vl_gathersiv4df;
37791 goto gather_gen;
37792 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37793 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37794 goto gather_gen;
37795 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37796 icode = CODE_FOR_avx512vl_gathersiv4di;
37797 goto gather_gen;
37798 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37799 icode = CODE_FOR_avx512vl_gatherdiv8si;
37800 goto gather_gen;
37801 case IX86_BUILTIN_SCATTERSIV16SF:
37802 icode = CODE_FOR_avx512f_scattersiv16sf;
37803 goto scatter_gen;
37804 case IX86_BUILTIN_SCATTERSIV8DF:
37805 icode = CODE_FOR_avx512f_scattersiv8df;
37806 goto scatter_gen;
37807 case IX86_BUILTIN_SCATTERDIV16SF:
37808 icode = CODE_FOR_avx512f_scatterdiv16sf;
37809 goto scatter_gen;
37810 case IX86_BUILTIN_SCATTERDIV8DF:
37811 icode = CODE_FOR_avx512f_scatterdiv8df;
37812 goto scatter_gen;
37813 case IX86_BUILTIN_SCATTERSIV16SI:
37814 icode = CODE_FOR_avx512f_scattersiv16si;
37815 goto scatter_gen;
37816 case IX86_BUILTIN_SCATTERSIV8DI:
37817 icode = CODE_FOR_avx512f_scattersiv8di;
37818 goto scatter_gen;
37819 case IX86_BUILTIN_SCATTERDIV16SI:
37820 icode = CODE_FOR_avx512f_scatterdiv16si;
37821 goto scatter_gen;
37822 case IX86_BUILTIN_SCATTERDIV8DI:
37823 icode = CODE_FOR_avx512f_scatterdiv8di;
37824 goto scatter_gen;
37825 case IX86_BUILTIN_SCATTERSIV8SF:
37826 icode = CODE_FOR_avx512vl_scattersiv8sf;
37827 goto scatter_gen;
37828 case IX86_BUILTIN_SCATTERSIV4SF:
37829 icode = CODE_FOR_avx512vl_scattersiv4sf;
37830 goto scatter_gen;
37831 case IX86_BUILTIN_SCATTERSIV4DF:
37832 icode = CODE_FOR_avx512vl_scattersiv4df;
37833 goto scatter_gen;
37834 case IX86_BUILTIN_SCATTERSIV2DF:
37835 icode = CODE_FOR_avx512vl_scattersiv2df;
37836 goto scatter_gen;
37837 case IX86_BUILTIN_SCATTERDIV8SF:
37838 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37839 goto scatter_gen;
37840 case IX86_BUILTIN_SCATTERDIV4SF:
37841 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37842 goto scatter_gen;
37843 case IX86_BUILTIN_SCATTERDIV4DF:
37844 icode = CODE_FOR_avx512vl_scatterdiv4df;
37845 goto scatter_gen;
37846 case IX86_BUILTIN_SCATTERDIV2DF:
37847 icode = CODE_FOR_avx512vl_scatterdiv2df;
37848 goto scatter_gen;
37849 case IX86_BUILTIN_SCATTERSIV8SI:
37850 icode = CODE_FOR_avx512vl_scattersiv8si;
37851 goto scatter_gen;
37852 case IX86_BUILTIN_SCATTERSIV4SI:
37853 icode = CODE_FOR_avx512vl_scattersiv4si;
37854 goto scatter_gen;
37855 case IX86_BUILTIN_SCATTERSIV4DI:
37856 icode = CODE_FOR_avx512vl_scattersiv4di;
37857 goto scatter_gen;
37858 case IX86_BUILTIN_SCATTERSIV2DI:
37859 icode = CODE_FOR_avx512vl_scattersiv2di;
37860 goto scatter_gen;
37861 case IX86_BUILTIN_SCATTERDIV8SI:
37862 icode = CODE_FOR_avx512vl_scatterdiv8si;
37863 goto scatter_gen;
37864 case IX86_BUILTIN_SCATTERDIV4SI:
37865 icode = CODE_FOR_avx512vl_scatterdiv4si;
37866 goto scatter_gen;
37867 case IX86_BUILTIN_SCATTERDIV4DI:
37868 icode = CODE_FOR_avx512vl_scatterdiv4di;
37869 goto scatter_gen;
37870 case IX86_BUILTIN_SCATTERDIV2DI:
37871 icode = CODE_FOR_avx512vl_scatterdiv2di;
37872 goto scatter_gen;
37873 case IX86_BUILTIN_GATHERPFDPD:
37874 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37875 goto vec_prefetch_gen;
37876 case IX86_BUILTIN_SCATTERALTSIV8DF:
37877 icode = CODE_FOR_avx512f_scattersiv8df;
37878 goto scatter_gen;
37879 case IX86_BUILTIN_SCATTERALTDIV16SF:
37880 icode = CODE_FOR_avx512f_scatterdiv16sf;
37881 goto scatter_gen;
37882 case IX86_BUILTIN_SCATTERALTSIV8DI:
37883 icode = CODE_FOR_avx512f_scattersiv8di;
37884 goto scatter_gen;
37885 case IX86_BUILTIN_SCATTERALTDIV16SI:
37886 icode = CODE_FOR_avx512f_scatterdiv16si;
37887 goto scatter_gen;
37888 case IX86_BUILTIN_GATHERPFDPS:
37889 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37890 goto vec_prefetch_gen;
37891 case IX86_BUILTIN_GATHERPFQPD:
37892 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37893 goto vec_prefetch_gen;
37894 case IX86_BUILTIN_GATHERPFQPS:
37895 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37896 goto vec_prefetch_gen;
37897 case IX86_BUILTIN_SCATTERPFDPD:
37898 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37899 goto vec_prefetch_gen;
37900 case IX86_BUILTIN_SCATTERPFDPS:
37901 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37902 goto vec_prefetch_gen;
37903 case IX86_BUILTIN_SCATTERPFQPD:
37904 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37905 goto vec_prefetch_gen;
37906 case IX86_BUILTIN_SCATTERPFQPS:
37907 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37908 goto vec_prefetch_gen;
37909
37910 gather_gen:
37911 rtx half;
37912 rtx (*gen) (rtx, rtx);
37913
37914 arg0 = CALL_EXPR_ARG (exp, 0);
37915 arg1 = CALL_EXPR_ARG (exp, 1);
37916 arg2 = CALL_EXPR_ARG (exp, 2);
37917 arg3 = CALL_EXPR_ARG (exp, 3);
37918 arg4 = CALL_EXPR_ARG (exp, 4);
37919 op0 = expand_normal (arg0);
37920 op1 = expand_normal (arg1);
37921 op2 = expand_normal (arg2);
37922 op3 = expand_normal (arg3);
37923 op4 = expand_normal (arg4);
37924 /* Note the arg order is different from the operand order. */
37925 mode0 = insn_data[icode].operand[1].mode;
37926 mode2 = insn_data[icode].operand[3].mode;
37927 mode3 = insn_data[icode].operand[4].mode;
37928 mode4 = insn_data[icode].operand[5].mode;
37929
37930 if (target == NULL_RTX
37931 || GET_MODE (target) != insn_data[icode].operand[0].mode
37932 || !insn_data[icode].operand[0].predicate (target,
37933 GET_MODE (target)))
37934 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37935 else
37936 subtarget = target;
37937
37938 switch (fcode)
37939 {
37940 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37941 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37942 half = gen_reg_rtx (V8SImode);
37943 if (!nonimmediate_operand (op2, V16SImode))
37944 op2 = copy_to_mode_reg (V16SImode, op2);
37945 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37946 op2 = half;
37947 break;
37948 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37949 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37950 case IX86_BUILTIN_GATHERALTSIV4DF:
37951 case IX86_BUILTIN_GATHERALTSIV4DI:
37952 half = gen_reg_rtx (V4SImode);
37953 if (!nonimmediate_operand (op2, V8SImode))
37954 op2 = copy_to_mode_reg (V8SImode, op2);
37955 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37956 op2 = half;
37957 break;
37958 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37959 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37960 half = gen_reg_rtx (mode0);
37961 if (mode0 == V8SFmode)
37962 gen = gen_vec_extract_lo_v16sf;
37963 else
37964 gen = gen_vec_extract_lo_v16si;
37965 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37966 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37967 emit_insn (gen (half, op0));
37968 op0 = half;
37969 if (GET_MODE (op3) != VOIDmode)
37970 {
37971 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37972 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37973 emit_insn (gen (half, op3));
37974 op3 = half;
37975 }
37976 break;
37977 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37978 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37979 case IX86_BUILTIN_GATHERALTDIV8SF:
37980 case IX86_BUILTIN_GATHERALTDIV8SI:
37981 half = gen_reg_rtx (mode0);
37982 if (mode0 == V4SFmode)
37983 gen = gen_vec_extract_lo_v8sf;
37984 else
37985 gen = gen_vec_extract_lo_v8si;
37986 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37987 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37988 emit_insn (gen (half, op0));
37989 op0 = half;
37990 if (GET_MODE (op3) != VOIDmode)
37991 {
37992 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37993 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37994 emit_insn (gen (half, op3));
37995 op3 = half;
37996 }
37997 break;
37998 default:
37999 break;
38000 }
38001
38002 /* Force the memory operand to be addressed through a base register
38003 only here; we don't want to do that for the memory operands of
38004 other builtin functions. */
38005 op1 = ix86_zero_extend_to_Pmode (op1);
38006
38007 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38008 op0 = copy_to_mode_reg (mode0, op0);
38009 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38010 op1 = copy_to_mode_reg (Pmode, op1);
38011 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38012 op2 = copy_to_mode_reg (mode2, op2);
38013
38014 op3 = fixup_modeless_constant (op3, mode3);
38015
38016 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38017 {
38018 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38019 op3 = copy_to_mode_reg (mode3, op3);
38020 }
38021 else
38022 {
38023 op3 = copy_to_reg (op3);
38024 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38025 }
38026 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38027 {
38028 error ("the last argument must be scale 1, 2, 4, 8");
38029 return const0_rtx;
38030 }
38031
38032 /* Optimize. If mask is known to have all high bits set,
38033 replace op0 with pc_rtx to signal that the instruction
38034 overwrites the whole destination and doesn't use its
38035 previous contents. */
38036 if (optimize)
38037 {
38038 if (TREE_CODE (arg3) == INTEGER_CST)
38039 {
38040 if (integer_all_onesp (arg3))
38041 op0 = pc_rtx;
38042 }
38043 else if (TREE_CODE (arg3) == VECTOR_CST)
38044 {
38045 unsigned int negative = 0;
38046 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38047 {
38048 tree cst = VECTOR_CST_ELT (arg3, i);
38049 if (TREE_CODE (cst) == INTEGER_CST
38050 && tree_int_cst_sign_bit (cst))
38051 negative++;
38052 else if (TREE_CODE (cst) == REAL_CST
38053 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38054 negative++;
38055 }
38056 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38057 op0 = pc_rtx;
38058 }
38059 else if (TREE_CODE (arg3) == SSA_NAME
38060 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38061 {
38062 /* Recognize also when mask is like:
38063 __v2df src = _mm_setzero_pd ();
38064 __v2df mask = _mm_cmpeq_pd (src, src);
38065 or
38066 __v8sf src = _mm256_setzero_ps ();
38067 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38068 as that is a cheaper way to load all ones into
38069 a register than having to load a constant from
38070 memory. */
38071 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38072 if (is_gimple_call (def_stmt))
38073 {
38074 tree fndecl = gimple_call_fndecl (def_stmt);
38075 if (fndecl
38076 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38077 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38078 {
38079 case IX86_BUILTIN_CMPPD:
38080 case IX86_BUILTIN_CMPPS:
38081 case IX86_BUILTIN_CMPPD256:
38082 case IX86_BUILTIN_CMPPS256:
38083 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38084 break;
38085 /* FALLTHRU */
38086 case IX86_BUILTIN_CMPEQPD:
38087 case IX86_BUILTIN_CMPEQPS:
38088 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38089 && initializer_zerop (gimple_call_arg (def_stmt,
38090 1)))
38091 op0 = pc_rtx;
38092 break;
38093 default:
38094 break;
38095 }
38096 }
38097 }
38098 }
38099
38100 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38101 if (! pat)
38102 return const0_rtx;
38103 emit_insn (pat);
38104
38105 switch (fcode)
38106 {
38107 case IX86_BUILTIN_GATHER3DIV16SF:
38108 if (target == NULL_RTX)
38109 target = gen_reg_rtx (V8SFmode);
38110 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38111 break;
38112 case IX86_BUILTIN_GATHER3DIV16SI:
38113 if (target == NULL_RTX)
38114 target = gen_reg_rtx (V8SImode);
38115 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38116 break;
38117 case IX86_BUILTIN_GATHER3DIV8SF:
38118 case IX86_BUILTIN_GATHERDIV8SF:
38119 if (target == NULL_RTX)
38120 target = gen_reg_rtx (V4SFmode);
38121 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38122 break;
38123 case IX86_BUILTIN_GATHER3DIV8SI:
38124 case IX86_BUILTIN_GATHERDIV8SI:
38125 if (target == NULL_RTX)
38126 target = gen_reg_rtx (V4SImode);
38127 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38128 break;
38129 default:
38130 target = subtarget;
38131 break;
38132 }
38133 return target;
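/* Illustrative note on the ALT gather variants handled above: they pair an
   index vector and a data vector with different element counts, so only
   the low half of the wider operand participates.  E.g. for
   IX86_BUILTIN_GATHERALTSIV4DF the V8SI index operand is reduced to its
   low V4SI half before the V4DF gather is emitted, while for the
   GATHER3DIV16SF/GATHER3DIV8SF style cases the full-width gather result
   in SUBTARGET is narrowed afterwards by extracting its low half into
   TARGET.  */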
38134
38135 scatter_gen:
38136 arg0 = CALL_EXPR_ARG (exp, 0);
38137 arg1 = CALL_EXPR_ARG (exp, 1);
38138 arg2 = CALL_EXPR_ARG (exp, 2);
38139 arg3 = CALL_EXPR_ARG (exp, 3);
38140 arg4 = CALL_EXPR_ARG (exp, 4);
38141 op0 = expand_normal (arg0);
38142 op1 = expand_normal (arg1);
38143 op2 = expand_normal (arg2);
38144 op3 = expand_normal (arg3);
38145 op4 = expand_normal (arg4);
38146 mode1 = insn_data[icode].operand[1].mode;
38147 mode2 = insn_data[icode].operand[2].mode;
38148 mode3 = insn_data[icode].operand[3].mode;
38149 mode4 = insn_data[icode].operand[4].mode;
38150
38151 /* The scatter instruction stores operand op3 to memory with
38152 indices from op2 and scale from op4 under writemask op1.
38153 If index operand op2 has more elements than source operand
38154 op3, only its low half is used, and vice versa. */
38155 switch (fcode)
38156 {
38157 case IX86_BUILTIN_SCATTERALTSIV8DF:
38158 case IX86_BUILTIN_SCATTERALTSIV8DI:
38159 half = gen_reg_rtx (V8SImode);
38160 if (!nonimmediate_operand (op2, V16SImode))
38161 op2 = copy_to_mode_reg (V16SImode, op2);
38162 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38163 op2 = half;
38164 break;
38165 case IX86_BUILTIN_SCATTERALTDIV16SF:
38166 case IX86_BUILTIN_SCATTERALTDIV16SI:
38167 half = gen_reg_rtx (mode3);
38168 if (mode3 == V8SFmode)
38169 gen = gen_vec_extract_lo_v16sf;
38170 else
38171 gen = gen_vec_extract_lo_v16si;
38172 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38173 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38174 emit_insn (gen (half, op3));
38175 op3 = half;
38176 break;
38177 default:
38178 break;
38179 }
38180
38181 /* Force the memory operand to be addressed through a base register
38182 only here; we don't want to do that for the memory operands of
38183 other builtin functions. */
38184 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38185
38186 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38187 op0 = copy_to_mode_reg (Pmode, op0);
38188
38189 op1 = fixup_modeless_constant (op1, mode1);
38190
38191 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38192 {
38193 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38194 op1 = copy_to_mode_reg (mode1, op1);
38195 }
38196 else
38197 {
38198 op1 = copy_to_reg (op1);
38199 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38200 }
38201
38202 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38203 op2 = copy_to_mode_reg (mode2, op2);
38204
38205 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38206 op3 = copy_to_mode_reg (mode3, op3);
38207
38208 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38209 {
38210 error ("the last argument must be scale 1, 2, 4, 8");
38211 return const0_rtx;
38212 }
38213
38214 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38215 if (! pat)
38216 return const0_rtx;
38217
38218 emit_insn (pat);
38219 return 0;
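/* Illustrative note: the scatter expansion mirrors the gather one; the five
   call arguments supply the base pointer, the writemask, the index vector,
   the source vector and the scale, in that order, and since a scatter has
   no destination register the expansion simply returns 0.  */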
38220
38221 vec_prefetch_gen:
38222 arg0 = CALL_EXPR_ARG (exp, 0);
38223 arg1 = CALL_EXPR_ARG (exp, 1);
38224 arg2 = CALL_EXPR_ARG (exp, 2);
38225 arg3 = CALL_EXPR_ARG (exp, 3);
38226 arg4 = CALL_EXPR_ARG (exp, 4);
38227 op0 = expand_normal (arg0);
38228 op1 = expand_normal (arg1);
38229 op2 = expand_normal (arg2);
38230 op3 = expand_normal (arg3);
38231 op4 = expand_normal (arg4);
38232 mode0 = insn_data[icode].operand[0].mode;
38233 mode1 = insn_data[icode].operand[1].mode;
38234 mode3 = insn_data[icode].operand[3].mode;
38235 mode4 = insn_data[icode].operand[4].mode;
38236
38237 op0 = fixup_modeless_constant (op0, mode0);
38238
38239 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38240 {
38241 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38242 op0 = copy_to_mode_reg (mode0, op0);
38243 }
38244 else
38245 {
38246 op0 = copy_to_reg (op0);
38247 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38248 }
38249
38250 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38251 op1 = copy_to_mode_reg (mode1, op1);
38252
38253 /* Force the memory operand to be addressed through a base register
38254 only here; we don't want to do that for the memory operands of
38255 other builtin functions. */
38256 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38257
38258 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38259 op2 = copy_to_mode_reg (Pmode, op2);
38260
38261 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38262 {
38263 error ("the fourth argument must be scale 1, 2, 4, 8");
38264 return const0_rtx;
38265 }
38266
38267 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38268 {
38269 error ("incorrect hint operand");
38270 return const0_rtx;
38271 }
38272
38273 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38274 if (! pat)
38275 return const0_rtx;
38276
38277 emit_insn (pat);
38278
38279 return 0;
38280
38281 case IX86_BUILTIN_XABORT:
38282 icode = CODE_FOR_xabort;
38283 arg0 = CALL_EXPR_ARG (exp, 0);
38284 op0 = expand_normal (arg0);
38285 mode0 = insn_data[icode].operand[0].mode;
38286 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38287 {
38288 error ("the xabort's argument must be an 8-bit immediate");
38289 return const0_rtx;
38290 }
38291 emit_insn (gen_xabort (op0));
38292 return 0;
38293
38294 default:
38295 break;
38296 }
38297
38298 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38299 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38300 {
38301 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38302 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38303 target);
38304 }
38305
38306 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38307 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38308 {
38309 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38310 switch (fcode)
38311 {
38312 case IX86_BUILTIN_FABSQ:
38313 case IX86_BUILTIN_COPYSIGNQ:
38314 if (!TARGET_SSE)
38315 /* Emit a normal call if SSE isn't available. */
38316 return expand_call (exp, target, ignore);
38317 /* FALLTHRU */
38318 default:
38319 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38320 }
38321 }
38322
38323 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38324 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38325 {
38326 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38327 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38328 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38329 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38330 int masked = 1;
38331 machine_mode mode, wide_mode, nar_mode;
38332
38333 nar_mode = V4SFmode;
38334 mode = V16SFmode;
38335 wide_mode = V64SFmode;
38336 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38337 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38338
38339 switch (fcode)
38340 {
38341 case IX86_BUILTIN_4FMAPS:
38342 fcn = gen_avx5124fmaddps_4fmaddps;
38343 masked = 0;
38344 goto v4fma_expand;
38345
38346 case IX86_BUILTIN_4DPWSSD:
38347 nar_mode = V4SImode;
38348 mode = V16SImode;
38349 wide_mode = V64SImode;
38350 fcn = gen_avx5124vnniw_vp4dpwssd;
38351 masked = 0;
38352 goto v4fma_expand;
38353
38354 case IX86_BUILTIN_4DPWSSDS:
38355 nar_mode = V4SImode;
38356 mode = V16SImode;
38357 wide_mode = V64SImode;
38358 fcn = gen_avx5124vnniw_vp4dpwssds;
38359 masked = 0;
38360 goto v4fma_expand;
38361
38362 case IX86_BUILTIN_4FNMAPS:
38363 fcn = gen_avx5124fmaddps_4fnmaddps;
38364 masked = 0;
38365 goto v4fma_expand;
38366
38367 case IX86_BUILTIN_4FNMAPS_MASK:
38368 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38369 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38370 goto v4fma_expand;
38371
38372 case IX86_BUILTIN_4DPWSSD_MASK:
38373 nar_mode = V4SImode;
38374 mode = V16SImode;
38375 wide_mode = V64SImode;
38376 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38377 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38378 goto v4fma_expand;
38379
38380 case IX86_BUILTIN_4DPWSSDS_MASK:
38381 nar_mode = V4SImode;
38382 mode = V16SImode;
38383 wide_mode = V64SImode;
38384 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38385 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38386 goto v4fma_expand;
38387
38388 case IX86_BUILTIN_4FMAPS_MASK:
38389 {
38390 tree args[4];
38391 rtx ops[4];
38392 rtx wide_reg;
38393 rtx accum;
38394 rtx addr;
38395 rtx mem;
38396
38397 v4fma_expand:
38398 wide_reg = gen_reg_rtx (wide_mode);
38399 for (i = 0; i < 4; i++)
38400 {
38401 args[i] = CALL_EXPR_ARG (exp, i);
38402 ops[i] = expand_normal (args[i]);
38403
38404 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38405 ops[i]);
38406 }
38407
38408 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38409 accum = force_reg (mode, accum);
38410
38411 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38412 addr = force_reg (Pmode, addr);
38413
38414 mem = gen_rtx_MEM (nar_mode, addr);
38415
38416 target = gen_reg_rtx (mode);
38417
38418 emit_move_insn (target, accum);
38419
38420 if (! masked)
38421 emit_insn (fcn (target, accum, wide_reg, mem));
38422 else
38423 {
38424 rtx merge, mask;
38425 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38426
38427 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38428
38429 if (CONST_INT_P (mask))
38430 mask = fixup_modeless_constant (mask, HImode);
38431
38432 mask = force_reg (HImode, mask);
38433
38434 if (GET_MODE (mask) != HImode)
38435 mask = gen_rtx_SUBREG (HImode, mask, 0);
38436
38437 /* If merge is 0 then we're about to emit z-masked variant. */
38438 if (const0_operand (merge, mode))
38439 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38440 /* If merge is the same as accum then emit merge-masked variant. */
38441 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38442 {
38443 merge = force_reg (mode, merge);
38444 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38445 }
38446 /* Merge with something unknown might happen if we z-mask w/ -O0. */
38447 else
38448 {
38449 target = gen_reg_rtx (mode);
38450 emit_move_insn (target, merge);
38451 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38452 }
38453 }
38454 return target;
38455 }
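/* Illustrative note on the v4fma_expand path above: the four narrow source
   operands are packed into a single wide (V64SF or V64SI) pseudo via
   subregs at byte offsets 0, 64, 128 and 192, because the 4fmaddps /
   vp4dpwssd patterns consume a whole register group.  For the masked
   variants, a literal zero merge operand selects the z-masked pattern, a
   merge operand identical to the accumulator selects the merge-masked
   pattern, and any other merge value is first copied into a fresh target
   register.  */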
38456
38457 case IX86_BUILTIN_4FNMASS:
38458 fcn = gen_avx5124fmaddps_4fnmaddss;
38459 masked = 0;
38460 goto s4fma_expand;
38461
38462 case IX86_BUILTIN_4FMASS:
38463 fcn = gen_avx5124fmaddps_4fmaddss;
38464 masked = 0;
38465 goto s4fma_expand;
38466
38467 case IX86_BUILTIN_4FNMASS_MASK:
38468 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38469 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38470 goto s4fma_expand;
38471
38472 case IX86_BUILTIN_4FMASS_MASK:
38473 {
38474 tree args[4];
38475 rtx ops[4];
38476 rtx wide_reg;
38477 rtx accum;
38478 rtx addr;
38479 rtx mem;
38480
38481 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38482 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38483
38484 s4fma_expand:
38485 mode = V4SFmode;
38486 wide_reg = gen_reg_rtx (V64SFmode);
38487 for (i = 0; i < 4; i++)
38488 {
38489 rtx tmp;
38490 args[i] = CALL_EXPR_ARG (exp, i);
38491 ops[i] = expand_normal (args[i]);
38492
38493 tmp = gen_reg_rtx (SFmode);
38494 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38495
38496 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38497 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38498 }
38499
38500 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38501 accum = force_reg (V4SFmode, accum);
38502
38503 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38504 addr = force_reg (Pmode, addr);
38505
38506 mem = gen_rtx_MEM (V4SFmode, addr);
38507
38508 target = gen_reg_rtx (V4SFmode);
38509
38510 emit_move_insn (target, accum);
38511
38512 if (! masked)
38513 emit_insn (fcn (target, accum, wide_reg, mem));
38514 else
38515 {
38516 rtx merge, mask;
38517 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38518
38519 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38520
38521 if (CONST_INT_P (mask))
38522 mask = fixup_modeless_constant (mask, QImode);
38523
38524 mask = force_reg (QImode, mask);
38525
38526 if (GET_MODE (mask) != QImode)
38527 mask = gen_rtx_SUBREG (QImode, mask, 0);
38528
38529 /* If merge is 0 then we're about to emit z-masked variant. */
38530 if (const0_operand (merge, mode))
38531 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38532 /* If merge is the same as accum then emit merge-masked
38533 variant. */
38534 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38535 {
38536 merge = force_reg (mode, merge);
38537 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38538 }
38539 /* Merge with something unknown might happen if we z-mask
38540 w/ -O0. */
38541 else
38542 {
38543 target = gen_reg_rtx (mode);
38544 emit_move_insn (target, merge);
38545 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38546 }
38547 }
38548 return target;
38549 }
38550 default:
38551 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38552 }
38553 }
38554
38555 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38556 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38557 {
38558 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38559 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38560 }
38561
38562 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38563 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38564 {
38565 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38566 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38567 }
38568
38569 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38570 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38571 {
38572 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38573 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38574 }
38575
38576 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38577 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38578 {
38579 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38580 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38581 }
38582
38583 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38584 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38585 {
38586 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38587 const struct builtin_description *d = bdesc_multi_arg + i;
38588 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38589 (enum ix86_builtin_func_type)
38590 d->flag, d->comparison);
38591 }
38592
38593 gcc_unreachable ();
38594 }
38595
38596 /* This returns the target-specific builtin with code CODE if
38597 current_function_decl has visibility on this builtin, which is checked
38598 using isa flags. Returns NULL_TREE otherwise. */
38599
38600 static tree ix86_get_builtin (enum ix86_builtins code)
38601 {
38602 struct cl_target_option *opts;
38603 tree target_tree = NULL_TREE;
38604
38605 /* Determine the isa flags of current_function_decl. */
38606
38607 if (current_function_decl)
38608 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38609
38610 if (target_tree == NULL)
38611 target_tree = target_option_default_node;
38612
38613 opts = TREE_TARGET_OPTION (target_tree);
38614
38615 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38616 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38617 return ix86_builtin_decl (code, true);
38618 else
38619 return NULL_TREE;
38620 }
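/* Illustrative example of the check above: when the current function is
   compiled with, say, __attribute__((target ("avx2"))), its target node
   carries the AVX2 isa flag, so ix86_get_builtin (IX86_BUILTIN_GATHERSIV4DF)
   returns the builtin decl; without that attribute (and without -mavx2 on
   the command line) the same call returns NULL_TREE and callers such as
   the vectorizer hooks below simply do not use the builtin.  */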
38621
38622 /* Return the function decl for the target specific builtin
38623 corresponding to the MPX builtin passed in FCODE. */
38624 static tree
38625 ix86_builtin_mpx_function (unsigned fcode)
38626 {
38627 switch (fcode)
38628 {
38629 case BUILT_IN_CHKP_BNDMK:
38630 return ix86_builtins[IX86_BUILTIN_BNDMK];
38631
38632 case BUILT_IN_CHKP_BNDSTX:
38633 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38634
38635 case BUILT_IN_CHKP_BNDLDX:
38636 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38637
38638 case BUILT_IN_CHKP_BNDCL:
38639 return ix86_builtins[IX86_BUILTIN_BNDCL];
38640
38641 case BUILT_IN_CHKP_BNDCU:
38642 return ix86_builtins[IX86_BUILTIN_BNDCU];
38643
38644 case BUILT_IN_CHKP_BNDRET:
38645 return ix86_builtins[IX86_BUILTIN_BNDRET];
38646
38647 case BUILT_IN_CHKP_INTERSECT:
38648 return ix86_builtins[IX86_BUILTIN_BNDINT];
38649
38650 case BUILT_IN_CHKP_NARROW:
38651 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38652
38653 case BUILT_IN_CHKP_SIZEOF:
38654 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38655
38656 case BUILT_IN_CHKP_EXTRACT_LOWER:
38657 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38658
38659 case BUILT_IN_CHKP_EXTRACT_UPPER:
38660 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38661
38662 default:
38663 return NULL_TREE;
38664 }
38665
38666 gcc_unreachable ();
38667 }
38668
38669 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38670
38671 Return an address to be used to load/store bounds for pointer
38672 passed in SLOT.
38673
38674 SLOT_NO is an integer constant holding number of a target
38675 dependent special slot to be used in case SLOT is not a memory.
38676
38677 SPECIAL_BASE is a pointer to be used as a base of fake address
38678 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38679 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38680
38681 static rtx
38682 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38683 {
38684 rtx addr = NULL;
38685
38686 /* A NULL slot means we pass bounds for a pointer not passed to the
38687 function at all. A register slot means we pass the pointer in a
38688 register. In both these cases bounds are passed via the Bounds
38689 Table. Since we do not have an actual pointer stored in memory,
38690 we have to use fake addresses to access the Bounds Table. We
38691 start with (special_base - sizeof (void*)) and decrease this
38692 address by the pointer size to get addresses for the other slots. */
38693 if (!slot || REG_P (slot))
38694 {
38695 gcc_assert (CONST_INT_P (slot_no));
38696 addr = plus_constant (Pmode, special_base,
38697 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38698 }
38699 /* If pointer is passed in a memory then its address is used to
38700 access Bounds Table. */
38701 else if (MEM_P (slot))
38702 {
38703 addr = XEXP (slot, 0);
38704 if (!register_operand (addr, Pmode))
38705 addr = copy_addr_to_reg (addr);
38706 }
38707 else
38708 gcc_unreachable ();
38709
38710 return addr;
38711 }
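/* Illustrative example: with 64-bit pointers, slot_no 0 yields the fake
   address (special_base - 8) and slot_no 1 yields (special_base - 16);
   the callers below pass arg_pointer_rtx or stack_pointer_rtx as
   SPECIAL_BASE, so these fake locations sit just below the return address
   of the current or the called function respectively.  */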
38712
38713 /* Expand pass uses this hook to load bounds for function parameter
38714 PTR passed in SLOT in case its bounds are not passed in a register.
38715
38716 If SLOT is a memory, then bounds are loaded as for regular pointer
38717 loaded from memory. PTR may be NULL in case SLOT is a memory.
38718 In that case the value of PTR (if required) may be loaded from SLOT.
38719
38720 If SLOT is NULL or a register then SLOT_NO is an integer constant
38721 holding number of the target dependent special slot which should be
38722 used to obtain bounds.
38723
38724 Return loaded bounds. */
38725
38726 static rtx
38727 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38728 {
38729 rtx reg = gen_reg_rtx (BNDmode);
38730 rtx addr;
38731
38732 /* Get address to be used to access Bounds Table. Special slots start
38733 at the location of return address of the current function. */
38734 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38735
38736 /* Load pointer value from a memory if we don't have it. */
38737 if (!ptr)
38738 {
38739 gcc_assert (MEM_P (slot));
38740 ptr = copy_addr_to_reg (slot);
38741 }
38742
38743 if (!register_operand (ptr, Pmode))
38744 ptr = ix86_zero_extend_to_Pmode (ptr);
38745
38746 emit_insn (BNDmode == BND64mode
38747 ? gen_bnd64_ldx (reg, addr, ptr)
38748 : gen_bnd32_ldx (reg, addr, ptr));
38749
38750 return reg;
38751 }
38752
38753 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38754 passed in SLOT in case BOUNDS are not passed in a register.
38755
38756 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38757 stored in memory. PTR may be NULL in case SLOT is a memory.
38758 In that case the value of PTR (if required) may be loaded from SLOT.
38759
38760 If SLOT is NULL or a register then SLOT_NO is an integer constant
38761 holding number of the target dependent special slot which should be
38762 used to store BOUNDS. */
38763
38764 static void
38765 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38766 {
38767 rtx addr;
38768
38769 /* Get address to be used to access Bounds Table. Special slots start
38770 at the location of return address of a called function. */
38771 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38772
38773 /* Load pointer value from a memory if we don't have it. */
38774 if (!ptr)
38775 {
38776 gcc_assert (MEM_P (slot));
38777 ptr = copy_addr_to_reg (slot);
38778 }
38779
38780 if (!register_operand (ptr, Pmode))
38781 ptr = ix86_zero_extend_to_Pmode (ptr);
38782
38783 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38784 if (!register_operand (bounds, BNDmode))
38785 bounds = copy_to_mode_reg (BNDmode, bounds);
38786
38787 emit_insn (BNDmode == BND64mode
38788 ? gen_bnd64_stx (addr, ptr, bounds)
38789 : gen_bnd32_stx (addr, ptr, bounds));
38790 }
38791
38792 /* Load and return bounds returned by function in SLOT. */
38793
38794 static rtx
38795 ix86_load_returned_bounds (rtx slot)
38796 {
38797 rtx res;
38798
38799 gcc_assert (REG_P (slot));
38800 res = gen_reg_rtx (BNDmode);
38801 emit_move_insn (res, slot);
38802
38803 return res;
38804 }
38805
38806 /* Store BOUNDS returned by function into SLOT. */
38807
38808 static void
38809 ix86_store_returned_bounds (rtx slot, rtx bounds)
38810 {
38811 gcc_assert (REG_P (slot));
38812 emit_move_insn (slot, bounds);
38813 }
38814
38815 /* Returns a function decl for a vectorized version of the combined function
38816 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38817 if it is not available. */
38818
38819 static tree
38820 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38821 tree type_in)
38822 {
38823 machine_mode in_mode, out_mode;
38824 int in_n, out_n;
38825
38826 if (TREE_CODE (type_out) != VECTOR_TYPE
38827 || TREE_CODE (type_in) != VECTOR_TYPE)
38828 return NULL_TREE;
38829
38830 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38831 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38832 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38833 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38834
38835 switch (fn)
38836 {
38837 CASE_CFN_EXP2:
38838 if (out_mode == SFmode && in_mode == SFmode)
38839 {
38840 if (out_n == 16 && in_n == 16)
38841 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38842 }
38843 break;
38844
38845 CASE_CFN_IFLOOR:
38846 CASE_CFN_LFLOOR:
38847 CASE_CFN_LLFLOOR:
38848 /* The round insn does not trap on denormals. */
38849 if (flag_trapping_math || !TARGET_ROUND)
38850 break;
38851
38852 if (out_mode == SImode && in_mode == DFmode)
38853 {
38854 if (out_n == 4 && in_n == 2)
38855 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38856 else if (out_n == 8 && in_n == 4)
38857 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38858 else if (out_n == 16 && in_n == 8)
38859 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38860 }
38861 if (out_mode == SImode && in_mode == SFmode)
38862 {
38863 if (out_n == 4 && in_n == 4)
38864 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38865 else if (out_n == 8 && in_n == 8)
38866 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38867 else if (out_n == 16 && in_n == 16)
38868 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38869 }
38870 break;
38871
38872 CASE_CFN_ICEIL:
38873 CASE_CFN_LCEIL:
38874 CASE_CFN_LLCEIL:
38875 /* The round insn does not trap on denormals. */
38876 if (flag_trapping_math || !TARGET_ROUND)
38877 break;
38878
38879 if (out_mode == SImode && in_mode == DFmode)
38880 {
38881 if (out_n == 4 && in_n == 2)
38882 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38883 else if (out_n == 8 && in_n == 4)
38884 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38885 else if (out_n == 16 && in_n == 8)
38886 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38887 }
38888 if (out_mode == SImode && in_mode == SFmode)
38889 {
38890 if (out_n == 4 && in_n == 4)
38891 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38892 else if (out_n == 8 && in_n == 8)
38893 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38894 else if (out_n == 16 && in_n == 16)
38895 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38896 }
38897 break;
38898
38899 CASE_CFN_IRINT:
38900 CASE_CFN_LRINT:
38901 CASE_CFN_LLRINT:
38902 if (out_mode == SImode && in_mode == DFmode)
38903 {
38904 if (out_n == 4 && in_n == 2)
38905 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38906 else if (out_n == 8 && in_n == 4)
38907 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38908 else if (out_n == 16 && in_n == 8)
38909 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38910 }
38911 if (out_mode == SImode && in_mode == SFmode)
38912 {
38913 if (out_n == 4 && in_n == 4)
38914 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38915 else if (out_n == 8 && in_n == 8)
38916 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38917 else if (out_n == 16 && in_n == 16)
38918 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38919 }
38920 break;
38921
38922 CASE_CFN_IROUND:
38923 CASE_CFN_LROUND:
38924 CASE_CFN_LLROUND:
38925 /* The round insn does not trap on denormals. */
38926 if (flag_trapping_math || !TARGET_ROUND)
38927 break;
38928
38929 if (out_mode == SImode && in_mode == DFmode)
38930 {
38931 if (out_n == 4 && in_n == 2)
38932 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38933 else if (out_n == 8 && in_n == 4)
38934 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38935 else if (out_n == 16 && in_n == 8)
38936 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38937 }
38938 if (out_mode == SImode && in_mode == SFmode)
38939 {
38940 if (out_n == 4 && in_n == 4)
38941 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38942 else if (out_n == 8 && in_n == 8)
38943 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38944 else if (out_n == 16 && in_n == 16)
38945 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38946 }
38947 break;
38948
38949 CASE_CFN_FLOOR:
38950 /* The round insn does not trap on denormals. */
38951 if (flag_trapping_math || !TARGET_ROUND)
38952 break;
38953
38954 if (out_mode == DFmode && in_mode == DFmode)
38955 {
38956 if (out_n == 2 && in_n == 2)
38957 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38958 else if (out_n == 4 && in_n == 4)
38959 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38960 else if (out_n == 8 && in_n == 8)
38961 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38962 }
38963 if (out_mode == SFmode && in_mode == SFmode)
38964 {
38965 if (out_n == 4 && in_n == 4)
38966 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38967 else if (out_n == 8 && in_n == 8)
38968 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38969 else if (out_n == 16 && in_n == 16)
38970 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38971 }
38972 break;
38973
38974 CASE_CFN_CEIL:
38975 /* The round insn does not trap on denormals. */
38976 if (flag_trapping_math || !TARGET_ROUND)
38977 break;
38978
38979 if (out_mode == DFmode && in_mode == DFmode)
38980 {
38981 if (out_n == 2 && in_n == 2)
38982 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38983 else if (out_n == 4 && in_n == 4)
38984 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38985 else if (out_n == 8 && in_n == 8)
38986 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38987 }
38988 if (out_mode == SFmode && in_mode == SFmode)
38989 {
38990 if (out_n == 4 && in_n == 4)
38991 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38992 else if (out_n == 8 && in_n == 8)
38993 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38994 else if (out_n == 16 && in_n == 16)
38995 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38996 }
38997 break;
38998
38999 CASE_CFN_TRUNC:
39000 /* The round insn does not trap on denormals. */
39001 if (flag_trapping_math || !TARGET_ROUND)
39002 break;
39003
39004 if (out_mode == DFmode && in_mode == DFmode)
39005 {
39006 if (out_n == 2 && in_n == 2)
39007 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39008 else if (out_n == 4 && in_n == 4)
39009 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39010 else if (out_n == 8 && in_n == 8)
39011 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39012 }
39013 if (out_mode == SFmode && in_mode == SFmode)
39014 {
39015 if (out_n == 4 && in_n == 4)
39016 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39017 else if (out_n == 8 && in_n == 8)
39018 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39019 else if (out_n == 16 && in_n == 16)
39020 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39021 }
39022 break;
39023
39024 CASE_CFN_RINT:
39025 /* The round insn does not trap on denormals. */
39026 if (flag_trapping_math || !TARGET_ROUND)
39027 break;
39028
39029 if (out_mode == DFmode && in_mode == DFmode)
39030 {
39031 if (out_n == 2 && in_n == 2)
39032 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39033 else if (out_n == 4 && in_n == 4)
39034 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39035 }
39036 if (out_mode == SFmode && in_mode == SFmode)
39037 {
39038 if (out_n == 4 && in_n == 4)
39039 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39040 else if (out_n == 8 && in_n == 8)
39041 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39042 }
39043 break;
39044
39045 CASE_CFN_FMA:
39046 if (out_mode == DFmode && in_mode == DFmode)
39047 {
39048 if (out_n == 2 && in_n == 2)
39049 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39050 if (out_n == 4 && in_n == 4)
39051 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39052 }
39053 if (out_mode == SFmode && in_mode == SFmode)
39054 {
39055 if (out_n == 4 && in_n == 4)
39056 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39057 if (out_n == 8 && in_n == 8)
39058 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39059 }
39060 break;
39061
39062 default:
39063 break;
39064 }
39065
39066 /* Dispatch to a handler for a vectorization library. */
39067 if (ix86_veclib_handler)
39068 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39069
39070 return NULL_TREE;
39071 }
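/* Illustrative example: when vectorizing a loop that calls floor () on
   doubles with -mavx -fno-trapping-math, the vectorizer asks for a
   V4DF -> V4DF variant; the CASE_CFN_FLOOR arm above then returns the
   decl of IX86_BUILTIN_FLOORPD256 (provided ix86_get_builtin still sees
   the required isa flag), and the scalar calls are replaced by that
   builtin.  */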
39072
39073 /* Handler for an SVML-style interface to
39074 a library with vectorized intrinsics. */
39075
39076 static tree
39077 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39078 {
39079 char name[20];
39080 tree fntype, new_fndecl, args;
39081 unsigned arity;
39082 const char *bname;
39083 machine_mode el_mode, in_mode;
39084 int n, in_n;
39085
39086 /* The SVML is suitable for unsafe math only. */
39087 if (!flag_unsafe_math_optimizations)
39088 return NULL_TREE;
39089
39090 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39091 n = TYPE_VECTOR_SUBPARTS (type_out);
39092 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39093 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39094 if (el_mode != in_mode
39095 || n != in_n)
39096 return NULL_TREE;
39097
39098 switch (fn)
39099 {
39100 CASE_CFN_EXP:
39101 CASE_CFN_LOG:
39102 CASE_CFN_LOG10:
39103 CASE_CFN_POW:
39104 CASE_CFN_TANH:
39105 CASE_CFN_TAN:
39106 CASE_CFN_ATAN:
39107 CASE_CFN_ATAN2:
39108 CASE_CFN_ATANH:
39109 CASE_CFN_CBRT:
39110 CASE_CFN_SINH:
39111 CASE_CFN_SIN:
39112 CASE_CFN_ASINH:
39113 CASE_CFN_ASIN:
39114 CASE_CFN_COSH:
39115 CASE_CFN_COS:
39116 CASE_CFN_ACOSH:
39117 CASE_CFN_ACOS:
39118 if ((el_mode != DFmode || n != 2)
39119 && (el_mode != SFmode || n != 4))
39120 return NULL_TREE;
39121 break;
39122
39123 default:
39124 return NULL_TREE;
39125 }
39126
39127 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39128 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39129
39130 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39131 strcpy (name, "vmlsLn4");
39132 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39133 strcpy (name, "vmldLn2");
39134 else if (n == 4)
39135 {
39136 sprintf (name, "vmls%s", bname+10);
39137 name[strlen (name)-1] = '4';
39138 }
39139 else
39140 sprintf (name, "vmld%s2", bname+10);
39141
39142 /* Convert to uppercase. */
39143 name[4] &= ~0x20;
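/* For example (illustrative), under the scheme above a vectorized sinf
over V4SF gets the name "vmlsSin4" and a vectorized sin over V2DF gets
"vmldSin2". */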
39144
39145 arity = 0;
39146 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39147 arity++;
39148
39149 if (arity == 1)
39150 fntype = build_function_type_list (type_out, type_in, NULL);
39151 else
39152 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39153
39154 /* Build a function declaration for the vectorized function. */
39155 new_fndecl = build_decl (BUILTINS_LOCATION,
39156 FUNCTION_DECL, get_identifier (name), fntype);
39157 TREE_PUBLIC (new_fndecl) = 1;
39158 DECL_EXTERNAL (new_fndecl) = 1;
39159 DECL_IS_NOVOPS (new_fndecl) = 1;
39160 TREE_READONLY (new_fndecl) = 1;
39161
39162 return new_fndecl;
39163 }
39164
39165 /* Handler for an ACML-style interface to
39166 a library with vectorized intrinsics. */
39167
39168 static tree
39169 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39170 {
39171 char name[20] = "__vr.._";
39172 tree fntype, new_fndecl, args;
39173 unsigned arity;
39174 const char *bname;
39175 machine_mode el_mode, in_mode;
39176 int n, in_n;
39177
39178 /* The ACML is 64-bit only and suitable for unsafe math only, as
39179 it does not correctly support parts of IEEE with the required
39180 precision such as denormals. */
39181 if (!TARGET_64BIT
39182 || !flag_unsafe_math_optimizations)
39183 return NULL_TREE;
39184
39185 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39186 n = TYPE_VECTOR_SUBPARTS (type_out);
39187 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39188 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39189 if (el_mode != in_mode
39190 || n != in_n)
39191 return NULL_TREE;
39192
39193 switch (fn)
39194 {
39195 CASE_CFN_SIN:
39196 CASE_CFN_COS:
39197 CASE_CFN_EXP:
39198 CASE_CFN_LOG:
39199 CASE_CFN_LOG2:
39200 CASE_CFN_LOG10:
39201 if (el_mode == DFmode && n == 2)
39202 {
39203 name[4] = 'd';
39204 name[5] = '2';
39205 }
39206 else if (el_mode == SFmode && n == 4)
39207 {
39208 name[4] = 's';
39209 name[5] = '4';
39210 }
39211 else
39212 return NULL_TREE;
39213 break;
39214
39215 default:
39216 return NULL_TREE;
39217 }
39218
39219 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39220 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39221 sprintf (name + 7, "%s", bname+10);
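/* For example (illustrative), a vectorized sin over V2DF is renamed to
"__vrd2_sin" and a vectorized sinf over V4SF to "__vrs4_sinf". */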
39222
39223 arity = 0;
39224 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39225 arity++;
39226
39227 if (arity == 1)
39228 fntype = build_function_type_list (type_out, type_in, NULL);
39229 else
39230 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39231
39232 /* Build a function declaration for the vectorized function. */
39233 new_fndecl = build_decl (BUILTINS_LOCATION,
39234 FUNCTION_DECL, get_identifier (name), fntype);
39235 TREE_PUBLIC (new_fndecl) = 1;
39236 DECL_EXTERNAL (new_fndecl) = 1;
39237 DECL_IS_NOVOPS (new_fndecl) = 1;
39238 TREE_READONLY (new_fndecl) = 1;
39239
39240 return new_fndecl;
39241 }
39242
39243 /* Returns a decl of a function that implements gather load with
39244 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39245 Return NULL_TREE if it is not available. */
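/* For example (illustrative), a gather of V4SF elements with SImode
indices maps to IX86_BUILTIN_GATHERSIV4SF below, or to
IX86_BUILTIN_GATHER3SIV4SF when AVX512VL is enabled. */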
39246
39247 static tree
39248 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39249 const_tree index_type, int scale)
39250 {
39251 bool si;
39252 enum ix86_builtins code;
39253
39254 if (! TARGET_AVX2)
39255 return NULL_TREE;
39256
39257 if ((TREE_CODE (index_type) != INTEGER_TYPE
39258 && !POINTER_TYPE_P (index_type))
39259 || (TYPE_MODE (index_type) != SImode
39260 && TYPE_MODE (index_type) != DImode))
39261 return NULL_TREE;
39262
39263 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39264 return NULL_TREE;
39265
39266 /* v*gather* insn sign extends index to pointer mode. */
39267 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39268 && TYPE_UNSIGNED (index_type))
39269 return NULL_TREE;
39270
39271 if (scale <= 0
39272 || scale > 8
39273 || (scale & (scale - 1)) != 0)
39274 return NULL_TREE;
39275
39276 si = TYPE_MODE (index_type) == SImode;
39277 switch (TYPE_MODE (mem_vectype))
39278 {
39279 case V2DFmode:
39280 if (TARGET_AVX512VL)
39281 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39282 else
39283 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39284 break;
39285 case V4DFmode:
39286 if (TARGET_AVX512VL)
39287 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39288 else
39289 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39290 break;
39291 case V2DImode:
39292 if (TARGET_AVX512VL)
39293 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39294 else
39295 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39296 break;
39297 case V4DImode:
39298 if (TARGET_AVX512VL)
39299 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39300 else
39301 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39302 break;
39303 case V4SFmode:
39304 if (TARGET_AVX512VL)
39305 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39306 else
39307 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39308 break;
39309 case V8SFmode:
39310 if (TARGET_AVX512VL)
39311 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39312 else
39313 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39314 break;
39315 case V4SImode:
39316 if (TARGET_AVX512VL)
39317 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39318 else
39319 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39320 break;
39321 case V8SImode:
39322 if (TARGET_AVX512VL)
39323 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39324 else
39325 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39326 break;
39327 case V8DFmode:
39328 if (TARGET_AVX512F)
39329 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39330 else
39331 return NULL_TREE;
39332 break;
39333 case V8DImode:
39334 if (TARGET_AVX512F)
39335 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39336 else
39337 return NULL_TREE;
39338 break;
39339 case V16SFmode:
39340 if (TARGET_AVX512F)
39341 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39342 else
39343 return NULL_TREE;
39344 break;
39345 case V16SImode:
39346 if (TARGET_AVX512F)
39347 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39348 else
39349 return NULL_TREE;
39350 break;
39351 default:
39352 return NULL_TREE;
39353 }
39354
39355 return ix86_get_builtin (code);
39356 }
39357
39358 /* Returns a decl of a function that implements scatter store with
39359 register type VECTYPE and index type INDEX_TYPE and SCALE.
39360 Return NULL_TREE if it is not available. */
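/* For example (illustrative), a scatter of a V16SF vector with SImode
indices maps to IX86_BUILTIN_SCATTERSIV16SF below. */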
39361
39362 static tree
39363 ix86_vectorize_builtin_scatter (const_tree vectype,
39364 const_tree index_type, int scale)
39365 {
39366 bool si;
39367 enum ix86_builtins code;
39368
39369 if (!TARGET_AVX512F)
39370 return NULL_TREE;
39371
39372 if ((TREE_CODE (index_type) != INTEGER_TYPE
39373 && !POINTER_TYPE_P (index_type))
39374 || (TYPE_MODE (index_type) != SImode
39375 && TYPE_MODE (index_type) != DImode))
39376 return NULL_TREE;
39377
39378 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39379 return NULL_TREE;
39380
39381 /* v*scatter* insn sign extends index to pointer mode. */
39382 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39383 && TYPE_UNSIGNED (index_type))
39384 return NULL_TREE;
39385
39386 /* Scale can be 1, 2, 4 or 8. */
39387 if (scale <= 0
39388 || scale > 8
39389 || (scale & (scale - 1)) != 0)
39390 return NULL_TREE;
39391
39392 si = TYPE_MODE (index_type) == SImode;
39393 switch (TYPE_MODE (vectype))
39394 {
39395 case V8DFmode:
39396 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39397 break;
39398 case V8DImode:
39399 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39400 break;
39401 case V16SFmode:
39402 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39403 break;
39404 case V16SImode:
39405 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39406 break;
39407 default:
39408 return NULL_TREE;
39409 }
39410
39411 return ix86_builtins[code];
39412 }
39413
39414 /* Return true if it is safe to use the rsqrt optabs to optimize
39415 1.0/sqrt. */
39416
39417 static bool
39418 use_rsqrt_p ()
39419 {
39420 return (TARGET_SSE_MATH
39421 && flag_finite_math_only
39422 && !flag_trapping_math
39423 && flag_unsafe_math_optimizations);
39424 }
39425
39426 /* Returns a decl for a target-specific builtin that implements
39427 the reciprocal of the function, or NULL_TREE if not available. */
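/* For example (illustrative), IX86_BUILTIN_SQRTPS_NR is mapped below to
IX86_BUILTIN_RSQRTPS_NR, so a vectorized 1.0f / sqrtf (x) over V4SF can
take the reciprocal square root path (cf. use_rsqrt_p above). */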
39428
39429 static tree
39430 ix86_builtin_reciprocal (tree fndecl)
39431 {
39432 switch (DECL_FUNCTION_CODE (fndecl))
39433 {
39434 /* Vectorized version of sqrt to rsqrt conversion. */
39435 case IX86_BUILTIN_SQRTPS_NR:
39436 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39437
39438 case IX86_BUILTIN_SQRTPS_NR256:
39439 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39440
39441 default:
39442 return NULL_TREE;
39443 }
39444 }
39445 \f
39446 /* Helper for avx_vpermilps256_operand et al. This is also used by
39447 the expansion functions to turn the parallel back into a mask.
39448 The return value is 0 for no match and the imm8+1 for a match. */
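/* For instance (illustrative), a V4SFmode parallel selecting elements
3, 2, 1, 0 packs each index into a 2-bit field, giving imm8 0x1b, so
the function returns 0x1c. */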
39449
39450 int
39451 avx_vpermilp_parallel (rtx par, machine_mode mode)
39452 {
39453 unsigned i, nelt = GET_MODE_NUNITS (mode);
39454 unsigned mask = 0;
39455 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39456
39457 if (XVECLEN (par, 0) != (int) nelt)
39458 return 0;
39459
39460 /* Validate that all of the elements are constants, and not totally
39461 out of range. Copy the data into an integral array to make the
39462 subsequent checks easier. */
39463 for (i = 0; i < nelt; ++i)
39464 {
39465 rtx er = XVECEXP (par, 0, i);
39466 unsigned HOST_WIDE_INT ei;
39467
39468 if (!CONST_INT_P (er))
39469 return 0;
39470 ei = INTVAL (er);
39471 if (ei >= nelt)
39472 return 0;
39473 ipar[i] = ei;
39474 }
39475
39476 switch (mode)
39477 {
39478 case V8DFmode:
39479 /* In the 512-bit DFmode case, we can only move elements within
39480 a 128-bit lane. First fill the second part of the mask,
39481 then fallthru. */
39482 for (i = 4; i < 6; ++i)
39483 {
39484 if (ipar[i] < 4 || ipar[i] >= 6)
39485 return 0;
39486 mask |= (ipar[i] - 4) << i;
39487 }
39488 for (i = 6; i < 8; ++i)
39489 {
39490 if (ipar[i] < 6)
39491 return 0;
39492 mask |= (ipar[i] - 6) << i;
39493 }
39494 /* FALLTHRU */
39495
39496 case V4DFmode:
39497 /* In the 256-bit DFmode case, we can only move elements within
39498 a 128-bit lane. */
39499 for (i = 0; i < 2; ++i)
39500 {
39501 if (ipar[i] >= 2)
39502 return 0;
39503 mask |= ipar[i] << i;
39504 }
39505 for (i = 2; i < 4; ++i)
39506 {
39507 if (ipar[i] < 2)
39508 return 0;
39509 mask |= (ipar[i] - 2) << i;
39510 }
39511 break;
39512
39513 case V16SFmode:
39514 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39515 must mirror the permutation in the lower 256 bits. */
39516 for (i = 0; i < 8; ++i)
39517 if (ipar[i] + 8 != ipar[i + 8])
39518 return 0;
39519 /* FALLTHRU */
39520
39521 case V8SFmode:
39522 /* In the 256-bit SFmode case, we have full freedom of
39523 movement within the low 128-bit lane, but the high 128-bit
39524 lane must mirror the exact same pattern. */
39525 for (i = 0; i < 4; ++i)
39526 if (ipar[i] + 4 != ipar[i + 4])
39527 return 0;
39528 nelt = 4;
39529 /* FALLTHRU */
39530
39531 case V2DFmode:
39532 case V4SFmode:
39533 /* In the 128-bit case, we've full freedom in the placement of
39534 the elements from the source operand. */
39535 for (i = 0; i < nelt; ++i)
39536 mask |= ipar[i] << (i * (nelt / 2));
39537 break;
39538
39539 default:
39540 gcc_unreachable ();
39541 }
39542
39543 /* Make sure success has a non-zero value by adding one. */
39544 return mask + 1;
39545 }
39546
39547 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39548 the expansion functions to turn the parallel back into a mask.
39549 The return value is 0 for no match and the imm8+1 for a match. */
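/* For instance (illustrative), a V8SFmode parallel of (8 9 10 11 4 5 6 7)
takes the low 128-bit lane from the second operand and the high lane from
the first, giving imm8 0x12, so the function returns 0x13. */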
39550
39551 int
39552 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39553 {
39554 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39555 unsigned mask = 0;
39556 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39557
39558 if (XVECLEN (par, 0) != (int) nelt)
39559 return 0;
39560
39561 /* Validate that all of the elements are constants, and not totally
39562 out of range. Copy the data into an integral array to make the
39563 subsequent checks easier. */
39564 for (i = 0; i < nelt; ++i)
39565 {
39566 rtx er = XVECEXP (par, 0, i);
39567 unsigned HOST_WIDE_INT ei;
39568
39569 if (!CONST_INT_P (er))
39570 return 0;
39571 ei = INTVAL (er);
39572 if (ei >= 2 * nelt)
39573 return 0;
39574 ipar[i] = ei;
39575 }
39576
39577 /* Validate that the halves of the permute are halves. */
39578 for (i = 0; i < nelt2 - 1; ++i)
39579 if (ipar[i] + 1 != ipar[i + 1])
39580 return 0;
39581 for (i = nelt2; i < nelt - 1; ++i)
39582 if (ipar[i] + 1 != ipar[i + 1])
39583 return 0;
39584
39585 /* Reconstruct the mask. */
39586 for (i = 0; i < 2; ++i)
39587 {
39588 unsigned e = ipar[i * nelt2];
39589 if (e % nelt2)
39590 return 0;
39591 e /= nelt2;
39592 mask |= e << (i * 4);
39593 }
39594
39595 /* Make sure success has a non-zero value by adding one. */
39596 return mask + 1;
39597 }
39598 \f
39599 /* Return a register priority for hard reg REGNO. */
39600 static int
39601 ix86_register_priority (int hard_regno)
39602 {
39603 /* ebp and r13 as the base always want a displacement, r12 as the
39604 base always wants an index. So discourage their usage in an
39605 address. */
39606 if (hard_regno == R12_REG || hard_regno == R13_REG)
39607 return 0;
39608 if (hard_regno == BP_REG)
39609 return 1;
39610 /* New x86-64 int registers result in bigger code size. Discourage
39611 them. */
39612 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39613 return 2;
39614 /* New x86-64 SSE registers result in bigger code size. Discourage
39615 them. */
39616 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39617 return 2;
39618 /* Usage of AX register results in smaller code. Prefer it. */
39619 if (hard_regno == AX_REG)
39620 return 4;
39621 return 3;
39622 }
39623
39624 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39625
39626 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39627 QImode must go into class Q_REGS.
39628 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39629 movdf to do mem-to-mem moves through integer regs. */
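/* For example (illustrative), a nonzero DFmode constant requested in an
SSE class yields NO_REGS below, which forces the constant into memory. */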
39630
39631 static reg_class_t
39632 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39633 {
39634 machine_mode mode = GET_MODE (x);
39635
39636 /* We're only allowed to return a subclass of CLASS. Many of the
39637 following checks fail for NO_REGS, so eliminate that early. */
39638 if (regclass == NO_REGS)
39639 return NO_REGS;
39640
39641 /* All classes can load zeros. */
39642 if (x == CONST0_RTX (mode))
39643 return regclass;
39644
39645 /* Force constants into memory if we are loading a (nonzero) constant into
39646 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39647 instructions to load from a constant. */
39648 if (CONSTANT_P (x)
39649 && (MAYBE_MMX_CLASS_P (regclass)
39650 || MAYBE_SSE_CLASS_P (regclass)
39651 || MAYBE_MASK_CLASS_P (regclass)))
39652 return NO_REGS;
39653
39654 /* Floating-point constants need more complex checks. */
39655 if (CONST_DOUBLE_P (x))
39656 {
39657 /* General regs can load everything. */
39658 if (INTEGER_CLASS_P (regclass))
39659 return regclass;
39660
39661 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39662 zero above. We only want to wind up preferring 80387 registers if
39663 we plan on doing computation with them. */
39664 if (IS_STACK_MODE (mode)
39665 && standard_80387_constant_p (x) > 0)
39666 {
39667 /* Limit class to FP regs. */
39668 if (FLOAT_CLASS_P (regclass))
39669 return FLOAT_REGS;
39670 else if (regclass == FP_TOP_SSE_REGS)
39671 return FP_TOP_REG;
39672 else if (regclass == FP_SECOND_SSE_REGS)
39673 return FP_SECOND_REG;
39674 }
39675
39676 return NO_REGS;
39677 }
39678
39679 /* Prefer SSE regs only, if we can use them for math. */
39680 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39681 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39682
39683 /* Generally when we see PLUS here, it's the function invariant
39684 (plus soft-fp const_int), which can only be computed into general
39685 regs. */
39686 if (GET_CODE (x) == PLUS)
39687 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39688
39689 /* QImode constants are easy to load, but non-constant QImode data
39690 must go into Q_REGS. */
39691 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39692 {
39693 if (Q_CLASS_P (regclass))
39694 return regclass;
39695 else if (reg_class_subset_p (Q_REGS, regclass))
39696 return Q_REGS;
39697 else
39698 return NO_REGS;
39699 }
39700
39701 return regclass;
39702 }
39703
39704 /* Discourage putting floating-point values in SSE registers unless
39705 SSE math is being used, and likewise for the 387 registers. */
39706 static reg_class_t
39707 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39708 {
39709 machine_mode mode = GET_MODE (x);
39710
39711 /* Restrict the output reload class to the register bank that we are doing
39712 math on. If we would like not to return a subset of CLASS, reject this
39713 alternative: if reload cannot do this, it will still use its choice. */
39714 mode = GET_MODE (x);
39715 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39716 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39717
39718 if (IS_STACK_MODE (mode))
39719 {
39720 if (regclass == FP_TOP_SSE_REGS)
39721 return FP_TOP_REG;
39722 else if (regclass == FP_SECOND_SSE_REGS)
39723 return FP_SECOND_REG;
39724 else
39725 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39726 }
39727
39728 return regclass;
39729 }
39730
39731 static reg_class_t
39732 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39733 machine_mode mode, secondary_reload_info *sri)
39734 {
39735 /* Double-word spills from general registers to non-offsettable memory
39736 references (zero-extended addresses) require special handling. */
39737 if (TARGET_64BIT
39738 && MEM_P (x)
39739 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39740 && INTEGER_CLASS_P (rclass)
39741 && !offsettable_memref_p (x))
39742 {
39743 sri->icode = (in_p
39744 ? CODE_FOR_reload_noff_load
39745 : CODE_FOR_reload_noff_store);
39746 /* Add the cost of moving address to a temporary. */
39747 sri->extra_cost = 1;
39748
39749 return NO_REGS;
39750 }
39751
39752 /* QImode spills from non-QI registers require
39753 an intermediate register on 32-bit targets. */
39754 if (mode == QImode
39755 && ((!TARGET_64BIT && !in_p
39756 && INTEGER_CLASS_P (rclass)
39757 && MAYBE_NON_Q_CLASS_P (rclass))
39758 || (!TARGET_AVX512DQ
39759 && MAYBE_MASK_CLASS_P (rclass))))
39760 {
39761 int regno = true_regnum (x);
39762
39763 /* Return Q_REGS if the operand is in memory. */
39764 if (regno == -1)
39765 return Q_REGS;
39766
39767 return NO_REGS;
39768 }
39769
39770 /* This condition handles the corner case where an expression involving
39771 pointers gets vectorized. We're trying to use the address of a
39772 stack slot as a vector initializer.
39773
39774 (set (reg:V2DI 74 [ vect_cst_.2 ])
39775 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39776
39777 Eventually frame gets turned into sp+offset like this:
39778
39779 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39780 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39781 (const_int 392 [0x188]))))
39782
39783 That later gets turned into:
39784
39785 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39786 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39787 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39788
39789 We'll have the following reload recorded:
39790
39791 Reload 0: reload_in (DI) =
39792 (plus:DI (reg/f:DI 7 sp)
39793 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39794 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39795 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39796 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39797 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39798 reload_reg_rtx: (reg:V2DI 22 xmm1)
39799
39800 Which isn't going to work since SSE instructions can't handle scalar
39801 additions. Returning GENERAL_REGS forces the addition into integer
39802 register and reload can handle subsequent reloads without problems. */
39803
39804 if (in_p && GET_CODE (x) == PLUS
39805 && SSE_CLASS_P (rclass)
39806 && SCALAR_INT_MODE_P (mode))
39807 return GENERAL_REGS;
39808
39809 return NO_REGS;
39810 }
39811
39812 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39813
39814 static bool
39815 ix86_class_likely_spilled_p (reg_class_t rclass)
39816 {
39817 switch (rclass)
39818 {
39819 case AREG:
39820 case DREG:
39821 case CREG:
39822 case BREG:
39823 case AD_REGS:
39824 case SIREG:
39825 case DIREG:
39826 case SSE_FIRST_REG:
39827 case FP_TOP_REG:
39828 case FP_SECOND_REG:
39829 case BND_REGS:
39830 return true;
39831
39832 default:
39833 break;
39834 }
39835
39836 return false;
39837 }
39838
39839 /* If we are copying between general and FP registers, we need a memory
39840 location. The same is true for SSE and MMX registers.
39841
39842 To optimize register_move_cost performance, allow inline variant.
39843
39844 The macro can't work reliably when one of the CLASSES is a class containing
39845 registers from multiple units (SSE, MMX, integer). We avoid this by never
39846 combining those units in single alternative in the machine description.
39847 Ensure that this constraint holds to avoid unexpected surprises.
39848
39849 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
39850 enforce these sanity checks. */
39851
39852 static inline bool
39853 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39854 machine_mode mode, int strict)
39855 {
39856 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39857 return false;
39858 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39859 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39860 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39861 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39862 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39863 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
39864 {
39865 gcc_assert (!strict || lra_in_progress);
39866 return true;
39867 }
39868
39869 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39870 return true;
39871
39872 /* Between mask and general, we have moves no larger than word size. */
39873 if ((MAYBE_MASK_CLASS_P (class1) != MAYBE_MASK_CLASS_P (class2))
39874 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39875 return true;
39876
39877 /* ??? This is a lie. We do have moves between mmx/general, and for
39878 mmx/sse2. But by saying we need secondary memory we discourage the
39879 register allocator from using the mmx registers unless needed. */
39880 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39881 return true;
39882
39883 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39884 {
39885 /* SSE1 doesn't have any direct moves from other classes. */
39886 if (!TARGET_SSE2)
39887 return true;
39888
39889 /* If the target says that inter-unit moves are more expensive
39890 than moving through memory, then don't generate them. */
39891 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39892 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39893 return true;
39894
39895 /* Between SSE and general, we have moves no larger than word size. */
39896 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39897 return true;
39898 }
39899
39900 return false;
39901 }
39902
39903 bool
39904 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39905 machine_mode mode, int strict)
39906 {
39907 return inline_secondary_memory_needed (class1, class2, mode, strict);
39908 }
39909
39910 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39911
39912 On the 80386, this is the size of MODE in words,
39913 except in the FP regs, where a single reg is always enough. */
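/* For example (illustrative), DImode in GENERAL_REGS needs two registers
on ia32 but one on x86-64, while any scalar float mode in FLOAT_REGS
needs just one. */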
39914
39915 static unsigned char
39916 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39917 {
39918 if (MAYBE_INTEGER_CLASS_P (rclass))
39919 {
39920 if (mode == XFmode)
39921 return (TARGET_64BIT ? 2 : 3);
39922 else if (mode == XCmode)
39923 return (TARGET_64BIT ? 4 : 6);
39924 else
39925 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39926 }
39927 else
39928 {
39929 if (COMPLEX_MODE_P (mode))
39930 return 2;
39931 else
39932 return 1;
39933 }
39934 }
39935
39936 /* Return true if the registers in CLASS cannot represent the change from
39937 modes FROM to TO. */
39938
39939 bool
39940 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
39941 enum reg_class regclass)
39942 {
39943 if (from == to)
39944 return false;
39945
39946 /* x87 registers can't do subreg at all, as all values are reformatted
39947 to extended precision. */
39948 if (MAYBE_FLOAT_CLASS_P (regclass))
39949 return true;
39950
39951 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39952 {
39953 /* Vector registers do not support QI or HImode loads. If we don't
39954 disallow a change to these modes, reload will assume it's ok to
39955 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39956 the vec_dupv4hi pattern. */
39957 if (GET_MODE_SIZE (from) < 4)
39958 return true;
39959 }
39960
39961 return false;
39962 }
39963
39964 /* Return the cost of moving data of mode M between a
39965 register and memory. A value of 2 is the default; this cost is
39966 relative to those in `REGISTER_MOVE_COST'.
39967
39968 This function is used extensively by register_move_cost that is used to
39969 build tables at startup. Make it inline in this case.
39970 When IN is 2, return maximum of in and out move cost.
39971
39972 If moving between registers and memory is more expensive than
39973 between two registers, you should define this macro to express the
39974 relative cost.
39975
39976 Model also increased moving costs of QImode registers in non
39977 Q_REGS classes.
39978 */
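/* For example (illustrative), for DFmode in an SSE class with IN == 2 this
returns MAX (sse_load[1], sse_store[1]) from the active cost table. */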
39979 static inline int
39980 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39981 int in)
39982 {
39983 int cost;
39984 if (FLOAT_CLASS_P (regclass))
39985 {
39986 int index;
39987 switch (mode)
39988 {
39989 case SFmode:
39990 index = 0;
39991 break;
39992 case DFmode:
39993 index = 1;
39994 break;
39995 case XFmode:
39996 index = 2;
39997 break;
39998 default:
39999 return 100;
40000 }
40001 if (in == 2)
40002 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40003 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40004 }
40005 if (SSE_CLASS_P (regclass))
40006 {
40007 int index;
40008 switch (GET_MODE_SIZE (mode))
40009 {
40010 case 4:
40011 index = 0;
40012 break;
40013 case 8:
40014 index = 1;
40015 break;
40016 case 16:
40017 index = 2;
40018 break;
40019 default:
40020 return 100;
40021 }
40022 if (in == 2)
40023 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40024 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40025 }
40026 if (MMX_CLASS_P (regclass))
40027 {
40028 int index;
40029 switch (GET_MODE_SIZE (mode))
40030 {
40031 case 4:
40032 index = 0;
40033 break;
40034 case 8:
40035 index = 1;
40036 break;
40037 default:
40038 return 100;
40039 }
40040 if (in == 2)
40041 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40042 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40043 }
40044 switch (GET_MODE_SIZE (mode))
40045 {
40046 case 1:
40047 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40048 {
40049 if (!in)
40050 return ix86_cost->int_store[0];
40051 if (TARGET_PARTIAL_REG_DEPENDENCY
40052 && optimize_function_for_speed_p (cfun))
40053 cost = ix86_cost->movzbl_load;
40054 else
40055 cost = ix86_cost->int_load[0];
40056 if (in == 2)
40057 return MAX (cost, ix86_cost->int_store[0]);
40058 return cost;
40059 }
40060 else
40061 {
40062 if (in == 2)
40063 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40064 if (in)
40065 return ix86_cost->movzbl_load;
40066 else
40067 return ix86_cost->int_store[0] + 4;
40068 }
40069 break;
40070 case 2:
40071 if (in == 2)
40072 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40073 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40074 default:
40075 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
40076 if (mode == TFmode)
40077 mode = XFmode;
40078 if (in == 2)
40079 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40080 else if (in)
40081 cost = ix86_cost->int_load[2];
40082 else
40083 cost = ix86_cost->int_store[2];
40084 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40085 }
40086 }
40087
40088 static int
40089 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40090 bool in)
40091 {
40092 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40093 }
40094
40095
40096 /* Return the cost of moving data from a register in class CLASS1 to
40097 one in class CLASS2.
40098
40099 It is not required that the cost always equal 2 when FROM is the same as TO;
40100 on some machines it is expensive to move between registers if they are not
40101 general registers. */
40102
40103 static int
40104 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40105 reg_class_t class2_i)
40106 {
40107 enum reg_class class1 = (enum reg_class) class1_i;
40108 enum reg_class class2 = (enum reg_class) class2_i;
40109
40110 /* In case we require secondary memory, compute cost of the store followed
40111 by load. In order to avoid bad register allocation choices, we need
40112 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40113
40114 if (inline_secondary_memory_needed (class1, class2, mode, 0))
40115 {
40116 int cost = 1;
40117
40118 cost += inline_memory_move_cost (mode, class1, 2);
40119 cost += inline_memory_move_cost (mode, class2, 2);
40120
40121 /* In case of copying from general_purpose_register we may emit multiple
40122 stores followed by single load causing memory size mismatch stall.
40123 Count this as arbitrarily high cost of 20. */
40124 if (targetm.class_max_nregs (class1, mode)
40125 > targetm.class_max_nregs (class2, mode))
40126 cost += 20;
40127
40128 /* In the case of FP/MMX moves, the registers actually overlap, and we
40129 have to switch modes in order to treat them differently. */
40130 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40131 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40132 cost += 20;
40133
40134 return cost;
40135 }
40136
40137 /* Moves between SSE/MMX and integer unit are expensive. */
40138 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40139 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40140
40141 /* ??? By keeping returned value relatively high, we limit the number
40142 of moves between integer and MMX/SSE registers for all targets.
40143 Additionally, high value prevents problem with x86_modes_tieable_p(),
40144 where integer modes in MMX/SSE registers are not tieable
40145 because of missing QImode and HImode moves to, from or between
40146 MMX/SSE registers. */
40147 return MAX (8, ix86_cost->mmxsse_to_integer);
40148
40149 if (MAYBE_FLOAT_CLASS_P (class1))
40150 return ix86_cost->fp_move;
40151 if (MAYBE_SSE_CLASS_P (class1))
40152 return ix86_cost->sse_move;
40153 if (MAYBE_MMX_CLASS_P (class1))
40154 return ix86_cost->mmx_move;
40155 return 2;
40156 }
40157
40158 /* Return TRUE if hard register REGNO can hold a value of machine-mode
40159 MODE. */
40160
40161 bool
40162 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
40163 {
40164 /* Flags and only flags can only hold CCmode values. */
40165 if (CC_REGNO_P (regno))
40166 return GET_MODE_CLASS (mode) == MODE_CC;
40167 if (GET_MODE_CLASS (mode) == MODE_CC
40168 || GET_MODE_CLASS (mode) == MODE_RANDOM
40169 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40170 return false;
40171 if (STACK_REGNO_P (regno))
40172 return VALID_FP_MODE_P (mode);
40173 if (MASK_REGNO_P (regno))
40174 return (VALID_MASK_REG_MODE (mode)
40175 || (TARGET_AVX512BW
40176 && VALID_MASK_AVX512BW_MODE (mode)));
40177 if (BND_REGNO_P (regno))
40178 return VALID_BND_REG_MODE (mode);
40179 if (SSE_REGNO_P (regno))
40180 {
40181 /* We implement the move patterns for all vector modes into and
40182 out of SSE registers, even when no operation instructions
40183 are available. */
40184
40185 /* For AVX-512 we allow, regardless of regno:
40186 - XI mode
40187 - any 512-bit wide vector mode
40188 - any scalar mode. */
40189 if (TARGET_AVX512F
40190 && (mode == XImode
40191 || VALID_AVX512F_REG_MODE (mode)
40192 || VALID_AVX512F_SCALAR_MODE (mode)))
40193 return true;
40194
40195 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40196 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40197 && MOD4_SSE_REGNO_P (regno)
40198 && mode == V64SFmode)
40199 return true;
40200
40201 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40202 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40203 && MOD4_SSE_REGNO_P (regno)
40204 && mode == V64SImode)
40205 return true;
40206
40207 /* TODO check for QI/HI scalars. */
40208 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
40209 if (TARGET_AVX512VL
40210 && (mode == OImode
40211 || mode == TImode
40212 || VALID_AVX256_REG_MODE (mode)
40213 || VALID_AVX512VL_128_REG_MODE (mode)))
40214 return true;
40215
40216 /* xmm16-xmm31 are only available for AVX-512. */
40217 if (EXT_REX_SSE_REGNO_P (regno))
40218 return false;
40219
40220 /* OImode and AVX modes are available only when AVX is enabled. */
40221 return ((TARGET_AVX
40222 && VALID_AVX256_REG_OR_OI_MODE (mode))
40223 || VALID_SSE_REG_MODE (mode)
40224 || VALID_SSE2_REG_MODE (mode)
40225 || VALID_MMX_REG_MODE (mode)
40226 || VALID_MMX_REG_MODE_3DNOW (mode));
40227 }
40228 if (MMX_REGNO_P (regno))
40229 {
40230 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40231 so if the register is available at all, then we can move data of
40232 the given mode into or out of it. */
40233 return (VALID_MMX_REG_MODE (mode)
40234 || VALID_MMX_REG_MODE_3DNOW (mode));
40235 }
40236
40237 if (mode == QImode)
40238 {
40239 /* Take care for QImode values - they can be in non-QI regs,
40240 but then they do cause partial register stalls. */
40241 if (ANY_QI_REGNO_P (regno))
40242 return true;
40243 if (!TARGET_PARTIAL_REG_STALL)
40244 return true;
40245 /* LRA checks if the hard register is OK for the given mode.
40246 QImode values can live in non-QI regs, so we allow all
40247 registers here. */
40248 if (lra_in_progress)
40249 return true;
40250 return !can_create_pseudo_p ();
40251 }
40252 /* We handle both integer and floats in the general purpose registers. */
40253 else if (VALID_INT_MODE_P (mode))
40254 return true;
40255 else if (VALID_FP_MODE_P (mode))
40256 return true;
40257 else if (VALID_DFP_MODE_P (mode))
40258 return true;
40259 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40260 on to use that value in smaller contexts, this can easily force a
40261 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40262 supporting DImode, allow it. */
40263 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40264 return true;
40265
40266 return false;
40267 }
40268
40269 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40270 tieable integer mode. */
40271
40272 static bool
40273 ix86_tieable_integer_mode_p (machine_mode mode)
40274 {
40275 switch (mode)
40276 {
40277 case HImode:
40278 case SImode:
40279 return true;
40280
40281 case QImode:
40282 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40283
40284 case DImode:
40285 return TARGET_64BIT;
40286
40287 default:
40288 return false;
40289 }
40290 }
40291
40292 /* Return true if MODE1 is accessible in a register that can hold MODE2
40293 without copying. That is, all register classes that can hold MODE2
40294 can also hold MODE1. */
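/* For example (illustrative), SImode and DImode tie on x86-64, and XFmode
ties with SFmode and DFmode but not with TFmode, per the checks below. */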
40295
40296 bool
40297 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40298 {
40299 if (mode1 == mode2)
40300 return true;
40301
40302 if (ix86_tieable_integer_mode_p (mode1)
40303 && ix86_tieable_integer_mode_p (mode2))
40304 return true;
40305
40306 /* MODE2 being XFmode implies fp stack or general regs, which means we
40307 can tie any smaller floating point modes to it. Note that we do not
40308 tie this with TFmode. */
40309 if (mode2 == XFmode)
40310 return mode1 == SFmode || mode1 == DFmode;
40311
40312 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40313 that we can tie it with SFmode. */
40314 if (mode2 == DFmode)
40315 return mode1 == SFmode;
40316
40317 /* If MODE2 is only appropriate for an SSE register, then tie with
40318 any other mode acceptable to SSE registers. */
40319 if (GET_MODE_SIZE (mode2) == 32
40320 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40321 return (GET_MODE_SIZE (mode1) == 32
40322 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40323 if (GET_MODE_SIZE (mode2) == 16
40324 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40325 return (GET_MODE_SIZE (mode1) == 16
40326 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40327
40328 /* If MODE2 is appropriate for an MMX register, then tie
40329 with any other mode acceptable to MMX registers. */
40330 if (GET_MODE_SIZE (mode2) == 8
40331 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40332 return (GET_MODE_SIZE (mode1) == 8
40333 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40334
40335 return false;
40336 }
40337
40338 /* Return the cost of moving between two registers of mode MODE. */
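/* For example (illustrative), a TImode integer copy is costed as two
word-sized moves on x86-64, so this returns COSTS_N_INSNS (2). */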
40339
40340 static int
40341 ix86_set_reg_reg_cost (machine_mode mode)
40342 {
40343 unsigned int units = UNITS_PER_WORD;
40344
40345 switch (GET_MODE_CLASS (mode))
40346 {
40347 default:
40348 break;
40349
40350 case MODE_CC:
40351 units = GET_MODE_SIZE (CCmode);
40352 break;
40353
40354 case MODE_FLOAT:
40355 if ((TARGET_SSE && mode == TFmode)
40356 || (TARGET_80387 && mode == XFmode)
40357 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40358 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40359 units = GET_MODE_SIZE (mode);
40360 break;
40361
40362 case MODE_COMPLEX_FLOAT:
40363 if ((TARGET_SSE && mode == TCmode)
40364 || (TARGET_80387 && mode == XCmode)
40365 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40366 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40367 units = GET_MODE_SIZE (mode);
40368 break;
40369
40370 case MODE_VECTOR_INT:
40371 case MODE_VECTOR_FLOAT:
40372 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40373 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40374 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40375 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40376 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40377 units = GET_MODE_SIZE (mode);
40378 }
40379
40380 /* Return the cost of moving between two registers of mode MODE,
40381 assuming that the move will be in pieces of at most UNITS bytes. */
40382 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40383 }
40384
40385 /* Compute a (partial) cost for rtx X. Return true if the complete
40386 cost has been computed, and false if subexpressions should be
40387 scanned. In either case, *TOTAL contains the cost result. */
40388
40389 static bool
40390 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40391 int *total, bool speed)
40392 {
40393 rtx mask;
40394 enum rtx_code code = GET_CODE (x);
40395 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40396 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40397 int src_cost;
40398
40399 switch (code)
40400 {
40401 case SET:
40402 if (register_operand (SET_DEST (x), VOIDmode)
40403 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40404 {
40405 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40406 return true;
40407 }
40408
40409 if (register_operand (SET_SRC (x), VOIDmode))
40410 /* Avoid potentially incorrect high cost from rtx_costs
40411 for non-tieable SUBREGs. */
40412 src_cost = 0;
40413 else
40414 {
40415 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40416
40417 if (CONSTANT_P (SET_SRC (x)))
40418 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40419 a small value, possibly zero for cheap constants. */
40420 src_cost += COSTS_N_INSNS (1);
40421 }
40422
40423 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40424 return true;
40425
40426 case CONST_INT:
40427 case CONST:
40428 case LABEL_REF:
40429 case SYMBOL_REF:
40430 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40431 *total = 3;
40432 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40433 *total = 2;
40434 else if (flag_pic && SYMBOLIC_CONST (x)
40435 && !(TARGET_64BIT
40436 && (GET_CODE (x) == LABEL_REF
40437 || (GET_CODE (x) == SYMBOL_REF
40438 && SYMBOL_REF_LOCAL_P (x))))
40439 /* Use 0 cost for CONST to improve its propagation. */
40440 && (TARGET_64BIT || GET_CODE (x) != CONST))
40441 *total = 1;
40442 else
40443 *total = 0;
40444 return true;
40445
40446 case CONST_DOUBLE:
40447 if (IS_STACK_MODE (mode))
40448 switch (standard_80387_constant_p (x))
40449 {
40450 case -1:
40451 case 0:
40452 break;
40453 case 1: /* 0.0 */
40454 *total = 1;
40455 return true;
40456 default: /* Other constants */
40457 *total = 2;
40458 return true;
40459 }
40460 /* FALLTHRU */
40461
40462 case CONST_VECTOR:
40463 switch (standard_sse_constant_p (x, mode))
40464 {
40465 case 0:
40466 break;
40467 case 1: /* 0: xor eliminates false dependency */
40468 *total = 0;
40469 return true;
40470 default: /* -1: cmp contains false dependency */
40471 *total = 1;
40472 return true;
40473 }
40474 /* FALLTHRU */
40475
40476 case CONST_WIDE_INT:
40477 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40478 it'll probably end up. Add a penalty for size. */
40479 *total = (COSTS_N_INSNS (1)
40480 + (!TARGET_64BIT && flag_pic)
40481 + (GET_MODE_SIZE (mode) <= 4
40482 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40483 return true;
40484
40485 case ZERO_EXTEND:
40486 /* The zero extension is often completely free on x86_64, so make
40487 it as cheap as possible. */
40488 if (TARGET_64BIT && mode == DImode
40489 && GET_MODE (XEXP (x, 0)) == SImode)
40490 *total = 1;
40491 else if (TARGET_ZERO_EXTEND_WITH_AND)
40492 *total = cost->add;
40493 else
40494 *total = cost->movzx;
40495 return false;
40496
40497 case SIGN_EXTEND:
40498 *total = cost->movsx;
40499 return false;
40500
40501 case ASHIFT:
40502 if (SCALAR_INT_MODE_P (mode)
40503 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40504 && CONST_INT_P (XEXP (x, 1)))
40505 {
40506 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40507 if (value == 1)
40508 {
40509 *total = cost->add;
40510 return false;
40511 }
40512 if ((value == 2 || value == 3)
40513 && cost->lea <= cost->shift_const)
40514 {
40515 *total = cost->lea;
40516 return false;
40517 }
40518 }
40519 /* FALLTHRU */
40520
40521 case ROTATE:
40522 case ASHIFTRT:
40523 case LSHIFTRT:
40524 case ROTATERT:
40525 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40526 {
40527 /* ??? Should be SSE vector operation cost. */
40528 /* At least for published AMD latencies, this really is the same
40529 as the latency for a simple fpu operation like fabs. */
40530 /* V*QImode is emulated with 1-11 insns. */
40531 if (mode == V16QImode || mode == V32QImode)
40532 {
40533 int count = 11;
40534 if (TARGET_XOP && mode == V16QImode)
40535 {
40536 /* For XOP we use vpshab, which requires a broadcast of the
40537 value to the variable shift insn. For constants this
40538 means a V16Q const in mem; even when we can perform the
40539 shift with one insn set the cost to prefer paddb. */
40540 if (CONSTANT_P (XEXP (x, 1)))
40541 {
40542 *total = (cost->fabs
40543 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
40544 + (speed ? 2 : COSTS_N_BYTES (16)));
40545 return true;
40546 }
40547 count = 3;
40548 }
40549 else if (TARGET_SSSE3)
40550 count = 7;
40551 *total = cost->fabs * count;
40552 }
40553 else
40554 *total = cost->fabs;
40555 }
40556 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40557 {
40558 if (CONST_INT_P (XEXP (x, 1)))
40559 {
40560 if (INTVAL (XEXP (x, 1)) > 32)
40561 *total = cost->shift_const + COSTS_N_INSNS (2);
40562 else
40563 *total = cost->shift_const * 2;
40564 }
40565 else
40566 {
40567 if (GET_CODE (XEXP (x, 1)) == AND)
40568 *total = cost->shift_var * 2;
40569 else
40570 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
40571 }
40572 }
40573 else
40574 {
40575 if (CONST_INT_P (XEXP (x, 1)))
40576 *total = cost->shift_const;
40577 else if (SUBREG_P (XEXP (x, 1))
40578 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
40579 {
40580 /* Return the cost after shift-and truncation. */
40581 *total = cost->shift_var;
40582 return true;
40583 }
40584 else
40585 *total = cost->shift_var;
40586 }
40587 return false;
40588
40589 case FMA:
40590 {
40591 rtx sub;
40592
40593 gcc_assert (FLOAT_MODE_P (mode));
40594 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40595
40596 /* ??? SSE scalar/vector cost should be used here. */
40597 /* ??? Bald assumption that fma has the same cost as fmul. */
40598 *total = cost->fmul;
40599 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40600
40601 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
40602 sub = XEXP (x, 0);
40603 if (GET_CODE (sub) == NEG)
40604 sub = XEXP (sub, 0);
40605 *total += rtx_cost (sub, mode, FMA, 0, speed);
40606
40607 sub = XEXP (x, 2);
40608 if (GET_CODE (sub) == NEG)
40609 sub = XEXP (sub, 0);
40610 *total += rtx_cost (sub, mode, FMA, 2, speed);
40611 return true;
40612 }
40613
40614 case MULT:
40615 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40616 {
40617 /* ??? SSE scalar cost should be used here. */
40618 *total = cost->fmul;
40619 return false;
40620 }
40621 else if (X87_FLOAT_MODE_P (mode))
40622 {
40623 *total = cost->fmul;
40624 return false;
40625 }
40626 else if (FLOAT_MODE_P (mode))
40627 {
40628 /* ??? SSE vector cost should be used here. */
40629 *total = cost->fmul;
40630 return false;
40631 }
40632 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40633 {
40634 /* V*QImode is emulated with 7-13 insns. */
40635 if (mode == V16QImode || mode == V32QImode)
40636 {
40637 int extra = 11;
40638 if (TARGET_XOP && mode == V16QImode)
40639 extra = 5;
40640 else if (TARGET_SSSE3)
40641 extra = 6;
40642 *total = cost->fmul * 2 + cost->fabs * extra;
40643 }
40644 /* V*DImode is emulated with 5-8 insns. */
40645 else if (mode == V2DImode || mode == V4DImode)
40646 {
40647 if (TARGET_XOP && mode == V2DImode)
40648 *total = cost->fmul * 2 + cost->fabs * 3;
40649 else
40650 *total = cost->fmul * 3 + cost->fabs * 5;
40651 }
40652 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40653 insns, including two PMULUDQ. */
40654 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40655 *total = cost->fmul * 2 + cost->fabs * 5;
40656 else
40657 *total = cost->fmul;
40658 return false;
40659 }
40660 else
40661 {
40662 rtx op0 = XEXP (x, 0);
40663 rtx op1 = XEXP (x, 1);
40664 int nbits;
40665 if (CONST_INT_P (XEXP (x, 1)))
40666 {
40667 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40668 for (nbits = 0; value != 0; value &= value - 1)
40669 nbits++;
40670 }
40671 else
40672 /* This is arbitrary. */
40673 nbits = 7;
40674
40675 /* Compute costs correctly for widening multiplication. */
40676 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40677 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40678 == GET_MODE_SIZE (mode))
40679 {
40680 int is_mulwiden = 0;
40681 machine_mode inner_mode = GET_MODE (op0);
40682
40683 if (GET_CODE (op0) == GET_CODE (op1))
40684 is_mulwiden = 1, op1 = XEXP (op1, 0);
40685 else if (CONST_INT_P (op1))
40686 {
40687 if (GET_CODE (op0) == SIGN_EXTEND)
40688 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40689 == INTVAL (op1);
40690 else
40691 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40692 }
40693
40694 if (is_mulwiden)
40695 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40696 }
40697
40698 *total = (cost->mult_init[MODE_INDEX (mode)]
40699 + nbits * cost->mult_bit
40700 + rtx_cost (op0, mode, outer_code, opno, speed)
40701 + rtx_cost (op1, mode, outer_code, opno, speed));
40702
40703 return true;
40704 }
40705
40706 case DIV:
40707 case UDIV:
40708 case MOD:
40709 case UMOD:
40710 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40711 /* ??? SSE cost should be used here. */
40712 *total = cost->fdiv;
40713 else if (X87_FLOAT_MODE_P (mode))
40714 *total = cost->fdiv;
40715 else if (FLOAT_MODE_P (mode))
40716 /* ??? SSE vector cost should be used here. */
40717 *total = cost->fdiv;
40718 else
40719 *total = cost->divide[MODE_INDEX (mode)];
40720 return false;
40721
40722 case PLUS:
40723 if (GET_MODE_CLASS (mode) == MODE_INT
40724 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40725 {
40726 if (GET_CODE (XEXP (x, 0)) == PLUS
40727 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40728 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40729 && CONSTANT_P (XEXP (x, 1)))
40730 {
40731 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40732 if (val == 2 || val == 4 || val == 8)
40733 {
40734 *total = cost->lea;
40735 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40736 outer_code, opno, speed);
40737 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40738 outer_code, opno, speed);
40739 *total += rtx_cost (XEXP (x, 1), mode,
40740 outer_code, opno, speed);
40741 return true;
40742 }
40743 }
40744 else if (GET_CODE (XEXP (x, 0)) == MULT
40745 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40746 {
40747 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40748 if (val == 2 || val == 4 || val == 8)
40749 {
40750 *total = cost->lea;
40751 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40752 outer_code, opno, speed);
40753 *total += rtx_cost (XEXP (x, 1), mode,
40754 outer_code, opno, speed);
40755 return true;
40756 }
40757 }
40758 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40759 {
40760 *total = cost->lea;
40761 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40762 outer_code, opno, speed);
40763 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40764 outer_code, opno, speed);
40765 *total += rtx_cost (XEXP (x, 1), mode,
40766 outer_code, opno, speed);
40767 return true;
40768 }
40769 }
40770 /* FALLTHRU */
40771
40772 case MINUS:
40773 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40774 {
40775 /* ??? SSE cost should be used here. */
40776 *total = cost->fadd;
40777 return false;
40778 }
40779 else if (X87_FLOAT_MODE_P (mode))
40780 {
40781 *total = cost->fadd;
40782 return false;
40783 }
40784 else if (FLOAT_MODE_P (mode))
40785 {
40786 /* ??? SSE vector cost should be used here. */
40787 *total = cost->fadd;
40788 return false;
40789 }
40790 /* FALLTHRU */
40791
40792 case AND:
40793 case IOR:
40794 case XOR:
40795 if (GET_MODE_CLASS (mode) == MODE_INT
40796 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40797 {
40798 *total = (cost->add * 2
40799 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40800 << (GET_MODE (XEXP (x, 0)) != DImode))
40801 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40802 << (GET_MODE (XEXP (x, 1)) != DImode)));
40803 return true;
40804 }
40805 /* FALLTHRU */
40806
40807 case NEG:
40808 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40809 {
40810 /* ??? SSE cost should be used here. */
40811 *total = cost->fchs;
40812 return false;
40813 }
40814 else if (X87_FLOAT_MODE_P (mode))
40815 {
40816 *total = cost->fchs;
40817 return false;
40818 }
40819 else if (FLOAT_MODE_P (mode))
40820 {
40821 /* ??? SSE vector cost should be used here. */
40822 *total = cost->fchs;
40823 return false;
40824 }
40825 /* FALLTHRU */
40826
40827 case NOT:
40828 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40829 {
40830 /* ??? Should be SSE vector operation cost. */
40831 /* At least for published AMD latencies, this really is the same
40832 as the latency for a simple fpu operation like fabs. */
40833 *total = cost->fabs;
40834 }
40835 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40836 *total = cost->add * 2;
40837 else
40838 *total = cost->add;
40839 return false;
40840
40841 case COMPARE:
40842 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40843 && XEXP (XEXP (x, 0), 1) == const1_rtx
40844 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40845 && XEXP (x, 1) == const0_rtx)
40846 {
40847 /* This kind of construct is implemented using test[bwl].
40848 Treat it as if we had an AND. */
40849 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40850 *total = (cost->add
40851 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40852 opno, speed)
40853 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40854 return true;
40855 }
40856
40857 /* The embedded comparison operand is completely free. */
40858 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40859 && XEXP (x, 1) == const0_rtx)
40860 *total = 0;
40861
40862 return false;
40863
40864 case FLOAT_EXTEND:
40865 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40866 *total = 0;
40867 return false;
40868
40869 case ABS:
40870 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40871 /* ??? SSE cost should be used here. */
40872 *total = cost->fabs;
40873 else if (X87_FLOAT_MODE_P (mode))
40874 *total = cost->fabs;
40875 else if (FLOAT_MODE_P (mode))
40876 /* ??? SSE vector cost should be used here. */
40877 *total = cost->fabs;
40878 return false;
40879
40880 case SQRT:
40881 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40882 /* ??? SSE cost should be used here. */
40883 *total = cost->fsqrt;
40884 else if (X87_FLOAT_MODE_P (mode))
40885 *total = cost->fsqrt;
40886 else if (FLOAT_MODE_P (mode))
40887 /* ??? SSE vector cost should be used here. */
40888 *total = cost->fsqrt;
40889 return false;
40890
40891 case UNSPEC:
40892 if (XINT (x, 1) == UNSPEC_TP)
40893 *total = 0;
40894 return false;
40895
40896 case VEC_SELECT:
40897 case VEC_CONCAT:
40898 case VEC_DUPLICATE:
40899 /* ??? Assume all of these vector manipulation patterns are
40900 recognizable. In which case they all pretty much have the
40901 same cost. */
40902 *total = cost->fabs;
40903 return true;
40904 case VEC_MERGE:
40905 mask = XEXP (x, 2);
40906 /* This is a masked instruction; assume the same cost
40907 as the nonmasked variant. */
40908 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40909 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40910 else
40911 *total = cost->fabs;
40912 return true;
40913
40914 default:
40915 return false;
40916 }
40917 }
40918
40919 #if TARGET_MACHO
40920
40921 static int current_machopic_label_num;
40922
40923 /* Given a symbol name and its associated stub, write out the
40924 definition of the stub. */
40925
40926 void
40927 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40928 {
40929 unsigned int length;
40930 char *binder_name, *symbol_name, lazy_ptr_name[32];
40931 int label = ++current_machopic_label_num;
40932
40933 /* For 64-bit we shouldn't get here. */
40934 gcc_assert (!TARGET_64BIT);
40935
40936 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40937 symb = targetm.strip_name_encoding (symb);
40938
40939 length = strlen (stub);
40940 binder_name = XALLOCAVEC (char, length + 32);
40941 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40942
40943 length = strlen (symb);
40944 symbol_name = XALLOCAVEC (char, length + 32);
40945 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40946
40947 sprintf (lazy_ptr_name, "L%d$lz", label);
40948
40949 if (MACHOPIC_ATT_STUB)
40950 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40951 else if (MACHOPIC_PURE)
40952 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40953 else
40954 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40955
40956 fprintf (file, "%s:\n", stub);
40957 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40958
40959 if (MACHOPIC_ATT_STUB)
40960 {
40961 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40962 }
40963 else if (MACHOPIC_PURE)
40964 {
40965 /* PIC stub. */
40966 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40967 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40968 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40969 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40970 label, lazy_ptr_name, label);
40971 fprintf (file, "\tjmp\t*%%ecx\n");
40972 }
40973 else
40974 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40975
40976 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40977 it needs no stub-binding-helper. */
40978 if (MACHOPIC_ATT_STUB)
40979 return;
40980
40981 fprintf (file, "%s:\n", binder_name);
40982
40983 if (MACHOPIC_PURE)
40984 {
40985 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40986 fprintf (file, "\tpushl\t%%ecx\n");
40987 }
40988 else
40989 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40990
40991 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40992
40993 /* N.B. Keep the correspondence of these
40994 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40995 old-pic/new-pic/non-pic stubs; altering this will break
40996 compatibility with existing dylibs. */
40997 if (MACHOPIC_PURE)
40998 {
40999 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41000 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41001 }
41002 else
41003 /* 16-byte -mdynamic-no-pic stub. */
41004 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41005
41006 fprintf (file, "%s:\n", lazy_ptr_name);
41007 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41008 fprintf (file, ASM_LONG "%s\n", binder_name);
41009 }
41010 #endif /* TARGET_MACHO */
41011
41012 /* Order the registers for the register allocator. */
41013
41014 void
41015 x86_order_regs_for_local_alloc (void)
41016 {
41017 int pos = 0;
41018 int i;
41019
41020 /* First allocate the local general purpose registers. */
41021 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41022 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41023 reg_alloc_order [pos++] = i;
41024
41025 /* Global general purpose registers. */
41026 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41027 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41028 reg_alloc_order [pos++] = i;
41029
41030 /* x87 registers come first in case we are doing FP math
41031 using them. */
41032 if (!TARGET_SSE_MATH)
41033 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41034 reg_alloc_order [pos++] = i;
41035
41036 /* SSE registers. */
41037 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41038 reg_alloc_order [pos++] = i;
41039 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41040 reg_alloc_order [pos++] = i;
41041
41042 /* Extended REX SSE registers. */
41043 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41044 reg_alloc_order [pos++] = i;
41045
41046 /* Mask registers. */
41047 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41048 reg_alloc_order [pos++] = i;
41049
41050 /* MPX bound registers. */
41051 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41052 reg_alloc_order [pos++] = i;
41053
41054 /* x87 registers. */
41055 if (TARGET_SSE_MATH)
41056 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41057 reg_alloc_order [pos++] = i;
41058
41059 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41060 reg_alloc_order [pos++] = i;
41061
41062 /* Initialize the rest of the array, as we do not allocate some registers
41063 at all. */
41064 while (pos < FIRST_PSEUDO_REGISTER)
41065 reg_alloc_order [pos++] = 0;
41066 }
41067
41068 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41069 in struct attribute_spec.handler. */
41070 static tree
41071 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
41072 tree args,
41073 int,
41074 bool *no_add_attrs)
41075 {
41076 if (TREE_CODE (*node) != FUNCTION_TYPE
41077 && TREE_CODE (*node) != METHOD_TYPE
41078 && TREE_CODE (*node) != FIELD_DECL
41079 && TREE_CODE (*node) != TYPE_DECL)
41080 {
41081 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41082 name);
41083 *no_add_attrs = true;
41084 return NULL_TREE;
41085 }
41086 if (TARGET_64BIT)
41087 {
41088 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41089 name);
41090 *no_add_attrs = true;
41091 return NULL_TREE;
41092 }
41093 if (is_attribute_p ("callee_pop_aggregate_return", name))
41094 {
41095 tree cst;
41096
41097 cst = TREE_VALUE (args);
41098 if (TREE_CODE (cst) != INTEGER_CST)
41099 {
41100 warning (OPT_Wattributes,
41101 "%qE attribute requires an integer constant argument",
41102 name);
41103 *no_add_attrs = true;
41104 }
41105 else if (compare_tree_int (cst, 0) != 0
41106 && compare_tree_int (cst, 1) != 0)
41107 {
41108 warning (OPT_Wattributes,
41109 "argument to %qE attribute is neither zero nor one",
41110 name);
41111 *no_add_attrs = true;
41112 }
41113
41114 return NULL_TREE;
41115 }
41116
41117 return NULL_TREE;
41118 }
41119
41120 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
41121 struct attribute_spec.handler. */
41122 static tree
41123 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41124 bool *no_add_attrs)
41125 {
41126 if (TREE_CODE (*node) != FUNCTION_TYPE
41127 && TREE_CODE (*node) != METHOD_TYPE
41128 && TREE_CODE (*node) != FIELD_DECL
41129 && TREE_CODE (*node) != TYPE_DECL)
41130 {
41131 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41132 name);
41133 *no_add_attrs = true;
41134 return NULL_TREE;
41135 }
41136
41137 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
41138 if (is_attribute_p ("ms_abi", name))
41139 {
41140 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41141 {
41142 error ("ms_abi and sysv_abi attributes are not compatible");
41143 }
41144
41145 return NULL_TREE;
41146 }
41147 else if (is_attribute_p ("sysv_abi", name))
41148 {
41149 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41150 {
41151 error ("ms_abi and sysv_abi attributes are not compatible");
41152 }
41153
41154 return NULL_TREE;
41155 }
41156
41157 return NULL_TREE;
41158 }
41159
41160 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41161 struct attribute_spec.handler. */
41162 static tree
41163 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41164 bool *no_add_attrs)
41165 {
41166 tree *type = NULL;
41167 if (DECL_P (*node))
41168 {
41169 if (TREE_CODE (*node) == TYPE_DECL)
41170 type = &TREE_TYPE (*node);
41171 }
41172 else
41173 type = node;
41174
41175 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41176 {
41177 warning (OPT_Wattributes, "%qE attribute ignored",
41178 name);
41179 *no_add_attrs = true;
41180 }
41181
41182 else if ((is_attribute_p ("ms_struct", name)
41183 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41184 || ((is_attribute_p ("gcc_struct", name)
41185 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41186 {
41187 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41188 name);
41189 *no_add_attrs = true;
41190 }
41191
41192 return NULL_TREE;
41193 }
41194
41195 static tree
41196 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
41197 bool *no_add_attrs)
41198 {
41199 if (TREE_CODE (*node) != FUNCTION_DECL)
41200 {
41201 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41202 name);
41203 *no_add_attrs = true;
41204 }
41205 return NULL_TREE;
41206 }
41207
41208 static tree
41209 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41210 int, bool *)
41211 {
41212 return NULL_TREE;
41213 }
41214
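/* Handle an "interrupt" attribute; check the signature of the decorated
   function.  Editorial usage sketch (not from the original source): a
   conforming handler on a 64-bit target might be declared as

     struct interrupt_frame;
     __attribute__ ((interrupt))
     void handler (struct interrupt_frame *frame, unsigned long error_code);

   i.e. a void function whose first argument is a pointer and whose optional
   second argument is an unsigned word-sized integer.  */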
41215 static tree
41216 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41217 {
41218 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
41219 but the function type contains argument and return type data. */
41220 tree func_type = *node;
41221 tree return_type = TREE_TYPE (func_type);
41222
41223 int nargs = 0;
41224 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41225 while (current_arg_type
41226 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41227 {
41228 if (nargs == 0)
41229 {
41230 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41231 error ("interrupt service routine should have a pointer "
41232 "as the first argument");
41233 }
41234 else if (nargs == 1)
41235 {
41236 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41237 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41238 error ("interrupt service routine should have unsigned %s"
41239 "int as the second argument",
41240 TARGET_64BIT
41241 ? (TARGET_X32 ? "long long " : "long ")
41242 : "");
41243 }
41244 nargs++;
41245 current_arg_type = TREE_CHAIN (current_arg_type);
41246 }
41247 if (!nargs || nargs > 2)
41248 error ("interrupt service routine can only have a pointer argument "
41249 "and an optional integer argument");
41250 if (! VOID_TYPE_P (return_type))
41251 error ("interrupt service routine can't have non-void return value");
41252
41253 return NULL_TREE;
41254 }
41255
41256 static bool
41257 ix86_ms_bitfield_layout_p (const_tree record_type)
41258 {
41259 return ((TARGET_MS_BITFIELD_LAYOUT
41260 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41261 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41262 }
41263
41264 /* Returns an expression indicating where the this parameter is
41265 located on entry to the FUNCTION. */
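/* Editorial summary of the cases handled below (approximate; hidden
   aggregate-return pointers shift `this' by one slot or onto the stack):
   64-bit ABIs pass `this' in the first integer argument register (%rdi for
   the SysV ABI, %rcx for the MS ABI); 32-bit fastcall and thiscall use %ecx;
   other regparm functions use %eax; otherwise `this' lives on the stack at
   4(%esp), or at 8(%esp) when an aggregate return pointer is passed first.  */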
41266
41267 static rtx
41268 x86_this_parameter (tree function)
41269 {
41270 tree type = TREE_TYPE (function);
41271 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41272 int nregs;
41273
41274 if (TARGET_64BIT)
41275 {
41276 const int *parm_regs;
41277
41278 if (ix86_function_type_abi (type) == MS_ABI)
41279 parm_regs = x86_64_ms_abi_int_parameter_registers;
41280 else
41281 parm_regs = x86_64_int_parameter_registers;
41282 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41283 }
41284
41285 nregs = ix86_function_regparm (type, function);
41286
41287 if (nregs > 0 && !stdarg_p (type))
41288 {
41289 int regno;
41290 unsigned int ccvt = ix86_get_callcvt (type);
41291
41292 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41293 regno = aggr ? DX_REG : CX_REG;
41294 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41295 {
41296 regno = CX_REG;
41297 if (aggr)
41298 return gen_rtx_MEM (SImode,
41299 plus_constant (Pmode, stack_pointer_rtx, 4));
41300 }
41301 else
41302 {
41303 regno = AX_REG;
41304 if (aggr)
41305 {
41306 regno = DX_REG;
41307 if (nregs == 1)
41308 return gen_rtx_MEM (SImode,
41309 plus_constant (Pmode,
41310 stack_pointer_rtx, 4));
41311 }
41312 }
41313 return gen_rtx_REG (SImode, regno);
41314 }
41315
41316 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41317 aggr ? 8 : 4));
41318 }
41319
41320 /* Determine whether x86_output_mi_thunk can succeed. */
41321
41322 static bool
41323 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41324 const_tree function)
41325 {
41326 /* 64-bit can handle anything. */
41327 if (TARGET_64BIT)
41328 return true;
41329
41330 /* For 32-bit, everything's fine if we have one free register. */
41331 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41332 return true;
41333
41334 /* Need a free register for vcall_offset. */
41335 if (vcall_offset)
41336 return false;
41337
41338 /* Need a free register for GOT references. */
41339 if (flag_pic && !targetm.binds_local_p (function))
41340 return false;
41341
41342 /* Otherwise ok. */
41343 return true;
41344 }
41345
41346 /* Output the assembler code for a thunk function. THUNK_DECL is the
41347 declaration for the thunk function itself, FUNCTION is the decl for
41348 the target function. DELTA is an immediate constant offset to be
41349 added to THIS. If VCALL_OFFSET is nonzero, the word at
41350 *(*this + vcall_offset) should be added to THIS. */
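/* Editorial sketch (not from the original source) of what the emitted thunk
   does, in pseudo-C, treating `this' as a byte pointer and VCALL_OFFSET as a
   byte offset into the vtable:

     this += DELTA;
     if (VCALL_OFFSET != 0)
       this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
     goto FUNCTION;   // sibling call / jump with the adjusted `this'

   The code below emits these adjustments followed by a sibling call or an
   indirect jump to FUNCTION.  */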
41351
41352 static void
41353 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41354 HOST_WIDE_INT vcall_offset, tree function)
41355 {
41356 rtx this_param = x86_this_parameter (function);
41357 rtx this_reg, tmp, fnaddr;
41358 unsigned int tmp_regno;
41359 rtx_insn *insn;
41360
41361 if (TARGET_64BIT)
41362 tmp_regno = R10_REG;
41363 else
41364 {
41365 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41366 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41367 tmp_regno = AX_REG;
41368 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41369 tmp_regno = DX_REG;
41370 else
41371 tmp_regno = CX_REG;
41372 }
41373
41374 emit_note (NOTE_INSN_PROLOGUE_END);
41375
41376 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41377 pull it in now and let DELTA benefit. */
41378 if (REG_P (this_param))
41379 this_reg = this_param;
41380 else if (vcall_offset)
41381 {
41382 /* Put the this parameter into %eax. */
41383 this_reg = gen_rtx_REG (Pmode, AX_REG);
41384 emit_move_insn (this_reg, this_param);
41385 }
41386 else
41387 this_reg = NULL_RTX;
41388
41389 /* Adjust the this parameter by a fixed constant. */
41390 if (delta)
41391 {
41392 rtx delta_rtx = GEN_INT (delta);
41393 rtx delta_dst = this_reg ? this_reg : this_param;
41394
41395 if (TARGET_64BIT)
41396 {
41397 if (!x86_64_general_operand (delta_rtx, Pmode))
41398 {
41399 tmp = gen_rtx_REG (Pmode, tmp_regno);
41400 emit_move_insn (tmp, delta_rtx);
41401 delta_rtx = tmp;
41402 }
41403 }
41404
41405 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41406 }
41407
41408 /* Adjust the this parameter by a value stored in the vtable. */
41409 if (vcall_offset)
41410 {
41411 rtx vcall_addr, vcall_mem, this_mem;
41412
41413 tmp = gen_rtx_REG (Pmode, tmp_regno);
41414
41415 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41416 if (Pmode != ptr_mode)
41417 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41418 emit_move_insn (tmp, this_mem);
41419
41420 /* Adjust the this parameter. */
41421 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41422 if (TARGET_64BIT
41423 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41424 {
41425 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41426 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41427 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41428 }
41429
41430 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41431 if (Pmode != ptr_mode)
41432 emit_insn (gen_addsi_1_zext (this_reg,
41433 gen_rtx_REG (ptr_mode,
41434 REGNO (this_reg)),
41435 vcall_mem));
41436 else
41437 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41438 }
41439
41440 /* If necessary, drop THIS back to its stack slot. */
41441 if (this_reg && this_reg != this_param)
41442 emit_move_insn (this_param, this_reg);
41443
41444 fnaddr = XEXP (DECL_RTL (function), 0);
41445 if (TARGET_64BIT)
41446 {
41447 if (!flag_pic || targetm.binds_local_p (function)
41448 || TARGET_PECOFF)
41449 ;
41450 else
41451 {
41452 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41453 tmp = gen_rtx_CONST (Pmode, tmp);
41454 fnaddr = gen_const_mem (Pmode, tmp);
41455 }
41456 }
41457 else
41458 {
41459 if (!flag_pic || targetm.binds_local_p (function))
41460 ;
41461 #if TARGET_MACHO
41462 else if (TARGET_MACHO)
41463 {
41464 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41465 fnaddr = XEXP (fnaddr, 0);
41466 }
41467 #endif /* TARGET_MACHO */
41468 else
41469 {
41470 tmp = gen_rtx_REG (Pmode, CX_REG);
41471 output_set_got (tmp, NULL_RTX);
41472
41473 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41474 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41475 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41476 fnaddr = gen_const_mem (Pmode, fnaddr);
41477 }
41478 }
41479
41480 /* Our sibling call patterns do not allow memories, because we have no
41481 predicate that can distinguish between frame and non-frame memory.
41482 For our purposes here, we can get away with (ab)using a jump pattern,
41483 because we're going to do no optimization. */
41484 if (MEM_P (fnaddr))
41485 {
41486 if (sibcall_insn_operand (fnaddr, word_mode))
41487 {
41488 fnaddr = XEXP (DECL_RTL (function), 0);
41489 tmp = gen_rtx_MEM (QImode, fnaddr);
41490 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41491 tmp = emit_call_insn (tmp);
41492 SIBLING_CALL_P (tmp) = 1;
41493 }
41494 else
41495 emit_jump_insn (gen_indirect_jump (fnaddr));
41496 }
41497 else
41498 {
41499 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41500 {
41501 // CM_LARGE_PIC always uses a pseudo PIC register which is
41502 // uninitialized. Since FUNCTION is local and calling it
41503 // doesn't go through the PLT, we use scratch register %r11 as
41504 // the PIC register and initialize it here.
41505 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41506 ix86_init_large_pic_reg (tmp_regno);
41507 fnaddr = legitimize_pic_address (fnaddr,
41508 gen_rtx_REG (Pmode, tmp_regno));
41509 }
41510
41511 if (!sibcall_insn_operand (fnaddr, word_mode))
41512 {
41513 tmp = gen_rtx_REG (word_mode, tmp_regno);
41514 if (GET_MODE (fnaddr) != word_mode)
41515 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41516 emit_move_insn (tmp, fnaddr);
41517 fnaddr = tmp;
41518 }
41519
41520 tmp = gen_rtx_MEM (QImode, fnaddr);
41521 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41522 tmp = emit_call_insn (tmp);
41523 SIBLING_CALL_P (tmp) = 1;
41524 }
41525 emit_barrier ();
41526
41527 /* Emit just enough of rest_of_compilation to get the insns emitted.
41528 Note that use_thunk calls assemble_start_function et al. */
41529 insn = get_insns ();
41530 shorten_branches (insn);
41531 final_start_function (insn, file, 1);
41532 final (insn, file, 1);
41533 final_end_function ();
41534 }
41535
41536 static void
41537 x86_file_start (void)
41538 {
41539 default_file_start ();
41540 if (TARGET_16BIT)
41541 fputs ("\t.code16gcc\n", asm_out_file);
41542 #if TARGET_MACHO
41543 darwin_file_start ();
41544 #endif
41545 if (X86_FILE_START_VERSION_DIRECTIVE)
41546 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41547 if (X86_FILE_START_FLTUSED)
41548 fputs ("\t.global\t__fltused\n", asm_out_file);
41549 if (ix86_asm_dialect == ASM_INTEL)
41550 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41551 }
41552
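/* Editorial note: x86_field_alignment below caps the alignment of scalar
   integer and floating-point fields at 32 bits on 32-bit targets without
   -malign-double.  For example, in `struct { char c; double d; }' the member
   `d' ends up 4-byte rather than 8-byte aligned, matching the traditional
   ia32 ABI.  */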
41553 int
41554 x86_field_alignment (tree field, int computed)
41555 {
41556 machine_mode mode;
41557 tree type = TREE_TYPE (field);
41558
41559 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41560 return computed;
41561 if (TARGET_IAMCU)
41562 return iamcu_alignment (type, computed);
41563 mode = TYPE_MODE (strip_array_types (type));
41564 if (mode == DFmode || mode == DCmode
41565 || GET_MODE_CLASS (mode) == MODE_INT
41566 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41567 return MIN (32, computed);
41568 return computed;
41569 }
41570
41571 /* Print call to TARGET to FILE. */
41572
41573 static void
41574 x86_print_call_or_nop (FILE *file, const char *target)
41575 {
41576 if (flag_nop_mcount)
41577 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
41578 else
41579 fprintf (file, "1:\tcall\t%s\n", target);
41580 }
41581
41582 /* Output assembler code to FILE to increment profiler label # LABELNO
41583 for profiling a function entry. */
41584 void
41585 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41586 {
41587 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41588 : MCOUNT_NAME);
41589 if (TARGET_64BIT)
41590 {
41591 #ifndef NO_PROFILE_COUNTERS
41592 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41593 #endif
41594
41595 if (!TARGET_PECOFF && flag_pic)
41596 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41597 else
41598 x86_print_call_or_nop (file, mcount_name);
41599 }
41600 else if (flag_pic)
41601 {
41602 #ifndef NO_PROFILE_COUNTERS
41603 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41604 LPREFIX, labelno);
41605 #endif
41606 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41607 }
41608 else
41609 {
41610 #ifndef NO_PROFILE_COUNTERS
41611 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41612 LPREFIX, labelno);
41613 #endif
41614 x86_print_call_or_nop (file, mcount_name);
41615 }
41616
41617 if (flag_record_mcount)
41618 {
41619 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41620 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41621 fprintf (file, "\t.previous\n");
41622 }
41623 }
41624
41625 /* We don't have exact information about the insn sizes, but we may assume
41626 quite safely that we are informed about all 1-byte insns and memory
41627 address sizes. This is enough to eliminate unnecessary padding in
41628 99% of cases. */
41629
41630 static int
41631 min_insn_size (rtx_insn *insn)
41632 {
41633 int l = 0, len;
41634
41635 if (!INSN_P (insn) || !active_insn_p (insn))
41636 return 0;
41637
41638 /* Discard alignments we've emitted and jump instructions. */
41639 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41640 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41641 return 0;
41642
41643 /* Important case - calls are always 5 bytes.
41644 It is common to have many calls in a row. */
41645 if (CALL_P (insn)
41646 && symbolic_reference_mentioned_p (PATTERN (insn))
41647 && !SIBLING_CALL_P (insn))
41648 return 5;
41649 len = get_attr_length (insn);
41650 if (len <= 1)
41651 return 1;
41652
41653 /* For normal instructions we rely on get_attr_length being exact,
41654 with a few exceptions. */
41655 if (!JUMP_P (insn))
41656 {
41657 enum attr_type type = get_attr_type (insn);
41658
41659 switch (type)
41660 {
41661 case TYPE_MULTI:
41662 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41663 || asm_noperands (PATTERN (insn)) >= 0)
41664 return 0;
41665 break;
41666 case TYPE_OTHER:
41667 case TYPE_FCMP:
41668 break;
41669 default:
41670 /* Otherwise trust get_attr_length. */
41671 return len;
41672 }
41673
41674 l = get_attr_length_address (insn);
41675 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41676 l = 4;
41677 }
41678 if (l)
41679 return 1+l;
41680 else
41681 return 2;
41682 }
41683
41684 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41685
41686 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41687 16-byte window. */
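/* Editorial illustration: four short conditional jumps (2 bytes each) packed
   into one 16-byte fetch window would exceed that 3-branch limit; the pass
   below inserts padding so that the fourth branch starts in the next 16-byte
   window.  */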
41688
41689 static void
41690 ix86_avoid_jump_mispredicts (void)
41691 {
41692 rtx_insn *insn, *start = get_insns ();
41693 int nbytes = 0, njumps = 0;
41694 bool isjump = false;
41695
41696 /* Look for all minimal intervals of instructions containing 4 jumps.
41697 The intervals are bounded by START and INSN. NBYTES is the total
41698 size of the instructions in the interval, including INSN but not
41699 including START. When NBYTES is smaller than 16, it is possible
41700 that START and INSN end up in the same 16-byte window.
41701
41702 The smallest offset in the window at which INSN can start is when
41703 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
41704 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41705
41706 Don't consider an asm goto as a jump: while it can contain a jump, it
41707 doesn't have to, control transfer to its label(s) can be performed through
41708 other means, and we also estimate the minimum length of all asm stmts as 0. */
41709 for (insn = start; insn; insn = NEXT_INSN (insn))
41710 {
41711 int min_size;
41712
41713 if (LABEL_P (insn))
41714 {
41715 int align = label_to_alignment (insn);
41716 int max_skip = label_to_max_skip (insn);
41717
41718 if (max_skip > 15)
41719 max_skip = 15;
41720 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41721 already in the current 16 byte page, because otherwise
41722 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41723 bytes to reach 16 byte boundary. */
41724 if (align <= 0
41725 || (align <= 3 && max_skip != (1 << align) - 1))
41726 max_skip = 0;
41727 if (dump_file)
41728 fprintf (dump_file, "Label %i with max_skip %i\n",
41729 INSN_UID (insn), max_skip);
41730 if (max_skip)
41731 {
41732 while (nbytes + max_skip >= 16)
41733 {
41734 start = NEXT_INSN (start);
41735 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41736 || CALL_P (start))
41737 njumps--, isjump = true;
41738 else
41739 isjump = false;
41740 nbytes -= min_insn_size (start);
41741 }
41742 }
41743 continue;
41744 }
41745
41746 min_size = min_insn_size (insn);
41747 nbytes += min_size;
41748 if (dump_file)
41749 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41750 INSN_UID (insn), min_size);
41751 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41752 || CALL_P (insn))
41753 njumps++;
41754 else
41755 continue;
41756
41757 while (njumps > 3)
41758 {
41759 start = NEXT_INSN (start);
41760 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41761 || CALL_P (start))
41762 njumps--, isjump = true;
41763 else
41764 isjump = false;
41765 nbytes -= min_insn_size (start);
41766 }
41767 gcc_assert (njumps >= 0);
41768 if (dump_file)
41769 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41770 INSN_UID (start), INSN_UID (insn), nbytes);
41771
41772 if (njumps == 3 && isjump && nbytes < 16)
41773 {
41774 int padsize = 15 - nbytes + min_insn_size (insn);
41775
41776 if (dump_file)
41777 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41778 INSN_UID (insn), padsize);
41779 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41780 }
41781 }
41782 }
41783 #endif
41784
41785 /* The AMD Athlon works faster
41786 when RET is not the destination of a conditional jump or directly preceded
41787 by another jump instruction. We avoid the penalty by inserting a NOP just
41788 before RET instructions in such cases. */
41789 static void
41790 ix86_pad_returns (void)
41791 {
41792 edge e;
41793 edge_iterator ei;
41794
41795 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41796 {
41797 basic_block bb = e->src;
41798 rtx_insn *ret = BB_END (bb);
41799 rtx_insn *prev;
41800 bool replace = false;
41801
41802 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41803 || optimize_bb_for_size_p (bb))
41804 continue;
41805 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41806 if (active_insn_p (prev) || LABEL_P (prev))
41807 break;
41808 if (prev && LABEL_P (prev))
41809 {
41810 edge e;
41811 edge_iterator ei;
41812
41813 FOR_EACH_EDGE (e, ei, bb->preds)
41814 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41815 && !(e->flags & EDGE_FALLTHRU))
41816 {
41817 replace = true;
41818 break;
41819 }
41820 }
41821 if (!replace)
41822 {
41823 prev = prev_active_insn (ret);
41824 if (prev
41825 && ((JUMP_P (prev) && any_condjump_p (prev))
41826 || CALL_P (prev)))
41827 replace = true;
41828 /* Empty functions get a branch mispredict even when
41829 the jump destination is not visible to us. */
41830 if (!prev && !optimize_function_for_size_p (cfun))
41831 replace = true;
41832 }
41833 if (replace)
41834 {
41835 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41836 delete_insn (ret);
41837 }
41838 }
41839 }
41840
41841 /* Count the minimum number of instructions in BB. Return 4 if the
41842 number of instructions >= 4. */
41843
41844 static int
41845 ix86_count_insn_bb (basic_block bb)
41846 {
41847 rtx_insn *insn;
41848 int insn_count = 0;
41849
41850 /* Count number of instructions in this block. Return 4 if the number
41851 of instructions >= 4. */
41852 FOR_BB_INSNS (bb, insn)
41853 {
41854 /* This can only happen in exit blocks. */
41855 if (JUMP_P (insn)
41856 && ANY_RETURN_P (PATTERN (insn)))
41857 break;
41858
41859 if (NONDEBUG_INSN_P (insn)
41860 && GET_CODE (PATTERN (insn)) != USE
41861 && GET_CODE (PATTERN (insn)) != CLOBBER)
41862 {
41863 insn_count++;
41864 if (insn_count >= 4)
41865 return insn_count;
41866 }
41867 }
41868
41869 return insn_count;
41870 }
41871
41872
41873 /* Count the minimum number of instructions in code path in BB.
41874 Return 4 if the number of instructions >= 4. */
41875
41876 static int
41877 ix86_count_insn (basic_block bb)
41878 {
41879 edge e;
41880 edge_iterator ei;
41881 int min_prev_count;
41882
41883 /* Only bother counting instructions along paths with no
41884 more than 2 basic blocks between entry and exit. Given
41885 that BB has an edge to exit, determine if a predecessor
41886 of BB has an edge from entry. If so, compute the number
41887 of instructions in the predecessor block. If there
41888 happen to be multiple such blocks, compute the minimum. */
41889 min_prev_count = 4;
41890 FOR_EACH_EDGE (e, ei, bb->preds)
41891 {
41892 edge prev_e;
41893 edge_iterator prev_ei;
41894
41895 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41896 {
41897 min_prev_count = 0;
41898 break;
41899 }
41900 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41901 {
41902 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41903 {
41904 int count = ix86_count_insn_bb (e->src);
41905 if (count < min_prev_count)
41906 min_prev_count = count;
41907 break;
41908 }
41909 }
41910 }
41911
41912 if (min_prev_count < 4)
41913 min_prev_count += ix86_count_insn_bb (bb);
41914
41915 return min_prev_count;
41916 }
41917
41918 /* Pad short function to 4 instructions. */
41919
41920 static void
41921 ix86_pad_short_function (void)
41922 {
41923 edge e;
41924 edge_iterator ei;
41925
41926 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41927 {
41928 rtx_insn *ret = BB_END (e->src);
41929 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41930 {
41931 int insn_count = ix86_count_insn (e->src);
41932
41933 /* Pad short function. */
41934 if (insn_count < 4)
41935 {
41936 rtx_insn *insn = ret;
41937
41938 /* Find epilogue. */
41939 while (insn
41940 && (!NOTE_P (insn)
41941 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41942 insn = PREV_INSN (insn);
41943
41944 if (!insn)
41945 insn = ret;
41946
41947 /* Two NOPs count as one instruction. */
41948 insn_count = 2 * (4 - insn_count);
41949 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41950 }
41951 }
41952 }
41953 }
41954
41955 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41956 the epilogue, the Windows system unwinder will apply epilogue logic and
41957 produce incorrect offsets. This can be avoided by adding a nop between
41958 the last insn that can throw and the first insn of the epilogue. */
41959
41960 static void
41961 ix86_seh_fixup_eh_fallthru (void)
41962 {
41963 edge e;
41964 edge_iterator ei;
41965
41966 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41967 {
41968 rtx_insn *insn, *next;
41969
41970 /* Find the beginning of the epilogue. */
41971 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41972 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41973 break;
41974 if (insn == NULL)
41975 continue;
41976
41977 /* We only care about preceding insns that can throw. */
41978 insn = prev_active_insn (insn);
41979 if (insn == NULL || !can_throw_internal (insn))
41980 continue;
41981
41982 /* Do not separate calls from their debug information. */
41983 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41984 if (NOTE_P (next)
41985 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41986 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41987 insn = next;
41988 else
41989 break;
41990
41991 emit_insn_after (gen_nops (const1_rtx), insn);
41992 }
41993 }
41994
41995 /* Given a register number BASE, the lowest of a group of registers, update
41996 regsets IN and OUT with the registers that should be avoided in input
41997 and output operands respectively when trying to avoid generating a modr/m
41998 byte for -fmitigate-rop. */
41999
42000 static void
42001 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42002 {
42003 SET_HARD_REG_BIT (out, base);
42004 SET_HARD_REG_BIT (out, base + 1);
42005 SET_HARD_REG_BIT (in, base + 2);
42006 SET_HARD_REG_BIT (in, base + 3);
42007 }
42008
42009 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
42010 that certain encodings of modr/m bytes do not occur. */
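/* Editorial illustration (an assumption based on the register sets chosen
   below): with mod = 3 the modr/m byte is 0xC0 + reg * 8 + rm, so reg 0
   (%eax) with rm 3 (%ebx) yields 0xC3, the encoding of `ret'; reg 0 with
   rm 2 gives 0xC2 (`ret imm16'), and reg 1 (%ecx) with rm 3 or 2 gives
   0xCB/0xCA (far returns).  For instance `mov %eax, %ebx' encodes as 89 C3
   and thus embeds a `ret' byte that ROP gadget searches can target, which
   is presumably why %eax/%ecx are treated as risky in one operand position
   and %ebx/%edx in the other.  */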
42011 static void
42012 ix86_mitigate_rop (void)
42013 {
42014 HARD_REG_SET input_risky;
42015 HARD_REG_SET output_risky;
42016 HARD_REG_SET inout_risky;
42017
42018 CLEAR_HARD_REG_SET (output_risky);
42019 CLEAR_HARD_REG_SET (input_risky);
42020 SET_HARD_REG_BIT (output_risky, AX_REG);
42021 SET_HARD_REG_BIT (output_risky, CX_REG);
42022 SET_HARD_REG_BIT (input_risky, BX_REG);
42023 SET_HARD_REG_BIT (input_risky, DX_REG);
42024 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42025 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42026 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42027 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42028 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42029 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42030 COPY_HARD_REG_SET (inout_risky, input_risky);
42031 IOR_HARD_REG_SET (inout_risky, output_risky);
42032
42033 df_note_add_problem ();
42034 /* Fix up what stack-regs did. */
42035 df_insn_rescan_all ();
42036 df_analyze ();
42037
42038 regrename_init (true);
42039 regrename_analyze (NULL);
42040
42041 auto_vec<du_head_p> cands;
42042
42043 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42044 {
42045 if (!NONDEBUG_INSN_P (insn))
42046 continue;
42047
42048 if (GET_CODE (PATTERN (insn)) == USE
42049 || GET_CODE (PATTERN (insn)) == CLOBBER)
42050 continue;
42051
42052 extract_insn (insn);
42053
42054 int opno0, opno1;
42055 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42056 recog_data.n_operands, &opno0,
42057 &opno1);
42058
42059 if (!ix86_rop_should_change_byte_p (modrm))
42060 continue;
42061
42062 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42063
42064 /* This happens when regrename has to fail a block. */
42065 if (!info->op_info)
42066 continue;
42067
42068 if (info->op_info[opno0].n_chains != 0)
42069 {
42070 gcc_assert (info->op_info[opno0].n_chains == 1);
42071 du_head_p op0c;
42072 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42073 if (op0c->target_data_1 + op0c->target_data_2 == 0
42074 && !op0c->cannot_rename)
42075 cands.safe_push (op0c);
42076
42077 op0c->target_data_1++;
42078 }
42079 if (info->op_info[opno1].n_chains != 0)
42080 {
42081 gcc_assert (info->op_info[opno1].n_chains == 1);
42082 du_head_p op1c;
42083 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42084 if (op1c->target_data_1 + op1c->target_data_2 == 0
42085 && !op1c->cannot_rename)
42086 cands.safe_push (op1c);
42087
42088 op1c->target_data_2++;
42089 }
42090 }
42091
42092 int i;
42093 du_head_p head;
42094 FOR_EACH_VEC_ELT (cands, i, head)
42095 {
42096 int old_reg, best_reg;
42097 HARD_REG_SET unavailable;
42098
42099 CLEAR_HARD_REG_SET (unavailable);
42100 if (head->target_data_1)
42101 IOR_HARD_REG_SET (unavailable, output_risky);
42102 if (head->target_data_2)
42103 IOR_HARD_REG_SET (unavailable, input_risky);
42104
42105 int n_uses;
42106 reg_class superclass = regrename_find_superclass (head, &n_uses,
42107 &unavailable);
42108 old_reg = head->regno;
42109 best_reg = find_rename_reg (head, superclass, &unavailable,
42110 old_reg, false);
42111 bool ok = regrename_do_replace (head, best_reg);
42112 gcc_assert (ok);
42113 if (dump_file)
42114 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42115 reg_names[best_reg], reg_class_names[superclass]);
42116
42117 }
42118
42119 regrename_finish ();
42120
42121 df_analyze ();
42122
42123 basic_block bb;
42124 regset_head live;
42125
42126 INIT_REG_SET (&live);
42127
42128 FOR_EACH_BB_FN (bb, cfun)
42129 {
42130 rtx_insn *insn;
42131
42132 COPY_REG_SET (&live, DF_LR_OUT (bb));
42133 df_simulate_initialize_backwards (bb, &live);
42134
42135 FOR_BB_INSNS_REVERSE (bb, insn)
42136 {
42137 if (!NONDEBUG_INSN_P (insn))
42138 continue;
42139
42140 df_simulate_one_insn_backwards (bb, insn, &live);
42141
42142 if (GET_CODE (PATTERN (insn)) == USE
42143 || GET_CODE (PATTERN (insn)) == CLOBBER)
42144 continue;
42145
42146 extract_insn (insn);
42147 constrain_operands_cached (insn, reload_completed);
42148 int opno0, opno1;
42149 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42150 recog_data.n_operands, &opno0,
42151 &opno1);
42152 if (modrm < 0
42153 || !ix86_rop_should_change_byte_p (modrm)
42154 || opno0 == opno1)
42155 continue;
42156
42157 rtx oldreg = recog_data.operand[opno1];
42158 preprocess_constraints (insn);
42159 const operand_alternative *alt = which_op_alt ();
42160
42161 int i;
42162 for (i = 0; i < recog_data.n_operands; i++)
42163 if (i != opno1
42164 && alt[i].earlyclobber
42165 && reg_overlap_mentioned_p (recog_data.operand[i],
42166 oldreg))
42167 break;
42168
42169 if (i < recog_data.n_operands)
42170 continue;
42171
42172 if (dump_file)
42173 fprintf (dump_file,
42174 "attempting to fix modrm byte in insn %d:"
42175 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42176 reg_class_names[alt[opno1].cl]);
42177
42178 HARD_REG_SET unavailable;
42179 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42180 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42181 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42182 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42183 IOR_HARD_REG_SET (unavailable, output_risky);
42184 IOR_COMPL_HARD_REG_SET (unavailable,
42185 reg_class_contents[alt[opno1].cl]);
42186
42187 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42188 if (!TEST_HARD_REG_BIT (unavailable, i))
42189 break;
42190 if (i == FIRST_PSEUDO_REGISTER)
42191 {
42192 if (dump_file)
42193 fprintf (dump_file, ", none available\n");
42194 continue;
42195 }
42196 if (dump_file)
42197 fprintf (dump_file, " -> %d\n", i);
42198 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42199 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42200 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42201 }
42202 }
42203 }
42204
42205 /* Implement machine-specific optimizations. We implement padding of returns
42206 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
42207 static void
42208 ix86_reorg (void)
42209 {
42210 /* We are freeing block_for_insn in the toplev to keep compatibility
42211 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42212 compute_bb_for_insn ();
42213
42214 if (flag_mitigate_rop)
42215 ix86_mitigate_rop ();
42216
42217 if (TARGET_SEH && current_function_has_exception_handlers ())
42218 ix86_seh_fixup_eh_fallthru ();
42219
42220 if (optimize && optimize_function_for_speed_p (cfun))
42221 {
42222 if (TARGET_PAD_SHORT_FUNCTION)
42223 ix86_pad_short_function ();
42224 else if (TARGET_PAD_RETURNS)
42225 ix86_pad_returns ();
42226 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42227 if (TARGET_FOUR_JUMP_LIMIT)
42228 ix86_avoid_jump_mispredicts ();
42229 #endif
42230 }
42231 }
42232
42233 /* Return nonzero when a QImode register that must be represented via a REX
42234 prefix is used. */
42235 bool
42236 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42237 {
42238 int i;
42239 extract_insn_cached (insn);
42240 for (i = 0; i < recog_data.n_operands; i++)
42241 if (GENERAL_REG_P (recog_data.operand[i])
42242 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42243 return true;
42244 return false;
42245 }
42246
42247 /* Return true when INSN mentions a register that must be encoded using a REX
42248 prefix. */
42249 bool
42250 x86_extended_reg_mentioned_p (rtx insn)
42251 {
42252 subrtx_iterator::array_type array;
42253 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42254 {
42255 const_rtx x = *iter;
42256 if (REG_P (x)
42257 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42258 return true;
42259 }
42260 return false;
42261 }
42262
42263 /* If profitable, negate (without causing overflow) integer constant
42264 of mode MODE at location LOC. Return true in this case. */
42265 bool
42266 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42267 {
42268 HOST_WIDE_INT val;
42269
42270 if (!CONST_INT_P (*loc))
42271 return false;
42272
42273 switch (mode)
42274 {
42275 case DImode:
42276 /* DImode x86_64 constants must fit in 32 bits. */
42277 gcc_assert (x86_64_immediate_operand (*loc, mode));
42278
42279 mode = SImode;
42280 break;
42281
42282 case SImode:
42283 case HImode:
42284 case QImode:
42285 break;
42286
42287 default:
42288 gcc_unreachable ();
42289 }
42290
42291 /* Avoid overflows. */
42292 if (mode_signbit_p (mode, *loc))
42293 return false;
42294
42295 val = INTVAL (*loc);
42296
42297 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42298 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
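/* Editorial illustration: `addl $128, %eax' needs a 32-bit immediate,
   whereas the equivalent `subl $-128, %eax' fits in a sign-extended
   8-bit immediate; -128 itself is already the short form and is left
   alone.  */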
42299 if ((val < 0 && val != -128)
42300 || val == 128)
42301 {
42302 *loc = GEN_INT (-val);
42303 return true;
42304 }
42305
42306 return false;
42307 }
42308
42309 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42310 optabs would emit if we didn't have TFmode patterns. */
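/* Editorial sketch (not from the original source) of the expansion below,
   in pseudo-C for a DImode input converted to double:

     if ((long long) in >= 0)
       out = (double) (long long) in;                  // signed convert
     else
       {
         unsigned long long i0 = (in >> 1) | (in & 1); // halve, round to odd
         out = (double) (long long) i0;
         out = out + out;                              // undo the halving
       }
*/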
42311
42312 void
42313 x86_emit_floatuns (rtx operands[2])
42314 {
42315 rtx_code_label *neglab, *donelab;
42316 rtx i0, i1, f0, in, out;
42317 machine_mode mode, inmode;
42318
42319 inmode = GET_MODE (operands[1]);
42320 gcc_assert (inmode == SImode || inmode == DImode);
42321
42322 out = operands[0];
42323 in = force_reg (inmode, operands[1]);
42324 mode = GET_MODE (out);
42325 neglab = gen_label_rtx ();
42326 donelab = gen_label_rtx ();
42327 f0 = gen_reg_rtx (mode);
42328
42329 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42330
42331 expand_float (out, in, 0);
42332
42333 emit_jump_insn (gen_jump (donelab));
42334 emit_barrier ();
42335
42336 emit_label (neglab);
42337
42338 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42339 1, OPTAB_DIRECT);
42340 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42341 1, OPTAB_DIRECT);
42342 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42343
42344 expand_float (f0, i0, 0);
42345
42346 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42347
42348 emit_label (donelab);
42349 }
42350 \f
42351 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42352 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42353 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42354 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42355
42356 /* Get a vector mode of the same size as the original but with elements
42357 twice as wide. This is only guaranteed to apply to integral vectors. */
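/* For example (editorial note), V16QImode maps to V8HImode and V4SImode to
   V2DImode: the same total size, half as many elements, each twice as
   wide.  */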
42358
42359 static inline machine_mode
42360 get_mode_wider_vector (machine_mode o)
42361 {
42362 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42363 machine_mode n = GET_MODE_WIDER_MODE (o);
42364 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42365 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42366 return n;
42367 }
42368
42369 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42370 fill target with val via vec_duplicate. */
42371
42372 static bool
42373 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42374 {
42375 bool ok;
42376 rtx_insn *insn;
42377 rtx dup;
42378
42379 /* First attempt to recognize VAL as-is. */
42380 dup = gen_rtx_VEC_DUPLICATE (mode, val);
42381 insn = emit_insn (gen_rtx_SET (target, dup));
42382 if (recog_memoized (insn) < 0)
42383 {
42384 rtx_insn *seq;
42385 /* If that fails, force VAL into a register. */
42386
42387 start_sequence ();
42388 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
42389 seq = get_insns ();
42390 end_sequence ();
42391 if (seq)
42392 emit_insn_before (seq, insn);
42393
42394 ok = recog_memoized (insn) >= 0;
42395 gcc_assert (ok);
42396 }
42397 return true;
42398 }
42399
42400 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42401 with all elements equal to VAR. Return true if successful. */
42402
42403 static bool
42404 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42405 rtx target, rtx val)
42406 {
42407 bool ok;
42408
42409 switch (mode)
42410 {
42411 case V2SImode:
42412 case V2SFmode:
42413 if (!mmx_ok)
42414 return false;
42415 /* FALLTHRU */
42416
42417 case V4DFmode:
42418 case V4DImode:
42419 case V8SFmode:
42420 case V8SImode:
42421 case V2DFmode:
42422 case V2DImode:
42423 case V4SFmode:
42424 case V4SImode:
42425 case V16SImode:
42426 case V8DImode:
42427 case V16SFmode:
42428 case V8DFmode:
42429 return ix86_vector_duplicate_value (mode, target, val);
42430
42431 case V4HImode:
42432 if (!mmx_ok)
42433 return false;
42434 if (TARGET_SSE || TARGET_3DNOW_A)
42435 {
42436 rtx x;
42437
42438 val = gen_lowpart (SImode, val);
42439 x = gen_rtx_TRUNCATE (HImode, val);
42440 x = gen_rtx_VEC_DUPLICATE (mode, x);
42441 emit_insn (gen_rtx_SET (target, x));
42442 return true;
42443 }
42444 goto widen;
42445
42446 case V8QImode:
42447 if (!mmx_ok)
42448 return false;
42449 goto widen;
42450
42451 case V8HImode:
42452 if (TARGET_AVX2)
42453 return ix86_vector_duplicate_value (mode, target, val);
42454
42455 if (TARGET_SSE2)
42456 {
42457 struct expand_vec_perm_d dperm;
42458 rtx tmp1, tmp2;
42459
42460 permute:
42461 memset (&dperm, 0, sizeof (dperm));
42462 dperm.target = target;
42463 dperm.vmode = mode;
42464 dperm.nelt = GET_MODE_NUNITS (mode);
42465 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42466 dperm.one_operand_p = true;
42467
42468 /* Extend to SImode using a paradoxical SUBREG. */
42469 tmp1 = gen_reg_rtx (SImode);
42470 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42471
42472 /* Insert the SImode value as low element of a V4SImode vector. */
42473 tmp2 = gen_reg_rtx (V4SImode);
42474 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42475 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42476
42477 ok = (expand_vec_perm_1 (&dperm)
42478 || expand_vec_perm_broadcast_1 (&dperm));
42479 gcc_assert (ok);
42480 return ok;
42481 }
42482 goto widen;
42483
42484 case V16QImode:
42485 if (TARGET_AVX2)
42486 return ix86_vector_duplicate_value (mode, target, val);
42487
42488 if (TARGET_SSE2)
42489 goto permute;
42490 goto widen;
42491
42492 widen:
42493 /* Replicate the value once into the next wider mode and recurse. */
42494 {
42495 machine_mode smode, wsmode, wvmode;
42496 rtx x;
42497
42498 smode = GET_MODE_INNER (mode);
42499 wvmode = get_mode_wider_vector (mode);
42500 wsmode = GET_MODE_INNER (wvmode);
42501
42502 val = convert_modes (wsmode, smode, val, true);
42503 x = expand_simple_binop (wsmode, ASHIFT, val,
42504 GEN_INT (GET_MODE_BITSIZE (smode)),
42505 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42506 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42507
42508 x = gen_reg_rtx (wvmode);
42509 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42510 gcc_assert (ok);
42511 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42512 return ok;
42513 }
42514
42515 case V16HImode:
42516 case V32QImode:
42517 if (TARGET_AVX2)
42518 return ix86_vector_duplicate_value (mode, target, val);
42519 else
42520 {
42521 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42522 rtx x = gen_reg_rtx (hvmode);
42523
42524 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42525 gcc_assert (ok);
42526
42527 x = gen_rtx_VEC_CONCAT (mode, x, x);
42528 emit_insn (gen_rtx_SET (target, x));
42529 }
42530 return true;
42531
42532 case V64QImode:
42533 case V32HImode:
42534 if (TARGET_AVX512BW)
42535 return ix86_vector_duplicate_value (mode, target, val);
42536 else
42537 {
42538 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42539 rtx x = gen_reg_rtx (hvmode);
42540
42541 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42542 gcc_assert (ok);
42543
42544 x = gen_rtx_VEC_CONCAT (mode, x, x);
42545 emit_insn (gen_rtx_SET (target, x));
42546 }
42547 return true;
42548
42549 default:
42550 return false;
42551 }
42552 }
42553
42554 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42555 whose ONE_VAR element is VAR, and other elements are zero. Return true
42556 if successful. */
42557
42558 static bool
42559 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42560 rtx target, rtx var, int one_var)
42561 {
42562 machine_mode vsimode;
42563 rtx new_target;
42564 rtx x, tmp;
42565 bool use_vector_set = false;
42566
42567 switch (mode)
42568 {
42569 case V2DImode:
42570 /* For SSE4.1, we normally use vector set. But if the second
42571 element is zero and inter-unit moves are OK, we use movq
42572 instead. */
42573 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42574 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42575 && one_var == 0));
42576 break;
42577 case V16QImode:
42578 case V4SImode:
42579 case V4SFmode:
42580 use_vector_set = TARGET_SSE4_1;
42581 break;
42582 case V8HImode:
42583 use_vector_set = TARGET_SSE2;
42584 break;
42585 case V4HImode:
42586 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42587 break;
42588 case V32QImode:
42589 case V16HImode:
42590 case V8SImode:
42591 case V8SFmode:
42592 case V4DFmode:
42593 use_vector_set = TARGET_AVX;
42594 break;
42595 case V4DImode:
42596 /* Use ix86_expand_vector_set in 64bit mode only. */
42597 use_vector_set = TARGET_AVX && TARGET_64BIT;
42598 break;
42599 default:
42600 break;
42601 }
42602
42603 if (use_vector_set)
42604 {
42605 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42606 var = force_reg (GET_MODE_INNER (mode), var);
42607 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42608 return true;
42609 }
42610
42611 switch (mode)
42612 {
42613 case V2SFmode:
42614 case V2SImode:
42615 if (!mmx_ok)
42616 return false;
42617 /* FALLTHRU */
42618
42619 case V2DFmode:
42620 case V2DImode:
42621 if (one_var != 0)
42622 return false;
42623 var = force_reg (GET_MODE_INNER (mode), var);
42624 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42625 emit_insn (gen_rtx_SET (target, x));
42626 return true;
42627
42628 case V4SFmode:
42629 case V4SImode:
42630 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42631 new_target = gen_reg_rtx (mode);
42632 else
42633 new_target = target;
42634 var = force_reg (GET_MODE_INNER (mode), var);
42635 x = gen_rtx_VEC_DUPLICATE (mode, var);
42636 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42637 emit_insn (gen_rtx_SET (new_target, x));
42638 if (one_var != 0)
42639 {
42640 /* We need to shuffle the value to the correct position, so
42641 create a new pseudo to store the intermediate result. */
42642
42643 /* With SSE2, we can use the integer shuffle insns. */
42644 if (mode != V4SFmode && TARGET_SSE2)
42645 {
42646 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42647 const1_rtx,
42648 GEN_INT (one_var == 1 ? 0 : 1),
42649 GEN_INT (one_var == 2 ? 0 : 1),
42650 GEN_INT (one_var == 3 ? 0 : 1)));
42651 if (target != new_target)
42652 emit_move_insn (target, new_target);
42653 return true;
42654 }
42655
42656 /* Otherwise convert the intermediate result to V4SFmode and
42657 use the SSE1 shuffle instructions. */
42658 if (mode != V4SFmode)
42659 {
42660 tmp = gen_reg_rtx (V4SFmode);
42661 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42662 }
42663 else
42664 tmp = new_target;
42665
42666 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42667 const1_rtx,
42668 GEN_INT (one_var == 1 ? 0 : 1),
42669 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42670 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42671
42672 if (mode != V4SFmode)
42673 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42674 else if (tmp != target)
42675 emit_move_insn (target, tmp);
42676 }
42677 else if (target != new_target)
42678 emit_move_insn (target, new_target);
42679 return true;
42680
42681 case V8HImode:
42682 case V16QImode:
42683 vsimode = V4SImode;
42684 goto widen;
42685 case V4HImode:
42686 case V8QImode:
42687 if (!mmx_ok)
42688 return false;
42689 vsimode = V2SImode;
42690 goto widen;
42691 widen:
42692 if (one_var != 0)
42693 return false;
42694
42695 /* Zero extend the variable element to SImode and recurse. */
42696 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42697
42698 x = gen_reg_rtx (vsimode);
42699 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42700 var, one_var))
42701 gcc_unreachable ();
42702
42703 emit_move_insn (target, gen_lowpart (mode, x));
42704 return true;
42705
42706 default:
42707 return false;
42708 }
42709 }
42710
42711 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42712 consisting of the values in VALS. It is known that all elements
42713 except ONE_VAR are constants. Return true if successful. */
42714
42715 static bool
42716 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42717 rtx target, rtx vals, int one_var)
42718 {
42719 rtx var = XVECEXP (vals, 0, one_var);
42720 machine_mode wmode;
42721 rtx const_vec, x;
42722
42723 const_vec = copy_rtx (vals);
42724 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42725 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42726
42727 switch (mode)
42728 {
42729 case V2DFmode:
42730 case V2DImode:
42731 case V2SFmode:
42732 case V2SImode:
42733 /* For the two element vectors, it's just as easy to use
42734 the general case. */
42735 return false;
42736
42737 case V4DImode:
42738 /* Use ix86_expand_vector_set in 64bit mode only. */
42739 if (!TARGET_64BIT)
42740 return false;
42741 /* FALLTHRU */
42742 case V4DFmode:
42743 case V8SFmode:
42744 case V8SImode:
42745 case V16HImode:
42746 case V32QImode:
42747 case V4SFmode:
42748 case V4SImode:
42749 case V8HImode:
42750 case V4HImode:
42751 break;
42752
42753 case V16QImode:
42754 if (TARGET_SSE4_1)
42755 break;
42756 wmode = V8HImode;
42757 goto widen;
42758 case V8QImode:
42759 wmode = V4HImode;
42760 goto widen;
42761 widen:
42762 /* There's no way to set one QImode entry easily. Combine
42763 the variable value with its adjacent constant value, and
42764 promote to an HImode set. */
42765 x = XVECEXP (vals, 0, one_var ^ 1);
42766 if (one_var & 1)
42767 {
42768 var = convert_modes (HImode, QImode, var, true);
42769 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42770 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42771 x = GEN_INT (INTVAL (x) & 0xff);
42772 }
42773 else
42774 {
42775 var = convert_modes (HImode, QImode, var, true);
42776 x = gen_int_mode (INTVAL (x) << 8, HImode);
42777 }
42778 if (x != const0_rtx)
42779 var = expand_simple_binop (HImode, IOR, var, x, var,
42780 1, OPTAB_LIB_WIDEN);
42781
42782 x = gen_reg_rtx (wmode);
42783 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42784 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42785
42786 emit_move_insn (target, gen_lowpart (mode, x));
42787 return true;
42788
42789 default:
42790 return false;
42791 }
42792
42793 emit_move_insn (target, const_vec);
42794 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42795 return true;
42796 }
42797
42798 /* A subroutine of ix86_expand_vector_init_general. Use vector
42799 concatenate to handle the most general case: all values variable,
42800 and none identical. */
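/* Editorial illustration of the recursion below: for n == 8 inputs forming a
   V8SFmode vector, the scalars are first paired into four V2SFmode values,
   those into two V4SFmode halves, and finally the two halves are
   concatenated into the V8SFmode target.  */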
42801
42802 static void
42803 ix86_expand_vector_init_concat (machine_mode mode,
42804 rtx target, rtx *ops, int n)
42805 {
42806 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42807 rtx first[16], second[8], third[4];
42808 rtvec v;
42809 int i, j;
42810
42811 switch (n)
42812 {
42813 case 2:
42814 switch (mode)
42815 {
42816 case V16SImode:
42817 cmode = V8SImode;
42818 break;
42819 case V16SFmode:
42820 cmode = V8SFmode;
42821 break;
42822 case V8DImode:
42823 cmode = V4DImode;
42824 break;
42825 case V8DFmode:
42826 cmode = V4DFmode;
42827 break;
42828 case V8SImode:
42829 cmode = V4SImode;
42830 break;
42831 case V8SFmode:
42832 cmode = V4SFmode;
42833 break;
42834 case V4DImode:
42835 cmode = V2DImode;
42836 break;
42837 case V4DFmode:
42838 cmode = V2DFmode;
42839 break;
42840 case V4SImode:
42841 cmode = V2SImode;
42842 break;
42843 case V4SFmode:
42844 cmode = V2SFmode;
42845 break;
42846 case V2DImode:
42847 cmode = DImode;
42848 break;
42849 case V2SImode:
42850 cmode = SImode;
42851 break;
42852 case V2DFmode:
42853 cmode = DFmode;
42854 break;
42855 case V2SFmode:
42856 cmode = SFmode;
42857 break;
42858 default:
42859 gcc_unreachable ();
42860 }
42861
42862 if (!register_operand (ops[1], cmode))
42863 ops[1] = force_reg (cmode, ops[1]);
42864 if (!register_operand (ops[0], cmode))
42865 ops[0] = force_reg (cmode, ops[0]);
42866 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42867 ops[1])));
42868 break;
42869
42870 case 4:
42871 switch (mode)
42872 {
42873 case V4DImode:
42874 cmode = V2DImode;
42875 break;
42876 case V4DFmode:
42877 cmode = V2DFmode;
42878 break;
42879 case V4SImode:
42880 cmode = V2SImode;
42881 break;
42882 case V4SFmode:
42883 cmode = V2SFmode;
42884 break;
42885 default:
42886 gcc_unreachable ();
42887 }
42888 goto half;
42889
42890 case 8:
42891 switch (mode)
42892 {
42893 case V8DImode:
42894 cmode = V2DImode;
42895 hmode = V4DImode;
42896 break;
42897 case V8DFmode:
42898 cmode = V2DFmode;
42899 hmode = V4DFmode;
42900 break;
42901 case V8SImode:
42902 cmode = V2SImode;
42903 hmode = V4SImode;
42904 break;
42905 case V8SFmode:
42906 cmode = V2SFmode;
42907 hmode = V4SFmode;
42908 break;
42909 default:
42910 gcc_unreachable ();
42911 }
42912 goto half;
42913
42914 case 16:
42915 switch (mode)
42916 {
42917 case V16SImode:
42918 cmode = V2SImode;
42919 hmode = V4SImode;
42920 gmode = V8SImode;
42921 break;
42922 case V16SFmode:
42923 cmode = V2SFmode;
42924 hmode = V4SFmode;
42925 gmode = V8SFmode;
42926 break;
42927 default:
42928 gcc_unreachable ();
42929 }
42930 goto half;
42931
42932 half:
42933 /* FIXME: We process inputs backward to help RA. PR 36222. */
42934 i = n - 1;
42935 j = (n >> 1) - 1;
42936 for (; i > 0; i -= 2, j--)
42937 {
42938 first[j] = gen_reg_rtx (cmode);
42939 v = gen_rtvec (2, ops[i - 1], ops[i]);
42940 ix86_expand_vector_init (false, first[j],
42941 gen_rtx_PARALLEL (cmode, v));
42942 }
42943
42944 n >>= 1;
42945 if (n > 4)
42946 {
42947 gcc_assert (hmode != VOIDmode);
42948 gcc_assert (gmode != VOIDmode);
42949 for (i = j = 0; i < n; i += 2, j++)
42950 {
42951 second[j] = gen_reg_rtx (hmode);
42952 ix86_expand_vector_init_concat (hmode, second [j],
42953 &first [i], 2);
42954 }
42955 n >>= 1;
42956 for (i = j = 0; i < n; i += 2, j++)
42957 {
42958 third[j] = gen_reg_rtx (gmode);
42959 ix86_expand_vector_init_concat (gmode, third[j],
42960 &second[i], 2);
42961 }
42962 n >>= 1;
42963 ix86_expand_vector_init_concat (mode, target, third, n);
42964 }
42965 else if (n > 2)
42966 {
42967 gcc_assert (hmode != VOIDmode);
42968 for (i = j = 0; i < n; i += 2, j++)
42969 {
42970 second[j] = gen_reg_rtx (hmode);
42971 ix86_expand_vector_init_concat (hmode, second [j],
42972 &first [i], 2);
42973 }
42974 n >>= 1;
42975 ix86_expand_vector_init_concat (mode, target, second, n);
42976 }
42977 else
42978 ix86_expand_vector_init_concat (mode, target, first, n);
42979 break;
42980
42981 default:
42982 gcc_unreachable ();
42983 }
42984 }
42985
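/* Rough model of the recursion above, for illustration only: each round
   pairs up adjacent partial vectors with VEC_CONCAT, so an N-element
   initializer needs log2(N) rounds (the first[], second[] and third[]
   arrays hold the intermediate results, and the inputs are walked
   backwards to help the register allocator, see PR 36222).  The helper
   is hypothetical and not used anywhere.  */

static int
concat_rounds_model (int n)
{
  int rounds = 0;
  while (n > 1)
    {
      n >>= 1;		/* one round of pairwise VEC_CONCAT */
      rounds++;
    }
  return rounds;	/* e.g. 4 rounds for V16SImode/V16SFmode */
}
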
42986 /* A subroutine of ix86_expand_vector_init_general. Use vector
42987 interleave to handle the most general case: all values variable,
42988 and none identical. */
42989
42990 static void
42991 ix86_expand_vector_init_interleave (machine_mode mode,
42992 rtx target, rtx *ops, int n)
42993 {
42994 machine_mode first_imode, second_imode, third_imode, inner_mode;
42995 int i, j;
42996 rtx op0, op1;
42997 rtx (*gen_load_even) (rtx, rtx, rtx);
42998 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42999 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43000
43001 switch (mode)
43002 {
43003 case V8HImode:
43004 gen_load_even = gen_vec_setv8hi;
43005 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43006 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43007 inner_mode = HImode;
43008 first_imode = V4SImode;
43009 second_imode = V2DImode;
43010 third_imode = VOIDmode;
43011 break;
43012 case V16QImode:
43013 gen_load_even = gen_vec_setv16qi;
43014 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43015 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43016 inner_mode = QImode;
43017 first_imode = V8HImode;
43018 second_imode = V4SImode;
43019 third_imode = V2DImode;
43020 break;
43021 default:
43022 gcc_unreachable ();
43023 }
43024
43025 for (i = 0; i < n; i++)
43026 {
43027 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43028 op0 = gen_reg_rtx (SImode);
43029 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43030
43031 /* Insert the SImode value as low element of V4SImode vector. */
43032 op1 = gen_reg_rtx (V4SImode);
43033 op0 = gen_rtx_VEC_MERGE (V4SImode,
43034 gen_rtx_VEC_DUPLICATE (V4SImode,
43035 op0),
43036 CONST0_RTX (V4SImode),
43037 const1_rtx);
43038 emit_insn (gen_rtx_SET (op1, op0));
43039
43040 /* Cast the V4SImode vector back to a vector in the original mode. */
43041 op0 = gen_reg_rtx (mode);
43042 emit_move_insn (op0, gen_lowpart (mode, op1));
43043
43044 /* Load even elements into the second position. */
43045 emit_insn (gen_load_even (op0,
43046 force_reg (inner_mode,
43047 ops [i + i + 1]),
43048 const1_rtx));
43049
43050 /* Cast vector to FIRST_IMODE vector. */
43051 ops[i] = gen_reg_rtx (first_imode);
43052 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43053 }
43054
43055 /* Interleave low FIRST_IMODE vectors. */
43056 for (i = j = 0; i < n; i += 2, j++)
43057 {
43058 op0 = gen_reg_rtx (first_imode);
43059 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43060
43061 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43062 ops[j] = gen_reg_rtx (second_imode);
43063 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43064 }
43065
43066 /* Interleave low SECOND_IMODE vectors. */
43067 switch (second_imode)
43068 {
43069 case V4SImode:
43070 for (i = j = 0; i < n / 2; i += 2, j++)
43071 {
43072 op0 = gen_reg_rtx (second_imode);
43073 emit_insn (gen_interleave_second_low (op0, ops[i],
43074 ops[i + 1]));
43075
43076 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43077 vector. */
43078 ops[j] = gen_reg_rtx (third_imode);
43079 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43080 }
43081 second_imode = V2DImode;
43082 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43083 /* FALLTHRU */
43084
43085 case V2DImode:
43086 op0 = gen_reg_rtx (second_imode);
43087 emit_insn (gen_interleave_second_low (op0, ops[0],
43088 ops[1]));
43089
43090 /* Cast the SECOND_IMODE vector back to a vector in the original
43091 mode. */
43092 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43093 break;
43094
43095 default:
43096 gcc_unreachable ();
43097 }
43098 }
43099
43100 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43101 all values variable, and none identical. */
43102
43103 static void
43104 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43105 rtx target, rtx vals)
43106 {
43107 rtx ops[64], op0, op1, op2, op3, op4, op5;
43108 machine_mode half_mode = VOIDmode;
43109 machine_mode quarter_mode = VOIDmode;
43110 int n, i;
43111
43112 switch (mode)
43113 {
43114 case V2SFmode:
43115 case V2SImode:
43116 if (!mmx_ok && !TARGET_SSE)
43117 break;
43118 /* FALLTHRU */
43119
43120 case V16SImode:
43121 case V16SFmode:
43122 case V8DFmode:
43123 case V8DImode:
43124 case V8SFmode:
43125 case V8SImode:
43126 case V4DFmode:
43127 case V4DImode:
43128 case V4SFmode:
43129 case V4SImode:
43130 case V2DFmode:
43131 case V2DImode:
43132 n = GET_MODE_NUNITS (mode);
43133 for (i = 0; i < n; i++)
43134 ops[i] = XVECEXP (vals, 0, i);
43135 ix86_expand_vector_init_concat (mode, target, ops, n);
43136 return;
43137
43138 case V32QImode:
43139 half_mode = V16QImode;
43140 goto half;
43141
43142 case V16HImode:
43143 half_mode = V8HImode;
43144 goto half;
43145
43146 half:
43147 n = GET_MODE_NUNITS (mode);
43148 for (i = 0; i < n; i++)
43149 ops[i] = XVECEXP (vals, 0, i);
43150 op0 = gen_reg_rtx (half_mode);
43151 op1 = gen_reg_rtx (half_mode);
43152 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43153 n >> 2);
43154 ix86_expand_vector_init_interleave (half_mode, op1,
43155 &ops [n >> 1], n >> 2);
43156 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43157 return;
43158
43159 case V64QImode:
43160 quarter_mode = V16QImode;
43161 half_mode = V32QImode;
43162 goto quarter;
43163
43164 case V32HImode:
43165 quarter_mode = V8HImode;
43166 half_mode = V16HImode;
43167 goto quarter;
43168
43169 quarter:
43170 n = GET_MODE_NUNITS (mode);
43171 for (i = 0; i < n; i++)
43172 ops[i] = XVECEXP (vals, 0, i);
43173 op0 = gen_reg_rtx (quarter_mode);
43174 op1 = gen_reg_rtx (quarter_mode);
43175 op2 = gen_reg_rtx (quarter_mode);
43176 op3 = gen_reg_rtx (quarter_mode);
43177 op4 = gen_reg_rtx (half_mode);
43178 op5 = gen_reg_rtx (half_mode);
43179 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43180 n >> 3);
43181 ix86_expand_vector_init_interleave (quarter_mode, op1,
43182 &ops [n >> 2], n >> 3);
43183 ix86_expand_vector_init_interleave (quarter_mode, op2,
43184 &ops [n >> 1], n >> 3);
43185 ix86_expand_vector_init_interleave (quarter_mode, op3,
43186 &ops [(n >> 1) | (n >> 2)], n >> 3);
43187 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43188 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43189 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43190 return;
43191
43192 case V16QImode:
43193 if (!TARGET_SSE4_1)
43194 break;
43195 /* FALLTHRU */
43196
43197 case V8HImode:
43198 if (!TARGET_SSE2)
43199 break;
43200
43201 /* Don't use ix86_expand_vector_init_interleave if we can't
43202 move from GPR to SSE register directly. */
43203 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43204 break;
43205
43206 n = GET_MODE_NUNITS (mode);
43207 for (i = 0; i < n; i++)
43208 ops[i] = XVECEXP (vals, 0, i);
43209 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43210 return;
43211
43212 case V4HImode:
43213 case V8QImode:
43214 break;
43215
43216 default:
43217 gcc_unreachable ();
43218 }
43219
43220 {
43221 int i, j, n_elts, n_words, n_elt_per_word;
43222 machine_mode inner_mode;
43223 rtx words[4], shift;
43224
43225 inner_mode = GET_MODE_INNER (mode);
43226 n_elts = GET_MODE_NUNITS (mode);
43227 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43228 n_elt_per_word = n_elts / n_words;
43229 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43230
43231 for (i = 0; i < n_words; ++i)
43232 {
43233 rtx word = NULL_RTX;
43234
43235 for (j = 0; j < n_elt_per_word; ++j)
43236 {
43237 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43238 elt = convert_modes (word_mode, inner_mode, elt, true);
43239
43240 if (j == 0)
43241 word = elt;
43242 else
43243 {
43244 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43245 word, 1, OPTAB_LIB_WIDEN);
43246 word = expand_simple_binop (word_mode, IOR, word, elt,
43247 word, 1, OPTAB_LIB_WIDEN);
43248 }
43249 }
43250
43251 words[i] = word;
43252 }
43253
43254 if (n_words == 1)
43255 emit_move_insn (target, gen_lowpart (mode, words[0]));
43256 else if (n_words == 2)
43257 {
43258 rtx tmp = gen_reg_rtx (mode);
43259 emit_clobber (tmp);
43260 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43261 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43262 emit_move_insn (target, tmp);
43263 }
43264 else if (n_words == 4)
43265 {
43266 rtx tmp = gen_reg_rtx (V4SImode);
43267 gcc_assert (word_mode == SImode);
43268 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43269 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43270 emit_move_insn (target, gen_lowpart (mode, tmp));
43271 }
43272 else
43273 gcc_unreachable ();
43274 }
43275 }
43276
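/* A minimal C model of the shift-and-IOR fallback at the end of the
   function above, for illustration only: the elements of one word-sized
   chunk are combined from the highest index down, so element 0 of the
   chunk ends up in the least significant bits.  The helper and its
   types are hypothetical; "unsigned long" stands in for word_mode and
   16 for GET_MODE_BITSIZE (inner_mode).  */

static unsigned long
pack_word_model (const unsigned short *elts, int n_elt_per_word)
{
  unsigned long word = 0;
  for (int j = 0; j < n_elt_per_word; j++)
    {
      /* Like the inner loop above: shift the partial word up and OR in
         the next lower-index element of the chunk.  */
      word <<= 16;
      word |= elts[n_elt_per_word - j - 1];
    }
  return word;
}
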
43277 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43278 instructions unless MMX_OK is true. */
43279
43280 void
43281 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43282 {
43283 machine_mode mode = GET_MODE (target);
43284 machine_mode inner_mode = GET_MODE_INNER (mode);
43285 int n_elts = GET_MODE_NUNITS (mode);
43286 int n_var = 0, one_var = -1;
43287 bool all_same = true, all_const_zero = true;
43288 int i;
43289 rtx x;
43290
43291 for (i = 0; i < n_elts; ++i)
43292 {
43293 x = XVECEXP (vals, 0, i);
43294 if (!(CONST_SCALAR_INT_P (x)
43295 || CONST_DOUBLE_P (x)
43296 || CONST_FIXED_P (x)))
43297 n_var++, one_var = i;
43298 else if (x != CONST0_RTX (inner_mode))
43299 all_const_zero = false;
43300 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43301 all_same = false;
43302 }
43303
43304 /* Constants are best loaded from the constant pool. */
43305 if (n_var == 0)
43306 {
43307 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43308 return;
43309 }
43310
43311 /* If all values are identical, broadcast the value. */
43312 if (all_same
43313 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43314 XVECEXP (vals, 0, 0)))
43315 return;
43316
43317 /* Values where only one field is non-constant are best loaded from
43318 the pool and overwritten via move later. */
43319 if (n_var == 1)
43320 {
43321 if (all_const_zero
43322 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43323 XVECEXP (vals, 0, one_var),
43324 one_var))
43325 return;
43326
43327 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43328 return;
43329 }
43330
43331 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43332 }
43333
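/* Illustrative GNU C initializers showing which of the paths above each
   one is expected to take.  The type and function names are
   hypothetical and exist only as an example.  */

typedef int v4si_model __attribute__ ((vector_size (16)));

static v4si_model
vector_init_paths_model (int x, int y, int z, int w)
{
  v4si_model a = { 1, 2, 3, 4 };	/* n_var == 0: constant pool load.  */
  v4si_model b = { x, x, x, x };	/* all_same: broadcast via _init_duplicate.  */
  v4si_model c = { x, 0, 0, 0 };	/* one variable, rest zero: _init_one_nonzero.  */
  v4si_model d = { x, 1, 2, 3 };	/* one variable element: _init_one_var.  */
  v4si_model e = { x, y, z, w };	/* all variable: _init_general.  */
  return a + b + c + d + e;
}
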
43334 void
43335 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43336 {
43337 machine_mode mode = GET_MODE (target);
43338 machine_mode inner_mode = GET_MODE_INNER (mode);
43339 machine_mode half_mode;
43340 bool use_vec_merge = false;
43341 rtx tmp;
43342 static rtx (*gen_extract[6][2]) (rtx, rtx)
43343 = {
43344 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43345 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43346 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43347 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43348 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43349 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43350 };
43351 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43352 = {
43353 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43354 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43355 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43356 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43357 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43358 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43359 };
43360 int i, j, n;
43361 machine_mode mmode = VOIDmode;
43362 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43363
43364 switch (mode)
43365 {
43366 case V2SFmode:
43367 case V2SImode:
43368 if (mmx_ok)
43369 {
43370 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43371 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43372 if (elt == 0)
43373 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43374 else
43375 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43376 emit_insn (gen_rtx_SET (target, tmp));
43377 return;
43378 }
43379 break;
43380
43381 case V2DImode:
43382 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43383 if (use_vec_merge)
43384 break;
43385
43386 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43387 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43388 if (elt == 0)
43389 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43390 else
43391 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43392 emit_insn (gen_rtx_SET (target, tmp));
43393 return;
43394
43395 case V2DFmode:
43396 {
43397 rtx op0, op1;
43398
43399 /* For the two element vectors, we implement a VEC_CONCAT with
43400 the extraction of the other element. */
43401
43402 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43403 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43404
43405 if (elt == 0)
43406 op0 = val, op1 = tmp;
43407 else
43408 op0 = tmp, op1 = val;
43409
43410 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43411 emit_insn (gen_rtx_SET (target, tmp));
43412 }
43413 return;
43414
43415 case V4SFmode:
43416 use_vec_merge = TARGET_SSE4_1;
43417 if (use_vec_merge)
43418 break;
43419
43420 switch (elt)
43421 {
43422 case 0:
43423 use_vec_merge = true;
43424 break;
43425
43426 case 1:
43427 /* tmp = target = A B C D */
43428 tmp = copy_to_reg (target);
43429 /* target = A A B B */
43430 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43431 /* target = X A B B */
43432 ix86_expand_vector_set (false, target, val, 0);
43433 /* target = A X C D */
43434 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43435 const1_rtx, const0_rtx,
43436 GEN_INT (2+4), GEN_INT (3+4)));
43437 return;
43438
43439 case 2:
43440 /* tmp = target = A B C D */
43441 tmp = copy_to_reg (target);
43442 /* tmp = X B C D */
43443 ix86_expand_vector_set (false, tmp, val, 0);
43444 /* target = A B X D */
43445 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43446 const0_rtx, const1_rtx,
43447 GEN_INT (0+4), GEN_INT (3+4)));
43448 return;
43449
43450 case 3:
43451 /* tmp = target = A B C D */
43452 tmp = copy_to_reg (target);
43453 /* tmp = X B C D */
43454 ix86_expand_vector_set (false, tmp, val, 0);
43455 /* target = A B C X */
43456 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43457 const0_rtx, const1_rtx,
43458 GEN_INT (2+4), GEN_INT (0+4)));
43459 return;
43460
43461 default:
43462 gcc_unreachable ();
43463 }
43464 break;
43465
43466 case V4SImode:
43467 use_vec_merge = TARGET_SSE4_1;
43468 if (use_vec_merge)
43469 break;
43470
43471 /* Element 0 handled by vec_merge below. */
43472 if (elt == 0)
43473 {
43474 use_vec_merge = true;
43475 break;
43476 }
43477
43478 if (TARGET_SSE2)
43479 {
43480 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43481 store into element 0, then shuffle them back. */
43482
43483 rtx order[4];
43484
43485 order[0] = GEN_INT (elt);
43486 order[1] = const1_rtx;
43487 order[2] = const2_rtx;
43488 order[3] = GEN_INT (3);
43489 order[elt] = const0_rtx;
43490
43491 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43492 order[1], order[2], order[3]));
43493
43494 ix86_expand_vector_set (false, target, val, 0);
43495
43496 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43497 order[1], order[2], order[3]));
43498 }
43499 else
43500 {
43501 /* For SSE1, we have to reuse the V4SF code. */
43502 rtx t = gen_reg_rtx (V4SFmode);
43503 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43504 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43505 emit_move_insn (target, gen_lowpart (mode, t));
43506 }
43507 return;
43508
43509 case V8HImode:
43510 use_vec_merge = TARGET_SSE2;
43511 break;
43512 case V4HImode:
43513 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43514 break;
43515
43516 case V16QImode:
43517 use_vec_merge = TARGET_SSE4_1;
43518 break;
43519
43520 case V8QImode:
43521 break;
43522
43523 case V32QImode:
43524 half_mode = V16QImode;
43525 j = 0;
43526 n = 16;
43527 goto half;
43528
43529 case V16HImode:
43530 half_mode = V8HImode;
43531 j = 1;
43532 n = 8;
43533 goto half;
43534
43535 case V8SImode:
43536 half_mode = V4SImode;
43537 j = 2;
43538 n = 4;
43539 goto half;
43540
43541 case V4DImode:
43542 half_mode = V2DImode;
43543 j = 3;
43544 n = 2;
43545 goto half;
43546
43547 case V8SFmode:
43548 half_mode = V4SFmode;
43549 j = 4;
43550 n = 4;
43551 goto half;
43552
43553 case V4DFmode:
43554 half_mode = V2DFmode;
43555 j = 5;
43556 n = 2;
43557 goto half;
43558
43559 half:
43560 /* Compute offset. */
43561 i = elt / n;
43562 elt %= n;
43563
43564 gcc_assert (i <= 1);
43565
43566 /* Extract the half. */
43567 tmp = gen_reg_rtx (half_mode);
43568 emit_insn (gen_extract[j][i] (tmp, target));
43569
43570 /* Put val in tmp at elt. */
43571 ix86_expand_vector_set (false, tmp, val, elt);
43572
43573 /* Put it back. */
43574 emit_insn (gen_insert[j][i] (target, target, tmp));
43575 return;
43576
43577 case V8DFmode:
43578 if (TARGET_AVX512F)
43579 {
43580 mmode = QImode;
43581 gen_blendm = gen_avx512f_blendmv8df;
43582 }
43583 break;
43584
43585 case V8DImode:
43586 if (TARGET_AVX512F)
43587 {
43588 mmode = QImode;
43589 gen_blendm = gen_avx512f_blendmv8di;
43590 }
43591 break;
43592
43593 case V16SFmode:
43594 if (TARGET_AVX512F)
43595 {
43596 mmode = HImode;
43597 gen_blendm = gen_avx512f_blendmv16sf;
43598 }
43599 break;
43600
43601 case V16SImode:
43602 if (TARGET_AVX512F)
43603 {
43604 mmode = HImode;
43605 gen_blendm = gen_avx512f_blendmv16si;
43606 }
43607 break;
43608
43609 case V32HImode:
43610 if (TARGET_AVX512F && TARGET_AVX512BW)
43611 {
43612 mmode = SImode;
43613 gen_blendm = gen_avx512bw_blendmv32hi;
43614 }
43615 break;
43616
43617 case V64QImode:
43618 if (TARGET_AVX512F && TARGET_AVX512BW)
43619 {
43620 mmode = DImode;
43621 gen_blendm = gen_avx512bw_blendmv64qi;
43622 }
43623 break;
43624
43625 default:
43626 break;
43627 }
43628
43629 if (mmode != VOIDmode)
43630 {
43631 tmp = gen_reg_rtx (mode);
43632 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43633 /* The avx512*_blendm<mode> expanders have different operand order
43634 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43635 elements where the mask is set and second input operand otherwise,
43636 in {sse,avx}*_*blend* the first input operand is used for elements
43637 where the mask is clear and second input operand otherwise. */
43638 emit_insn (gen_blendm (target, target, tmp,
43639 force_reg (mmode,
43640 gen_int_mode (1 << elt, mmode))));
43641 }
43642 else if (use_vec_merge)
43643 {
43644 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43645 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43646 emit_insn (gen_rtx_SET (target, tmp));
43647 }
43648 else
43649 {
43650 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43651
43652 emit_move_insn (mem, target);
43653
43654 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43655 emit_move_insn (tmp, val);
43656
43657 emit_move_insn (target, mem);
43658 }
43659 }
43660
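/* A small model of the stack-temporary fallback at the end of the
   function above, for illustration only: spill the vector to memory,
   overwrite the ELT'th element there, and reload the whole vector.
   The helper is hypothetical; the real code uses assign_stack_temp and
   adjust_address on MODE-sized storage.  */

static void
vector_set_mem_model (unsigned char *spilled_vec, int elt,
                      const unsigned char *val, int elt_size)
{
  __builtin_memcpy (spilled_vec + elt * elt_size, val, elt_size);
  /* The caller then reloads the vector register from spilled_vec.  */
}
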
43661 void
43662 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43663 {
43664 machine_mode mode = GET_MODE (vec);
43665 machine_mode inner_mode = GET_MODE_INNER (mode);
43666 bool use_vec_extr = false;
43667 rtx tmp;
43668
43669 switch (mode)
43670 {
43671 case V2SImode:
43672 case V2SFmode:
43673 if (!mmx_ok)
43674 break;
43675 /* FALLTHRU */
43676
43677 case V2DFmode:
43678 case V2DImode:
43679 use_vec_extr = true;
43680 break;
43681
43682 case V4SFmode:
43683 use_vec_extr = TARGET_SSE4_1;
43684 if (use_vec_extr)
43685 break;
43686
43687 switch (elt)
43688 {
43689 case 0:
43690 tmp = vec;
43691 break;
43692
43693 case 1:
43694 case 3:
43695 tmp = gen_reg_rtx (mode);
43696 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43697 GEN_INT (elt), GEN_INT (elt),
43698 GEN_INT (elt+4), GEN_INT (elt+4)));
43699 break;
43700
43701 case 2:
43702 tmp = gen_reg_rtx (mode);
43703 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43704 break;
43705
43706 default:
43707 gcc_unreachable ();
43708 }
43709 vec = tmp;
43710 use_vec_extr = true;
43711 elt = 0;
43712 break;
43713
43714 case V4SImode:
43715 use_vec_extr = TARGET_SSE4_1;
43716 if (use_vec_extr)
43717 break;
43718
43719 if (TARGET_SSE2)
43720 {
43721 switch (elt)
43722 {
43723 case 0:
43724 tmp = vec;
43725 break;
43726
43727 case 1:
43728 case 3:
43729 tmp = gen_reg_rtx (mode);
43730 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43731 GEN_INT (elt), GEN_INT (elt),
43732 GEN_INT (elt), GEN_INT (elt)));
43733 break;
43734
43735 case 2:
43736 tmp = gen_reg_rtx (mode);
43737 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43738 break;
43739
43740 default:
43741 gcc_unreachable ();
43742 }
43743 vec = tmp;
43744 use_vec_extr = true;
43745 elt = 0;
43746 }
43747 else
43748 {
43749 /* For SSE1, we have to reuse the V4SF code. */
43750 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43751 gen_lowpart (V4SFmode, vec), elt);
43752 return;
43753 }
43754 break;
43755
43756 case V8HImode:
43757 use_vec_extr = TARGET_SSE2;
43758 break;
43759 case V4HImode:
43760 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43761 break;
43762
43763 case V16QImode:
43764 use_vec_extr = TARGET_SSE4_1;
43765 break;
43766
43767 case V8SFmode:
43768 if (TARGET_AVX)
43769 {
43770 tmp = gen_reg_rtx (V4SFmode);
43771 if (elt < 4)
43772 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43773 else
43774 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43775 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43776 return;
43777 }
43778 break;
43779
43780 case V4DFmode:
43781 if (TARGET_AVX)
43782 {
43783 tmp = gen_reg_rtx (V2DFmode);
43784 if (elt < 2)
43785 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43786 else
43787 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43788 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43789 return;
43790 }
43791 break;
43792
43793 case V32QImode:
43794 if (TARGET_AVX)
43795 {
43796 tmp = gen_reg_rtx (V16QImode);
43797 if (elt < 16)
43798 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43799 else
43800 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43801 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43802 return;
43803 }
43804 break;
43805
43806 case V16HImode:
43807 if (TARGET_AVX)
43808 {
43809 tmp = gen_reg_rtx (V8HImode);
43810 if (elt < 8)
43811 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43812 else
43813 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43814 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43815 return;
43816 }
43817 break;
43818
43819 case V8SImode:
43820 if (TARGET_AVX)
43821 {
43822 tmp = gen_reg_rtx (V4SImode);
43823 if (elt < 4)
43824 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43825 else
43826 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43827 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43828 return;
43829 }
43830 break;
43831
43832 case V4DImode:
43833 if (TARGET_AVX)
43834 {
43835 tmp = gen_reg_rtx (V2DImode);
43836 if (elt < 2)
43837 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43838 else
43839 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43840 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43841 return;
43842 }
43843 break;
43844
43845 case V32HImode:
43846 if (TARGET_AVX512BW)
43847 {
43848 tmp = gen_reg_rtx (V16HImode);
43849 if (elt < 16)
43850 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43851 else
43852 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43853 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43854 return;
43855 }
43856 break;
43857
43858 case V64QImode:
43859 if (TARGET_AVX512BW)
43860 {
43861 tmp = gen_reg_rtx (V32QImode);
43862 if (elt < 32)
43863 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43864 else
43865 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43866 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43867 return;
43868 }
43869 break;
43870
43871 case V16SFmode:
43872 tmp = gen_reg_rtx (V8SFmode);
43873 if (elt < 8)
43874 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43875 else
43876 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43877 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43878 return;
43879
43880 case V8DFmode:
43881 tmp = gen_reg_rtx (V4DFmode);
43882 if (elt < 4)
43883 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43884 else
43885 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43886 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43887 return;
43888
43889 case V16SImode:
43890 tmp = gen_reg_rtx (V8SImode);
43891 if (elt < 8)
43892 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43893 else
43894 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43895 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43896 return;
43897
43898 case V8DImode:
43899 tmp = gen_reg_rtx (V4DImode);
43900 if (elt < 4)
43901 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43902 else
43903 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43904 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43905 return;
43906
43907 case V8QImode:
43908 /* ??? Could extract the appropriate HImode element and shift. */
43909 default:
43910 break;
43911 }
43912
43913 if (use_vec_extr)
43914 {
43915 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43916 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43917
43918 /* Let the rtl optimizers know about the zero extension performed. */
43919 if (inner_mode == QImode || inner_mode == HImode)
43920 {
43921 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43922 target = gen_lowpart (SImode, target);
43923 }
43924
43925 emit_insn (gen_rtx_SET (target, tmp));
43926 }
43927 else
43928 {
43929 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43930
43931 emit_move_insn (mem, vec);
43932
43933 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43934 emit_move_insn (target, tmp);
43935 }
43936 }
43937
43938 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43939 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43940 The upper bits of DEST are undefined, though they shouldn't cause
43941 exceptions (some bits from src or all zeros are ok). */
43942
43943 static void
43944 emit_reduc_half (rtx dest, rtx src, int i)
43945 {
43946 rtx tem, d = dest;
43947 switch (GET_MODE (src))
43948 {
43949 case V4SFmode:
43950 if (i == 128)
43951 tem = gen_sse_movhlps (dest, src, src);
43952 else
43953 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43954 GEN_INT (1 + 4), GEN_INT (1 + 4));
43955 break;
43956 case V2DFmode:
43957 tem = gen_vec_interleave_highv2df (dest, src, src);
43958 break;
43959 case V16QImode:
43960 case V8HImode:
43961 case V4SImode:
43962 case V2DImode:
43963 d = gen_reg_rtx (V1TImode);
43964 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43965 GEN_INT (i / 2));
43966 break;
43967 case V8SFmode:
43968 if (i == 256)
43969 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43970 else
43971 tem = gen_avx_shufps256 (dest, src, src,
43972 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43973 break;
43974 case V4DFmode:
43975 if (i == 256)
43976 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43977 else
43978 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43979 break;
43980 case V32QImode:
43981 case V16HImode:
43982 case V8SImode:
43983 case V4DImode:
43984 if (i == 256)
43985 {
43986 if (GET_MODE (dest) != V4DImode)
43987 d = gen_reg_rtx (V4DImode);
43988 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43989 gen_lowpart (V4DImode, src),
43990 const1_rtx);
43991 }
43992 else
43993 {
43994 d = gen_reg_rtx (V2TImode);
43995 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43996 GEN_INT (i / 2));
43997 }
43998 break;
43999 case V64QImode:
44000 case V32HImode:
44001 case V16SImode:
44002 case V16SFmode:
44003 case V8DImode:
44004 case V8DFmode:
44005 if (i > 128)
44006 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44007 gen_lowpart (V16SImode, src),
44008 gen_lowpart (V16SImode, src),
44009 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44010 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44011 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44012 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44013 GEN_INT (0xC), GEN_INT (0xD),
44014 GEN_INT (0xE), GEN_INT (0xF),
44015 GEN_INT (0x10), GEN_INT (0x11),
44016 GEN_INT (0x12), GEN_INT (0x13),
44017 GEN_INT (0x14), GEN_INT (0x15),
44018 GEN_INT (0x16), GEN_INT (0x17));
44019 else
44020 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44021 gen_lowpart (V16SImode, src),
44022 GEN_INT (i == 128 ? 0x2 : 0x1),
44023 GEN_INT (0x3),
44024 GEN_INT (0x3),
44025 GEN_INT (0x3),
44026 GEN_INT (i == 128 ? 0x6 : 0x5),
44027 GEN_INT (0x7),
44028 GEN_INT (0x7),
44029 GEN_INT (0x7),
44030 GEN_INT (i == 128 ? 0xA : 0x9),
44031 GEN_INT (0xB),
44032 GEN_INT (0xB),
44033 GEN_INT (0xB),
44034 GEN_INT (i == 128 ? 0xE : 0xD),
44035 GEN_INT (0xF),
44036 GEN_INT (0xF),
44037 GEN_INT (0xF));
44038 break;
44039 default:
44040 gcc_unreachable ();
44041 }
44042 emit_insn (tem);
44043 if (d != dest)
44044 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44045 }
44046
44047 /* Expand a vector reduction. FN is the binary pattern to reduce;
44048 DEST is the destination; IN is the input vector. */
44049
44050 void
44051 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44052 {
44053 rtx half, dst, vec = in;
44054 machine_mode mode = GET_MODE (in);
44055 int i;
44056
44057 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44058 if (TARGET_SSE4_1
44059 && mode == V8HImode
44060 && fn == gen_uminv8hi3)
44061 {
44062 emit_insn (gen_sse4_1_phminposuw (dest, in));
44063 return;
44064 }
44065
44066 for (i = GET_MODE_BITSIZE (mode);
44067 i > GET_MODE_UNIT_BITSIZE (mode);
44068 i >>= 1)
44069 {
44070 half = gen_reg_rtx (mode);
44071 emit_reduc_half (half, vec, i);
44072 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44073 dst = dest;
44074 else
44075 dst = gen_reg_rtx (mode);
44076 emit_insn (fn (dst, half, vec));
44077 vec = dst;
44078 }
44079 }
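
/* Scalar model of the reduction loop above, for illustration only, here
   specialised to a "max" reduction: each round folds the upper half of
   the vector onto the lower half (emit_reduc_half shifts it down, FN
   combines element-wise), so log2(N) applications of FN leave the
   result in element 0.  The helper is hypothetical and assumes N is a
   power of two no larger than 64.  */

static int
reduc_max_model (const int *elts, int n)
{
  int tmp[64];
  __builtin_memcpy (tmp, elts, n * sizeof (int));
  for (int half = n / 2; half >= 1; half /= 2)
    for (int j = 0; j < half; j++)
      tmp[j] = tmp[j] > tmp[j + half] ? tmp[j] : tmp[j + half];
  return tmp[0];
}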
44080 \f
44081 /* Target hook for scalar_mode_supported_p. */
44082 static bool
44083 ix86_scalar_mode_supported_p (machine_mode mode)
44084 {
44085 if (DECIMAL_FLOAT_MODE_P (mode))
44086 return default_decimal_float_supported_p ();
44087 else if (mode == TFmode)
44088 return true;
44089 else
44090 return default_scalar_mode_supported_p (mode);
44091 }
44092
44093 /* Implements target hook vector_mode_supported_p. */
44094 static bool
44095 ix86_vector_mode_supported_p (machine_mode mode)
44096 {
44097 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44098 return true;
44099 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44100 return true;
44101 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44102 return true;
44103 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44104 return true;
44105 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44106 return true;
44107 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44108 return true;
44109 return false;
44110 }
44111
44112 /* Target hook for c_mode_for_suffix. */
44113 static machine_mode
44114 ix86_c_mode_for_suffix (char suffix)
44115 {
44116 if (suffix == 'q')
44117 return TFmode;
44118 if (suffix == 'w')
44119 return XFmode;
44120
44121 return VOIDmode;
44122 }
44123
44124 /* Worker function for TARGET_MD_ASM_ADJUST.
44125
44126 We implement asm flag outputs, and maintain source compatibility
44127 with the old cc0-based compiler. */
44128
44129 static rtx_insn *
44130 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44131 vec<const char *> &constraints,
44132 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44133 {
44134 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44135 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44136
44137 bool saw_asm_flag = false;
44138
44139 start_sequence ();
44140 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44141 {
44142 const char *con = constraints[i];
44143 if (strncmp (con, "=@cc", 4) != 0)
44144 continue;
44145 con += 4;
44146 if (strchr (con, ',') != NULL)
44147 {
44148 error ("alternatives not allowed in asm flag output");
44149 continue;
44150 }
44151
44152 bool invert = false;
44153 if (con[0] == 'n')
44154 invert = true, con++;
44155
44156 machine_mode mode = CCmode;
44157 rtx_code code = UNKNOWN;
44158
44159 switch (con[0])
44160 {
44161 case 'a':
44162 if (con[1] == 0)
44163 mode = CCAmode, code = EQ;
44164 else if (con[1] == 'e' && con[2] == 0)
44165 mode = CCCmode, code = NE;
44166 break;
44167 case 'b':
44168 if (con[1] == 0)
44169 mode = CCCmode, code = EQ;
44170 else if (con[1] == 'e' && con[2] == 0)
44171 mode = CCAmode, code = NE;
44172 break;
44173 case 'c':
44174 if (con[1] == 0)
44175 mode = CCCmode, code = EQ;
44176 break;
44177 case 'e':
44178 if (con[1] == 0)
44179 mode = CCZmode, code = EQ;
44180 break;
44181 case 'g':
44182 if (con[1] == 0)
44183 mode = CCGCmode, code = GT;
44184 else if (con[1] == 'e' && con[2] == 0)
44185 mode = CCGCmode, code = GE;
44186 break;
44187 case 'l':
44188 if (con[1] == 0)
44189 mode = CCGCmode, code = LT;
44190 else if (con[1] == 'e' && con[2] == 0)
44191 mode = CCGCmode, code = LE;
44192 break;
44193 case 'o':
44194 if (con[1] == 0)
44195 mode = CCOmode, code = EQ;
44196 break;
44197 case 'p':
44198 if (con[1] == 0)
44199 mode = CCPmode, code = EQ;
44200 break;
44201 case 's':
44202 if (con[1] == 0)
44203 mode = CCSmode, code = EQ;
44204 break;
44205 case 'z':
44206 if (con[1] == 0)
44207 mode = CCZmode, code = EQ;
44208 break;
44209 }
44210 if (code == UNKNOWN)
44211 {
44212 error ("unknown asm flag output %qs", constraints[i]);
44213 continue;
44214 }
44215 if (invert)
44216 code = reverse_condition (code);
44217
44218 rtx dest = outputs[i];
44219 if (!saw_asm_flag)
44220 {
44221 /* This is the first asm flag output. Here we put the flags
44222 register in as the real output and adjust the condition to
44223 allow it. */
44224 constraints[i] = "=Bf";
44225 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44226 saw_asm_flag = true;
44227 }
44228 else
44229 {
44230 /* We don't need the flags register as output twice. */
44231 constraints[i] = "=X";
44232 outputs[i] = gen_rtx_SCRATCH (SImode);
44233 }
44234
44235 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44236 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44237
44238 machine_mode dest_mode = GET_MODE (dest);
44239 if (!SCALAR_INT_MODE_P (dest_mode))
44240 {
44241 error ("invalid type for asm flag output");
44242 continue;
44243 }
44244
44245 if (dest_mode == DImode && !TARGET_64BIT)
44246 dest_mode = SImode;
44247
44248 if (dest_mode != QImode)
44249 {
44250 rtx destqi = gen_reg_rtx (QImode);
44251 emit_insn (gen_rtx_SET (destqi, x));
44252
44253 if (TARGET_ZERO_EXTEND_WITH_AND
44254 && optimize_function_for_speed_p (cfun))
44255 {
44256 x = force_reg (dest_mode, const0_rtx);
44257
44258 emit_insn (gen_movstrictqi
44259 (gen_lowpart (QImode, x), destqi));
44260 }
44261 else
44262 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44263 }
44264
44265 if (dest_mode != GET_MODE (dest))
44266 {
44267 rtx tmp = gen_reg_rtx (SImode);
44268
44269 emit_insn (gen_rtx_SET (tmp, x));
44270 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44271 }
44272 else
44273 emit_insn (gen_rtx_SET (dest, x));
44274 }
44275 rtx_insn *seq = get_insns ();
44276 end_sequence ();
44277
44278 if (saw_asm_flag)
44279 return seq;
44280 else
44281 {
44282 /* If we had no asm flag outputs, clobber the flags. */
44283 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44284 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44285 return NULL;
44286 }
44287 }
44288
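/* User-level example of the asm flag outputs handled above (GNU C,
   illustrative only).  The "=@ccc" constraint asks for the carry flag
   as a boolean output, so no explicit setcc or flag-clobbering test is
   needed around the asm.  The helper name is hypothetical.  */

static int
bit_test_model (unsigned long word, unsigned long bit)
{
  int carry;
  __asm__ ("bt %2, %1" : "=@ccc" (carry) : "r" (word), "r" (bit));
  return carry;		/* 1 if bit BIT of WORD is set, else 0 */
}
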
44289 /* Implements the target hook targetm.asm.encode_section_info. */
44290
44291 static void ATTRIBUTE_UNUSED
44292 ix86_encode_section_info (tree decl, rtx rtl, int first)
44293 {
44294 default_encode_section_info (decl, rtl, first);
44295
44296 if (ix86_in_large_data_p (decl))
44297 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44298 }
44299
44300 /* Worker function for REVERSE_CONDITION. */
44301
44302 enum rtx_code
44303 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44304 {
44305 return (mode != CCFPmode && mode != CCFPUmode
44306 ? reverse_condition (code)
44307 : reverse_condition_maybe_unordered (code));
44308 }
44309
44310 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44311 to OPERANDS[0]. */
44312
44313 const char *
44314 output_387_reg_move (rtx_insn *insn, rtx *operands)
44315 {
44316 if (REG_P (operands[0]))
44317 {
44318 if (REG_P (operands[1])
44319 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44320 {
44321 if (REGNO (operands[0]) == FIRST_STACK_REG)
44322 return output_387_ffreep (operands, 0);
44323 return "fstp\t%y0";
44324 }
44325 if (STACK_TOP_P (operands[0]))
44326 return "fld%Z1\t%y1";
44327 return "fst\t%y0";
44328 }
44329 else if (MEM_P (operands[0]))
44330 {
44331 gcc_assert (REG_P (operands[1]));
44332 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44333 return "fstp%Z0\t%y0";
44334 else
44335 {
44336 /* There is no non-popping store to memory for XFmode.
44337 So if we need one, follow the store with a load. */
44338 if (GET_MODE (operands[0]) == XFmode)
44339 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44340 else
44341 return "fst%Z0\t%y0";
44342 }
44343 }
44344 else
44345 gcc_unreachable();
44346 }
44347
44348 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44349 FP status register is set. */
44350
44351 void
44352 ix86_emit_fp_unordered_jump (rtx label)
44353 {
44354 rtx reg = gen_reg_rtx (HImode);
44355 rtx temp;
44356
44357 emit_insn (gen_x86_fnstsw_1 (reg));
44358
44359 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44360 {
44361 emit_insn (gen_x86_sahf_1 (reg));
44362
44363 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44364 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44365 }
44366 else
44367 {
44368 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44369
44370 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44371 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44372 }
44373
44374 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44375 gen_rtx_LABEL_REF (VOIDmode, label),
44376 pc_rtx);
44377 temp = gen_rtx_SET (pc_rtx, temp);
44378
44379 emit_jump_insn (temp);
44380 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44381 }
44382
44383 /* Output code to perform a log1p XFmode calculation. */
44384
44385 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44386 {
44387 rtx_code_label *label1 = gen_label_rtx ();
44388 rtx_code_label *label2 = gen_label_rtx ();
44389
44390 rtx tmp = gen_reg_rtx (XFmode);
44391 rtx tmp2 = gen_reg_rtx (XFmode);
44392 rtx test;
44393
44394 emit_insn (gen_absxf2 (tmp, op1));
44395 test = gen_rtx_GE (VOIDmode, tmp,
44396 const_double_from_real_value (
44397 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44398 XFmode));
44399 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44400
44401 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44402 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44403 emit_jump (label2);
44404
44405 emit_label (label1);
44406 emit_move_insn (tmp, CONST1_RTX (XFmode));
44407 emit_insn (gen_addxf3 (tmp, op1, tmp));
44408 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44409 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44410
44411 emit_label (label2);
44412 }
44413
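/* Scalar model of the i387 sequence above, for illustration only:
   log1p (x) == ln2 * log2 (1 + x).  For small |x| (below 1 - sqrt(2)/2)
   the fyl2xp1 form is used, which takes x directly and avoids the
   cancellation in computing 1.0 + x; otherwise fyl2x is applied to the
   explicit sum.  The helper is hypothetical.  */

static long double
log1p_model (long double x)
{
  const long double ln2 = 0.6931471805599453094172321214581766L;
  if (__builtin_fabsl (x) < 0.29289321881345247561810596348408353L)
    return __builtin_log1pl (x);		/* fyl2xp1 path */
  else
    return ln2 * __builtin_log2l (1.0L + x);	/* fyl2x on 1 + x */
}
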
44414 /* Emit code for round calculation. */
44415 void ix86_emit_i387_round (rtx op0, rtx op1)
44416 {
44417 machine_mode inmode = GET_MODE (op1);
44418 machine_mode outmode = GET_MODE (op0);
44419 rtx e1, e2, res, tmp, tmp1, half;
44420 rtx scratch = gen_reg_rtx (HImode);
44421 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44422 rtx_code_label *jump_label = gen_label_rtx ();
44423 rtx insn;
44424 rtx (*gen_abs) (rtx, rtx);
44425 rtx (*gen_neg) (rtx, rtx);
44426
44427 switch (inmode)
44428 {
44429 case SFmode:
44430 gen_abs = gen_abssf2;
44431 break;
44432 case DFmode:
44433 gen_abs = gen_absdf2;
44434 break;
44435 case XFmode:
44436 gen_abs = gen_absxf2;
44437 break;
44438 default:
44439 gcc_unreachable ();
44440 }
44441
44442 switch (outmode)
44443 {
44444 case SFmode:
44445 gen_neg = gen_negsf2;
44446 break;
44447 case DFmode:
44448 gen_neg = gen_negdf2;
44449 break;
44450 case XFmode:
44451 gen_neg = gen_negxf2;
44452 break;
44453 case HImode:
44454 gen_neg = gen_neghi2;
44455 break;
44456 case SImode:
44457 gen_neg = gen_negsi2;
44458 break;
44459 case DImode:
44460 gen_neg = gen_negdi2;
44461 break;
44462 default:
44463 gcc_unreachable ();
44464 }
44465
44466 e1 = gen_reg_rtx (inmode);
44467 e2 = gen_reg_rtx (inmode);
44468 res = gen_reg_rtx (outmode);
44469
44470 half = const_double_from_real_value (dconsthalf, inmode);
44471
44472 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44473
44474 /* scratch = fxam(op1) */
44475 emit_insn (gen_rtx_SET (scratch,
44476 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44477 UNSPEC_FXAM)));
44478 /* e1 = fabs(op1) */
44479 emit_insn (gen_abs (e1, op1));
44480
44481 /* e2 = e1 + 0.5 */
44482 half = force_reg (inmode, half);
44483 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44484
44485 /* res = floor(e2) */
44486 if (inmode != XFmode)
44487 {
44488 tmp1 = gen_reg_rtx (XFmode);
44489
44490 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44491 }
44492 else
44493 tmp1 = e2;
44494
44495 switch (outmode)
44496 {
44497 case SFmode:
44498 case DFmode:
44499 {
44500 rtx tmp0 = gen_reg_rtx (XFmode);
44501
44502 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44503
44504 emit_insn (gen_rtx_SET (res,
44505 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44506 UNSPEC_TRUNC_NOOP)));
44507 }
44508 break;
44509 case XFmode:
44510 emit_insn (gen_frndintxf2_floor (res, tmp1));
44511 break;
44512 case HImode:
44513 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44514 break;
44515 case SImode:
44516 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44517 break;
44518 case DImode:
44519 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44520 break;
44521 default:
44522 gcc_unreachable ();
44523 }
44524
44525 /* flags = signbit(a) */
44526 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44527
44528 /* if (flags) then res = -res */
44529 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44530 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44531 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44532 pc_rtx);
44533 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44534 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44535 JUMP_LABEL (insn) = jump_label;
44536
44537 emit_insn (gen_neg (res, res));
44538
44539 emit_label (jump_label);
44540 LABEL_NUSES (jump_label) = 1;
44541
44542 emit_move_insn (op0, res);
44543 }
44544
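/* Scalar model of the rounding sequence above, for illustration only:
   round (a) == sgn (a) * floor (fabs (a) + 0.5), with the sign taken
   from the fxam status bits rather than a comparison, so negative
   values and -0.0 are handled uniformly.  The helper is hypothetical.  */

static double
i387_round_model (double a)
{
  double res = __builtin_floor (__builtin_fabs (a) + 0.5);
  if (__builtin_signbit (a))
    res = -res;
  return res;
}
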
44545 /* Output code to perform a Newton-Raphson approximation of a single precision
44546 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44547
44548 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44549 {
44550 rtx x0, x1, e0, e1;
44551
44552 x0 = gen_reg_rtx (mode);
44553 e0 = gen_reg_rtx (mode);
44554 e1 = gen_reg_rtx (mode);
44555 x1 = gen_reg_rtx (mode);
44556
44557 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
44558
44559 b = force_reg (mode, b);
44560
44561 /* x0 = rcp(b) estimate */
44562 if (mode == V16SFmode || mode == V8DFmode)
44563 {
44564 if (TARGET_AVX512ER)
44565 {
44566 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44567 UNSPEC_RCP28)));
44568 /* res = a * x0 */
44569 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44570 return;
44571 }
44572 else
44573 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44574 UNSPEC_RCP14)));
44575 }
44576 else
44577 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44578 UNSPEC_RCP)));
44579
44580 /* e0 = x0 * b */
44581 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44582
44583 /* e0 = x0 * e0 */
44584 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44585
44586 /* e1 = x0 + x0 */
44587 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44588
44589 /* x1 = e1 - e0 */
44590 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44591
44592 /* res = a * x1 */
44593 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
44594 }
44595
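/* Scalar model of the sequence above, for illustration only: one
   Newton-Raphson step refines the rcpss estimate X0 of 1/b, and the
   quotient is approximated as a * x1.  Algebraically
   x1 = (x0 + x0) - b*x0*x0 = x0 * (2 - b*x0).  The helper is
   hypothetical.  */

static float
swdiv_model (float a, float b, float x0 /* ~ 1/b from rcpss */)
{
  float e0 = x0 * b;	/* e0 = x0 * b */
  e0 = x0 * e0;		/* e0 = b * x0 * x0 */
  float e1 = x0 + x0;	/* e1 = 2 * x0 */
  float x1 = e1 - e0;	/* refined reciprocal */
  return a * x1;	/* a / b ~= a * x1 */
}
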
44596 /* Output code to perform a Newton-Raphson approximation of a
44597 single precision floating point [reciprocal] square root. */
44598
44599 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44600 {
44601 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44602 REAL_VALUE_TYPE r;
44603 int unspec;
44604
44605 x0 = gen_reg_rtx (mode);
44606 e0 = gen_reg_rtx (mode);
44607 e1 = gen_reg_rtx (mode);
44608 e2 = gen_reg_rtx (mode);
44609 e3 = gen_reg_rtx (mode);
44610
44611 if (TARGET_AVX512ER && mode == V16SFmode)
44612 {
44613 if (recip)
44614 /* res = rsqrt28(a) estimate */
44615 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44616 UNSPEC_RSQRT28)));
44617 else
44618 {
44619 /* x0 = rsqrt28(a) estimate */
44620 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44621 UNSPEC_RSQRT28)));
44622 /* res = rcp28(x0) estimate */
44623 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44624 UNSPEC_RCP28)));
44625 }
44626 return;
44627 }
44628
44629 real_from_integer (&r, VOIDmode, -3, SIGNED);
44630 mthree = const_double_from_real_value (r, SFmode);
44631
44632 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44633 mhalf = const_double_from_real_value (r, SFmode);
44634 unspec = UNSPEC_RSQRT;
44635
44636 if (VECTOR_MODE_P (mode))
44637 {
44638 mthree = ix86_build_const_vector (mode, true, mthree);
44639 mhalf = ix86_build_const_vector (mode, true, mhalf);
44640 /* There is no 512-bit rsqrt. There is however rsqrt14. */
44641 if (GET_MODE_SIZE (mode) == 64)
44642 unspec = UNSPEC_RSQRT14;
44643 }
44644
44645 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44646 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
44647
44648 a = force_reg (mode, a);
44649
44650 /* x0 = rsqrt(a) estimate */
44651 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44652 unspec)));
44653
44654 /* If (a == 0.0), filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
44655 if (!recip)
44656 {
44657 rtx zero = force_reg (mode, CONST0_RTX(mode));
44658 rtx mask;
44659
44660 /* Handle masked compare. */
44661 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44662 {
44663 mask = gen_reg_rtx (HImode);
44664 /* Imm value 0x4 corresponds to not-equal comparison. */
44665 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44666 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44667 }
44668 else
44669 {
44670 mask = gen_reg_rtx (mode);
44671 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44672 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44673 }
44674 }
44675
44676 /* e0 = x0 * a */
44677 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44678 /* e1 = e0 * x0 */
44679 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44680
44681 /* e2 = e1 - 3. */
44682 mthree = force_reg (mode, mthree);
44683 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44684
44685 mhalf = force_reg (mode, mhalf);
44686 if (recip)
44687 /* e3 = -.5 * x0 */
44688 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44689 else
44690 /* e3 = -.5 * e0 */
44691 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44692 /* ret = e2 * e3 */
44693 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
44694 }
44695
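/* Scalar model of the sequence above, for illustration only, following
   the formulas in the comment: one Newton-Raphson step on the rsqrtss
   estimate X0 of 1/sqrt(a).  The zero-input masking is modelled with an
   explicit test.  The helper is hypothetical.  */

static float
swsqrt_model (float a, float x0 /* ~ 1/sqrt(a) from rsqrtss */, int recip)
{
  if (!recip && a == 0.0f)
    x0 = 0.0f;			/* avoid inf * 0 -> NaN for sqrt (0.0) */
  float e0 = x0 * a;		/* e0 = a * x0 */
  float e1 = e0 * x0;		/* e1 = a * x0 * x0 */
  float e2 = e1 - 3.0f;		/* e2 = a * x0 * x0 - 3.0 */
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;		/* sqrt (a), or 1/sqrt (a) if RECIP */
}
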
44696 #ifdef TARGET_SOLARIS
44697 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44698
44699 static void
44700 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44701 tree decl)
44702 {
44703 /* With Binutils 2.15, the "@unwind" marker must be specified on
44704 every occurrence of the ".eh_frame" section, not just the first
44705 one. */
44706 if (TARGET_64BIT
44707 && strcmp (name, ".eh_frame") == 0)
44708 {
44709 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44710 flags & SECTION_WRITE ? "aw" : "a");
44711 return;
44712 }
44713
44714 #ifndef USE_GAS
44715 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44716 {
44717 solaris_elf_asm_comdat_section (name, flags, decl);
44718 return;
44719 }
44720 #endif
44721
44722 default_elf_asm_named_section (name, flags, decl);
44723 }
44724 #endif /* TARGET_SOLARIS */
44725
44726 /* Return the mangling of TYPE if it is an extended fundamental type. */
44727
44728 static const char *
44729 ix86_mangle_type (const_tree type)
44730 {
44731 type = TYPE_MAIN_VARIANT (type);
44732
44733 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44734 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44735 return NULL;
44736
44737 switch (TYPE_MODE (type))
44738 {
44739 case TFmode:
44740 /* __float128 is "g". */
44741 return "g";
44742 case XFmode:
44743 /* "long double" or __float80 is "e". */
44744 return "e";
44745 default:
44746 return NULL;
44747 }
44748 }
44749
44750 #ifdef TARGET_THREAD_SSP_OFFSET
44751 /* If using TLS guards, don't waste time creating and expanding
44752 __stack_chk_guard decl and MEM as we are going to ignore it. */
44753 static tree
44754 ix86_stack_protect_guard (void)
44755 {
44756 if (TARGET_SSP_TLS_GUARD)
44757 return NULL_TREE;
44758 return default_stack_protect_guard ();
44759 }
44760 #endif
44761
44762 /* For 32-bit code we can save PIC register setup by using
44763 __stack_chk_fail_local hidden function instead of calling
44764 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44765 register, so it is better to call __stack_chk_fail directly. */
44766
44767 static tree ATTRIBUTE_UNUSED
44768 ix86_stack_protect_fail (void)
44769 {
44770 return TARGET_64BIT
44771 ? default_external_stack_protect_fail ()
44772 : default_hidden_stack_protect_fail ();
44773 }
44774
44775 /* Select a format to encode pointers in exception handling data. CODE
44776 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44777 true if the symbol may be affected by dynamic relocations.
44778
44779 ??? All x86 object file formats are capable of representing this.
44780 After all, the relocation needed is the same as for the call insn.
44781 Whether or not a particular assembler allows us to enter such, I
44782 guess we'll have to see. */
44783 int
44784 asm_preferred_eh_data_format (int code, int global)
44785 {
44786 if (flag_pic)
44787 {
44788 int type = DW_EH_PE_sdata8;
44789 if (!TARGET_64BIT
44790 || ix86_cmodel == CM_SMALL_PIC
44791 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44792 type = DW_EH_PE_sdata4;
44793 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44794 }
44795 if (ix86_cmodel == CM_SMALL
44796 || (ix86_cmodel == CM_MEDIUM && code))
44797 return DW_EH_PE_udata4;
44798 return DW_EH_PE_absptr;
44799 }
44800 \f
44801 /* Expand copysign from SIGN to the positive value ABS_VALUE
44802 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44803 the sign-bit. */
44804 static void
44805 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44806 {
44807 machine_mode mode = GET_MODE (sign);
44808 rtx sgn = gen_reg_rtx (mode);
44809 if (mask == NULL_RTX)
44810 {
44811 machine_mode vmode;
44812
44813 if (mode == SFmode)
44814 vmode = V4SFmode;
44815 else if (mode == DFmode)
44816 vmode = V2DFmode;
44817 else
44818 vmode = mode;
44819
44820 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44821 if (!VECTOR_MODE_P (mode))
44822 {
44823 /* We need to generate a scalar mode mask in this case. */
44824 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44825 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44826 mask = gen_reg_rtx (mode);
44827 emit_insn (gen_rtx_SET (mask, tmp));
44828 }
44829 }
44830 else
44831 mask = gen_rtx_NOT (mode, mask);
44832 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44833 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44834 }
44835
44836 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44837 mask for masking out the sign-bit is stored in *SMASK, if that is
44838 non-null. */
44839 static rtx
44840 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44841 {
44842 machine_mode vmode, mode = GET_MODE (op0);
44843 rtx xa, mask;
44844
44845 xa = gen_reg_rtx (mode);
44846 if (mode == SFmode)
44847 vmode = V4SFmode;
44848 else if (mode == DFmode)
44849 vmode = V2DFmode;
44850 else
44851 vmode = mode;
44852 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44853 if (!VECTOR_MODE_P (mode))
44854 {
44855 /* We need to generate a scalar mode mask in this case. */
44856 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44857 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44858 mask = gen_reg_rtx (mode);
44859 emit_insn (gen_rtx_SET (mask, tmp));
44860 }
44861 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44862
44863 if (smask)
44864 *smask = mask;
44865
44866 return xa;
44867 }
44868
44869 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44870 swapping the operands if SWAP_OPERANDS is true. The expanded
44871 code is a forward jump to a newly created label in case the
44872 comparison is true. The generated label rtx is returned. */
44873 static rtx_code_label *
44874 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44875 bool swap_operands)
44876 {
44877 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
44878 rtx_code_label *label;
44879 rtx tmp;
44880
44881 if (swap_operands)
44882 std::swap (op0, op1);
44883
44884 label = gen_label_rtx ();
44885 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
44886 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
44887 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
44888 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44889 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44890 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44891 JUMP_LABEL (tmp) = label;
44892
44893 return label;
44894 }
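/* The emitted RTL has roughly this shape (a sketch, with modes elided):
     (set (reg FLAGS_REG) (compare op0 op1))
     (set (pc) (if_then_else (CODE (reg FLAGS_REG) (const_int 0))
                             (label_ref LABEL)
                             (pc)))
   i.e. a conditional forward branch to the returned label.  */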
44895
44896 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44897 using comparison code CODE. Operands are swapped for the comparison if
44898 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44899 static rtx
44900 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44901 bool swap_operands)
44902 {
44903 rtx (*insn)(rtx, rtx, rtx, rtx);
44904 machine_mode mode = GET_MODE (op0);
44905 rtx mask = gen_reg_rtx (mode);
44906
44907 if (swap_operands)
44908 std::swap (op0, op1);
44909
44910 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44911
44912 emit_insn (insn (mask, op0, op1,
44913 gen_rtx_fmt_ee (code, mode, op0, op1)));
44914 return mask;
44915 }
44916
44917 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44918 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44919 static rtx
44920 ix86_gen_TWO52 (machine_mode mode)
44921 {
44922 REAL_VALUE_TYPE TWO52r;
44923 rtx TWO52;
44924
44925 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44926 TWO52 = const_double_from_real_value (TWO52r, mode);
44927 TWO52 = force_reg (mode, TWO52);
44928
44929 return TWO52;
44930 }
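/* Why TWO52 helps (a sketch, assuming the default round-to-nearest mode):
   for 0 <= xa < 2**52 the spacing of representable doubles in
   [2**52, 2**53) is exactly 1.0, so
     xa + 2**52        rounds xa to the nearest integer, and
     ...    - 2**52    recovers that integer as a double.
   E.g. xa = 3.7: 3.7 + 2**52 rounds to 2**52 + 4, and subtracting 2**52
   yields 4.0, i.e. rint (3.7).  The SFmode analogue uses 2**23.  */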
44931
44932 /* Expand SSE sequence for computing lround from OP1 storing
44933 into OP0. */
44934 void
44935 ix86_expand_lround (rtx op0, rtx op1)
44936 {
44937 /* C code for the stuff we're doing below:
44938 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44939 return (long)tmp;
44940 */
44941 machine_mode mode = GET_MODE (op1);
44942 const struct real_format *fmt;
44943 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44944 rtx adj;
44945
44946 /* load nextafter (0.5, 0.0) */
44947 fmt = REAL_MODE_FORMAT (mode);
44948 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44949 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44950
44951 /* adj = copysign (0.5, op1) */
44952 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44953 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44954
44955 /* adj = op1 + adj */
44956 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44957
44958 /* op0 = (imode)adj */
44959 expand_fix (op0, adj, 0);
44960 }
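/* Why nextafter (0.5, 0.0) rather than 0.5 (a sketch): for the largest
   double below 0.5, x + 0.5 rounds up to exactly 1.0, so the naive
   (long)(x + 0.5) would return 1 instead of 0.  Adding
   pred (0.5) = 0.5 - 2**-54 keeps the sum strictly below 1.0, and the
   truncation then gives the correctly rounded result.  */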
44961
44962 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
44963 into OP0. */
44964 void
44965 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44966 {
44967 /* C code for the stuff we're doing below (for do_floor):
44968 xi = (long)op1;
44969 xi -= (double)xi > op1 ? 1 : 0;
44970 return xi;
44971 */
44972 machine_mode fmode = GET_MODE (op1);
44973 machine_mode imode = GET_MODE (op0);
44974 rtx ireg, freg, tmp;
44975 rtx_code_label *label;
44976
44977 /* reg = (long)op1 */
44978 ireg = gen_reg_rtx (imode);
44979 expand_fix (ireg, op1, 0);
44980
44981 /* freg = (double)reg */
44982 freg = gen_reg_rtx (fmode);
44983 expand_float (freg, ireg, 0);
44984
44985 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44986 label = ix86_expand_sse_compare_and_jump (UNLE,
44987 freg, op1, !do_floor);
44988 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44989 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44990 emit_move_insn (ireg, tmp);
44991
44992 emit_label (label);
44993 LABEL_NUSES (label) = 1;
44994
44995 emit_move_insn (op0, ireg);
44996 }
44997
44998 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44999 result in OPERAND0. */
45000 void
45001 ix86_expand_rint (rtx operand0, rtx operand1)
45002 {
45003 /* C code for the stuff we're doing below:
45004 xa = fabs (operand1);
45005 if (!isless (xa, 2**52))
45006 return operand1;
45007 xa = xa + 2**52 - 2**52;
45008 return copysign (xa, operand1);
45009 */
45010 machine_mode mode = GET_MODE (operand0);
45011 rtx res, xa, TWO52, mask;
45012 rtx_code_label *label;
45013
45014 res = gen_reg_rtx (mode);
45015 emit_move_insn (res, operand1);
45016
45017 /* xa = abs (operand1) */
45018 xa = ix86_expand_sse_fabs (res, &mask);
45019
45020 /* if (!isless (xa, TWO52)) goto label; */
45021 TWO52 = ix86_gen_TWO52 (mode);
45022 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45023
45024 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45025 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45026
45027 ix86_sse_copysign_to_positive (res, xa, res, mask);
45028
45029 emit_label (label);
45030 LABEL_NUSES (label) = 1;
45031
45032 emit_move_insn (operand0, res);
45033 }
45034
45035 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45036 into OPERAND0. */
45037 void
45038 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45039 {
45040 /* C code for the stuff we expand below.
45041 double xa = fabs (x), x2;
45042 if (!isless (xa, TWO52))
45043 return x;
45044 xa = xa + TWO52 - TWO52;
45045 x2 = copysign (xa, x);
45046 Compensate. Floor:
45047 if (x2 > x)
45048 x2 -= 1;
45049 Compensate. Ceil:
45050 if (x2 < x)
45051 x2 -= -1;
45052 return x2;
45053 */
45054 machine_mode mode = GET_MODE (operand0);
45055 rtx xa, TWO52, tmp, one, res, mask;
45056 rtx_code_label *label;
45057
45058 TWO52 = ix86_gen_TWO52 (mode);
45059
45060 /* Temporary for holding the result, initialized to the input
45061 operand to ease control flow. */
45062 res = gen_reg_rtx (mode);
45063 emit_move_insn (res, operand1);
45064
45065 /* xa = abs (operand1) */
45066 xa = ix86_expand_sse_fabs (res, &mask);
45067
45068 /* if (!isless (xa, TWO52)) goto label; */
45069 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45070
45071 /* xa = xa + TWO52 - TWO52; */
45072 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45073 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45074
45075 /* xa = copysign (xa, operand1) */
45076 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45077
45078 /* generate 1.0 or -1.0 */
45079 one = force_reg (mode,
45080 const_double_from_real_value (do_floor
45081 ? dconst1 : dconstm1, mode));
45082
45083 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45084 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45085 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45086 /* We always need to subtract here to preserve signed zero. */
45087 tmp = expand_simple_binop (mode, MINUS,
45088 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45089 emit_move_insn (res, tmp);
45090
45091 emit_label (label);
45092 LABEL_NUSES (label) = 1;
45093
45094 emit_move_insn (operand0, res);
45095 }
45096
45097 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45098 into OPERAND0. */
45099 void
45100 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45101 {
45102 /* C code for the stuff we expand below.
45103 double xa = fabs (x), x2;
45104 if (!isless (xa, TWO52))
45105 return x;
45106 x2 = (double)(long)x;
45107 Compensate. Floor:
45108 if (x2 > x)
45109 x2 -= 1;
45110 Compensate. Ceil:
45111 if (x2 < x)
45112 x2 += 1;
45113 if (HONOR_SIGNED_ZEROS (mode))
45114 return copysign (x2, x);
45115 return x2;
45116 */
45117 machine_mode mode = GET_MODE (operand0);
45118 rtx xa, xi, TWO52, tmp, one, res, mask;
45119 rtx_code_label *label;
45120
45121 TWO52 = ix86_gen_TWO52 (mode);
45122
45123 /* Temporary for holding the result, initialized to the input
45124 operand to ease control flow. */
45125 res = gen_reg_rtx (mode);
45126 emit_move_insn (res, operand1);
45127
45128 /* xa = abs (operand1) */
45129 xa = ix86_expand_sse_fabs (res, &mask);
45130
45131 /* if (!isless (xa, TWO52)) goto label; */
45132 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45133
45134 /* xa = (double)(long)x */
45135 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45136 expand_fix (xi, res, 0);
45137 expand_float (xa, xi, 0);
45138
45139 /* generate 1.0 */
45140 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45141
45142 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45143 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45144 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45145 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45146 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45147 emit_move_insn (res, tmp);
45148
45149 if (HONOR_SIGNED_ZEROS (mode))
45150 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45151
45152 emit_label (label);
45153 LABEL_NUSES (label) = 1;
45154
45155 emit_move_insn (operand0, res);
45156 }
45157
45158 /* Expand SSE sequence for computing round from OPERAND1 storing
45159 into OPERAND0. The sequence works without relying on DImode truncation
45160 via cvttsd2siq, which is only available on 64-bit targets. */
45161 void
45162 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45163 {
45164 /* C code for the stuff we expand below.
45165 double xa = fabs (x), xa2, x2;
45166 if (!isless (xa, TWO52))
45167 return x;
45168 Using the absolute value and copying back sign makes
45169 -0.0 -> -0.0 correct.
45170 xa2 = xa + TWO52 - TWO52;
45171 Compensate.
45172 dxa = xa2 - xa;
45173 if (dxa <= -0.5)
45174 xa2 += 1;
45175 else if (dxa > 0.5)
45176 xa2 -= 1;
45177 x2 = copysign (xa2, x);
45178 return x2;
45179 */
45180 machine_mode mode = GET_MODE (operand0);
45181 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45182 rtx_code_label *label;
45183
45184 TWO52 = ix86_gen_TWO52 (mode);
45185
45186 /* Temporary for holding the result, initialized to the input
45187 operand to ease control flow. */
45188 res = gen_reg_rtx (mode);
45189 emit_move_insn (res, operand1);
45190
45191 /* xa = abs (operand1) */
45192 xa = ix86_expand_sse_fabs (res, &mask);
45193
45194 /* if (!isless (xa, TWO52)) goto label; */
45195 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45196
45197 /* xa2 = xa + TWO52 - TWO52; */
45198 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45199 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45200
45201 /* dxa = xa2 - xa; */
45202 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45203
45204 /* generate 0.5, 1.0 and -0.5 */
45205 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45206 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45207 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45208 0, OPTAB_DIRECT);
45209
45210 /* Compensate. */
45211 tmp = gen_reg_rtx (mode);
45212 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45213 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45214 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45215 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45216 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45217 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45218 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45219 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45220
45221 /* res = copysign (xa2, operand1) */
45222 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45223
45224 emit_label (label);
45225 LABEL_NUSES (label) = 1;
45226
45227 emit_move_insn (operand0, res);
45228 }
45229
45230 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45231 into OPERAND0. */
45232 void
45233 ix86_expand_trunc (rtx operand0, rtx operand1)
45234 {
45235 /* C code for the SSE variant we expand below.
45236 double xa = fabs (x), x2;
45237 if (!isless (xa, TWO52))
45238 return x;
45239 x2 = (double)(long)x;
45240 if (HONOR_SIGNED_ZEROS (mode))
45241 return copysign (x2, x);
45242 return x2;
45243 */
45244 machine_mode mode = GET_MODE (operand0);
45245 rtx xa, xi, TWO52, res, mask;
45246 rtx_code_label *label;
45247
45248 TWO52 = ix86_gen_TWO52 (mode);
45249
45250 /* Temporary for holding the result, initialized to the input
45251 operand to ease control flow. */
45252 res = gen_reg_rtx (mode);
45253 emit_move_insn (res, operand1);
45254
45255 /* xa = abs (operand1) */
45256 xa = ix86_expand_sse_fabs (res, &mask);
45257
45258 /* if (!isless (xa, TWO52)) goto label; */
45259 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45260
45261 /* x = (double)(long)x */
45262 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45263 expand_fix (xi, res, 0);
45264 expand_float (res, xi, 0);
45265
45266 if (HONOR_SIGNED_ZEROS (mode))
45267 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45268
45269 emit_label (label);
45270 LABEL_NUSES (label) = 1;
45271
45272 emit_move_insn (operand0, res);
45273 }
45274
45275 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45276 into OPERAND0. Works without DImode truncation (for 32-bit targets). */
45277 void
45278 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45279 {
45280 machine_mode mode = GET_MODE (operand0);
45281 rtx xa, mask, TWO52, one, res, smask, tmp;
45282 rtx_code_label *label;
45283
45284 /* C code for the SSE variant we expand below.
45285 double xa = fabs (x), xa2, x2;
45286 if (!isless (xa, TWO52))
45287 return x;
45288 xa2 = xa + TWO52 - TWO52;
45289 Compensate:
45290 if (xa2 > xa)
45291 xa2 -= 1.0;
45292 x2 = copysign (xa2, x);
45293 return x2;
45294 */
45295
45296 TWO52 = ix86_gen_TWO52 (mode);
45297
45298 /* Temporary for holding the result, initialized to the input
45299 operand to ease control flow. */
45300 res = gen_reg_rtx (mode);
45301 emit_move_insn (res, operand1);
45302
45303 /* xa = abs (operand1) */
45304 xa = ix86_expand_sse_fabs (res, &smask);
45305
45306 /* if (!isless (xa, TWO52)) goto label; */
45307 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45308
45309 /* res = xa + TWO52 - TWO52; */
45310 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45311 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45312 emit_move_insn (res, tmp);
45313
45314 /* generate 1.0 */
45315 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45316
45317 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45318 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45319 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45320 tmp = expand_simple_binop (mode, MINUS,
45321 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45322 emit_move_insn (res, tmp);
45323
45324 /* res = copysign (res, operand1) */
45325 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45326
45327 emit_label (label);
45328 LABEL_NUSES (label) = 1;
45329
45330 emit_move_insn (operand0, res);
45331 }
45332
45333 /* Expand SSE sequence for computing round from OPERAND1 storing
45334 into OPERAND0. */
45335 void
45336 ix86_expand_round (rtx operand0, rtx operand1)
45337 {
45338 /* C code for the stuff we're doing below:
45339 double xa = fabs (x);
45340 if (!isless (xa, TWO52))
45341 return x;
45342 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45343 return copysign (xa, x);
45344 */
45345 machine_mode mode = GET_MODE (operand0);
45346 rtx res, TWO52, xa, xi, half, mask;
45347 rtx_code_label *label;
45348 const struct real_format *fmt;
45349 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45350
45351 /* Temporary for holding the result, initialized to the input
45352 operand to ease control flow. */
45353 res = gen_reg_rtx (mode);
45354 emit_move_insn (res, operand1);
45355
45356 TWO52 = ix86_gen_TWO52 (mode);
45357 xa = ix86_expand_sse_fabs (res, &mask);
45358 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45359
45360 /* load nextafter (0.5, 0.0) */
45361 fmt = REAL_MODE_FORMAT (mode);
45362 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45363 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45364
45365 /* xa = xa + 0.5 */
45366 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45367 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45368
45369 /* xa = (double)(int64_t)xa */
45370 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45371 expand_fix (xi, xa, 0);
45372 expand_float (xa, xi, 0);
45373
45374 /* res = copysign (xa, operand1) */
45375 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45376
45377 emit_label (label);
45378 LABEL_NUSES (label) = 1;
45379
45380 emit_move_insn (operand0, res);
45381 }
45382
45383 /* Expand SSE sequence for computing round
45384 from OP1 storing into OP0 using sse4 round insn. */
45385 void
45386 ix86_expand_round_sse4 (rtx op0, rtx op1)
45387 {
45388 machine_mode mode = GET_MODE (op0);
45389 rtx e1, e2, res, half;
45390 const struct real_format *fmt;
45391 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45392 rtx (*gen_copysign) (rtx, rtx, rtx);
45393 rtx (*gen_round) (rtx, rtx, rtx);
45394
45395 switch (mode)
45396 {
45397 case SFmode:
45398 gen_copysign = gen_copysignsf3;
45399 gen_round = gen_sse4_1_roundsf2;
45400 break;
45401 case DFmode:
45402 gen_copysign = gen_copysigndf3;
45403 gen_round = gen_sse4_1_rounddf2;
45404 break;
45405 default:
45406 gcc_unreachable ();
45407 }
45408
45409 /* round (a) = trunc (a + copysign (0.5, a)) */
45410
45411 /* load nextafter (0.5, 0.0) */
45412 fmt = REAL_MODE_FORMAT (mode);
45413 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45414 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45415 half = const_double_from_real_value (pred_half, mode);
45416
45417 /* e1 = copysign (0.5, op1) */
45418 e1 = gen_reg_rtx (mode);
45419 emit_insn (gen_copysign (e1, half, op1));
45420
45421 /* e2 = op1 + e1 */
45422 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45423
45424 /* res = trunc (e2) */
45425 res = gen_reg_rtx (mode);
45426 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45427
45428 emit_move_insn (op0, res);
45429 }
45430 \f
45431
45432 /* Table of valid machine attributes. */
45433 static const struct attribute_spec ix86_attribute_table[] =
45434 {
45435 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
45436 affects_type_identity } */
45437 /* Stdcall attribute says callee is responsible for popping arguments
45438 if they are not variable. */
45439 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45440 true },
45441 /* Fastcall attribute says callee is responsible for popping arguments
45442 if they are not variable. */
45443 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45444 true },
45445 /* Thiscall attribute says callee is responsible for popping arguments
45446 if they are not variable. */
45447 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45448 true },
45449 /* Cdecl attribute says the callee is a normal C declaration. */
45450 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45451 true },
45452 /* Regparm attribute specifies how many integer arguments are to be
45453 passed in registers. */
45454 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
45455 true },
45456 /* Sseregparm attribute says we are using x86_64 calling conventions
45457 for FP arguments. */
45458 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45459 true },
45460 /* The transactional memory builtins are implicitly regparm or fastcall
45461 depending on the ABI. Override the generic do-nothing attribute that
45462 these builtins were declared with. */
45463 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
45464 true },
45465 /* force_align_arg_pointer says this function realigns the stack at entry. */
45466 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45467 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
45468 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45469 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
45470 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
45471 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
45472 false },
45473 #endif
45474 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45475 false },
45476 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45477 false },
45478 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45479 SUBTARGET_ATTRIBUTE_TABLE,
45480 #endif
45481 /* ms_abi and sysv_abi calling convention function attributes. */
45482 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45483 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45484 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
45485 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
45486 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
45487 false },
45488 { "callee_pop_aggregate_return", 1, 1, false, true, true,
45489 ix86_handle_callee_pop_aggregate_return, true },
45490 { "interrupt", 0, 0, false, true, true,
45491 ix86_handle_interrupt_attribute, false },
45492 { "no_caller_saved_registers", 0, 0, false, true, true,
45493 ix86_handle_no_caller_saved_registers_attribute, false },
45494
45495 /* End element. */
45496 { NULL, 0, 0, false, false, false, NULL, false }
45497 };
45498
45499 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45500 static int
45501 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45502 tree vectype, int)
45503 {
45504 switch (type_of_cost)
45505 {
45506 case scalar_stmt:
45507 return ix86_cost->scalar_stmt_cost;
45508
45509 case scalar_load:
45510 return ix86_cost->scalar_load_cost;
45511
45512 case scalar_store:
45513 return ix86_cost->scalar_store_cost;
45514
45515 case vector_stmt:
45516 return ix86_cost->vec_stmt_cost;
45517
45518 case vector_load:
45519 return ix86_cost->vec_align_load_cost;
45520
45521 case vector_store:
45522 return ix86_cost->vec_store_cost;
45523
45524 case vec_to_scalar:
45525 return ix86_cost->vec_to_scalar_cost;
45526
45527 case scalar_to_vec:
45528 return ix86_cost->scalar_to_vec_cost;
45529
45530 case unaligned_load:
45531 case unaligned_store:
45532 return ix86_cost->vec_unalign_load_cost;
45533
45534 case cond_branch_taken:
45535 return ix86_cost->cond_taken_branch_cost;
45536
45537 case cond_branch_not_taken:
45538 return ix86_cost->cond_not_taken_branch_cost;
45539
45540 case vec_perm:
45541 case vec_promote_demote:
45542 return ix86_cost->vec_stmt_cost;
45543
45544 case vec_construct:
45545 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
45546
45547 default:
45548 gcc_unreachable ();
45549 }
45550 }
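/* For example, building a V4SF vector element by element (vec_construct)
   is charged 3 * vec_stmt_cost, i.e. roughly one insertion per element
   beyond the first.  */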
45551
45552 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45553 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45554 insn every time. */
45555
45556 static GTY(()) rtx_insn *vselect_insn;
45557
45558 /* Initialize vselect_insn. */
45559
45560 static void
45561 init_vselect_insn (void)
45562 {
45563 unsigned i;
45564 rtx x;
45565
45566 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45567 for (i = 0; i < MAX_VECT_LEN; ++i)
45568 XVECEXP (x, 0, i) = const0_rtx;
45569 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45570 const0_rtx), x);
45571 x = gen_rtx_SET (const0_rtx, x);
45572 start_sequence ();
45573 vselect_insn = emit_insn (x);
45574 end_sequence ();
45575 }
45576
45577 /* Construct (set target (vec_select op0 (parallel perm))) and
45578 return true if that's a valid instruction in the active ISA. */
45579
45580 static bool
45581 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45582 unsigned nelt, bool testing_p)
45583 {
45584 unsigned int i;
45585 rtx x, save_vconcat;
45586 int icode;
45587
45588 if (vselect_insn == NULL_RTX)
45589 init_vselect_insn ();
45590
45591 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45592 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45593 for (i = 0; i < nelt; ++i)
45594 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45595 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45596 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45597 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45598 SET_DEST (PATTERN (vselect_insn)) = target;
45599 icode = recog_memoized (vselect_insn);
45600
45601 if (icode >= 0 && !testing_p)
45602 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45603
45604 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45605 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45606 INSN_CODE (vselect_insn) = -1;
45607
45608 return icode >= 0;
45609 }
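/* For example, reversing a V4SI vector corresponds to
     (set target (vec_select:V4SI op0 (parallel [3 2 1 0])))
   which is expected to match the pshufd pattern when SSE2 is enabled;
   if no pattern in the active ISA matches, nothing is emitted and
   false is returned.  */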
45610
45611 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45612
45613 static bool
45614 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45615 const unsigned char *perm, unsigned nelt,
45616 bool testing_p)
45617 {
45618 machine_mode v2mode;
45619 rtx x;
45620 bool ok;
45621
45622 if (vselect_insn == NULL_RTX)
45623 init_vselect_insn ();
45624
45625 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
45626 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45627 PUT_MODE (x, v2mode);
45628 XEXP (x, 0) = op0;
45629 XEXP (x, 1) = op1;
45630 ok = expand_vselect (target, x, perm, nelt, testing_p);
45631 XEXP (x, 0) = const0_rtx;
45632 XEXP (x, 1) = const0_rtx;
45633 return ok;
45634 }
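/* For example, an interleave-low of two V4SF operands is expressed as
     (set target (vec_select:V4SF (vec_concat:V8SF op0 op1)
                                  (parallel [0 4 1 5])))
   which corresponds to the unpcklps pattern.  */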
45635
45636 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45637 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45638
45639 static bool
45640 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45641 {
45642 machine_mode mmode, vmode = d->vmode;
45643 unsigned i, mask, nelt = d->nelt;
45644 rtx target, op0, op1, maskop, x;
45645 rtx rperm[32], vperm;
45646
45647 if (d->one_operand_p)
45648 return false;
45649 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45650 && (TARGET_AVX512BW
45651 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45652 ;
45653 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45654 ;
45655 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45656 ;
45657 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45658 ;
45659 else
45660 return false;
45661
45662 /* This is a blend, not a permute. Elements must stay in their
45663 respective lanes. */
45664 for (i = 0; i < nelt; ++i)
45665 {
45666 unsigned e = d->perm[i];
45667 if (!(e == i || e == i + nelt))
45668 return false;
45669 }
45670
45671 if (d->testing_p)
45672 return true;
45673
45674 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45675 decision should be extracted elsewhere, so that we only try that
45676 sequence once all budget==3 options have been tried. */
45677 target = d->target;
45678 op0 = d->op0;
45679 op1 = d->op1;
45680 mask = 0;
45681
45682 switch (vmode)
45683 {
45684 case V8DFmode:
45685 case V16SFmode:
45686 case V4DFmode:
45687 case V8SFmode:
45688 case V2DFmode:
45689 case V4SFmode:
45690 case V8HImode:
45691 case V8SImode:
45692 case V32HImode:
45693 case V64QImode:
45694 case V16SImode:
45695 case V8DImode:
45696 for (i = 0; i < nelt; ++i)
45697 mask |= (d->perm[i] >= nelt) << i;
45698 break;
45699
45700 case V2DImode:
45701 for (i = 0; i < 2; ++i)
45702 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45703 vmode = V8HImode;
45704 goto do_subreg;
45705
45706 case V4SImode:
45707 for (i = 0; i < 4; ++i)
45708 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45709 vmode = V8HImode;
45710 goto do_subreg;
45711
45712 case V16QImode:
45713 /* See if bytes move in pairs so we can use pblendw with
45714 an immediate argument, rather than pblendvb with a vector
45715 argument. */
45716 for (i = 0; i < 16; i += 2)
45717 if (d->perm[i] + 1 != d->perm[i + 1])
45718 {
45719 use_pblendvb:
45720 for (i = 0; i < nelt; ++i)
45721 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45722
45723 finish_pblendvb:
45724 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45725 vperm = force_reg (vmode, vperm);
45726
45727 if (GET_MODE_SIZE (vmode) == 16)
45728 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45729 else
45730 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45731 if (target != d->target)
45732 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45733 return true;
45734 }
45735
45736 for (i = 0; i < 8; ++i)
45737 mask |= (d->perm[i * 2] >= 16) << i;
45738 vmode = V8HImode;
45739 /* FALLTHRU */
45740
45741 do_subreg:
45742 target = gen_reg_rtx (vmode);
45743 op0 = gen_lowpart (vmode, op0);
45744 op1 = gen_lowpart (vmode, op1);
45745 break;
45746
45747 case V32QImode:
45748 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45749 for (i = 0; i < 32; i += 2)
45750 if (d->perm[i] + 1 != d->perm[i + 1])
45751 goto use_pblendvb;
45752 /* See if bytes move in quadruplets. If yes, vpblendd
45753 with immediate can be used. */
45754 for (i = 0; i < 32; i += 4)
45755 if (d->perm[i] + 2 != d->perm[i + 2])
45756 break;
45757 if (i < 32)
45758 {
45759 /* See if bytes move the same in both lanes. If yes,
45760 vpblendw with immediate can be used. */
45761 for (i = 0; i < 16; i += 2)
45762 if (d->perm[i] + 16 != d->perm[i + 16])
45763 goto use_pblendvb;
45764
45765 /* Use vpblendw. */
45766 for (i = 0; i < 16; ++i)
45767 mask |= (d->perm[i * 2] >= 32) << i;
45768 vmode = V16HImode;
45769 goto do_subreg;
45770 }
45771
45772 /* Use vpblendd. */
45773 for (i = 0; i < 8; ++i)
45774 mask |= (d->perm[i * 4] >= 32) << i;
45775 vmode = V8SImode;
45776 goto do_subreg;
45777
45778 case V16HImode:
45779 /* See if words move in pairs. If yes, vpblendd can be used. */
45780 for (i = 0; i < 16; i += 2)
45781 if (d->perm[i] + 1 != d->perm[i + 1])
45782 break;
45783 if (i < 16)
45784 {
45785 /* See if words move the same in both lanes. If not,
45786 vpblendvb must be used. */
45787 for (i = 0; i < 8; i++)
45788 if (d->perm[i] + 8 != d->perm[i + 8])
45789 {
45790 /* Use vpblendvb. */
45791 for (i = 0; i < 32; ++i)
45792 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45793
45794 vmode = V32QImode;
45795 nelt = 32;
45796 target = gen_reg_rtx (vmode);
45797 op0 = gen_lowpart (vmode, op0);
45798 op1 = gen_lowpart (vmode, op1);
45799 goto finish_pblendvb;
45800 }
45801
45802 /* Use vpblendw. */
45803 for (i = 0; i < 16; ++i)
45804 mask |= (d->perm[i] >= 16) << i;
45805 break;
45806 }
45807
45808 /* Use vpblendd. */
45809 for (i = 0; i < 8; ++i)
45810 mask |= (d->perm[i * 2] >= 16) << i;
45811 vmode = V8SImode;
45812 goto do_subreg;
45813
45814 case V4DImode:
45815 /* Use vpblendd. */
45816 for (i = 0; i < 4; ++i)
45817 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45818 vmode = V8SImode;
45819 goto do_subreg;
45820
45821 default:
45822 gcc_unreachable ();
45823 }
45824
45825 switch (vmode)
45826 {
45827 case V8DFmode:
45828 case V8DImode:
45829 mmode = QImode;
45830 break;
45831 case V16SFmode:
45832 case V16SImode:
45833 mmode = HImode;
45834 break;
45835 case V32HImode:
45836 mmode = SImode;
45837 break;
45838 case V64QImode:
45839 mmode = DImode;
45840 break;
45841 default:
45842 mmode = VOIDmode;
45843 }
45844
45845 if (mmode != VOIDmode)
45846 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45847 else
45848 maskop = GEN_INT (mask);
45849
45850 /* This matches five different patterns with the different modes. */
45851 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45852 x = gen_rtx_SET (target, x);
45853 emit_insn (x);
45854 if (target != d->target)
45855 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45856
45857 return true;
45858 }
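/* A worked example of the immediate mask: for a V4SF blend with
   perm = {0, 5, 2, 7}, elements 1 and 3 come from op1, so
     mask = (1 << 1) | (1 << 3) = 0xa
   and the emitted vec_merge is expected to match blendps with
   immediate 0xa.  */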
45859
45860 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45861 in terms of the variable form of vpermilps.
45862
45863 Note that we will have already failed the immediate input vpermilps,
45864 which requires that the high and low part shuffle be identical; the
45865 variable form doesn't require that. */
45866
45867 static bool
45868 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45869 {
45870 rtx rperm[8], vperm;
45871 unsigned i;
45872
45873 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45874 return false;
45875
45876 /* We can only permute within each 128-bit lane. */
45877 for (i = 0; i < 8; ++i)
45878 {
45879 unsigned e = d->perm[i];
45880 if (i < 4 ? e >= 4 : e < 4)
45881 return false;
45882 }
45883
45884 if (d->testing_p)
45885 return true;
45886
45887 for (i = 0; i < 8; ++i)
45888 {
45889 unsigned e = d->perm[i];
45890
45891 /* Within each 128-bit lane, the elements of op0 are numbered
45892 from 0 and the elements of op1 are numbered from 4. */
45893 if (e >= 8 + 4)
45894 e -= 8;
45895 else if (e >= 4)
45896 e -= 4;
45897
45898 rperm[i] = GEN_INT (e);
45899 }
45900
45901 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45902 vperm = force_reg (V8SImode, vperm);
45903 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45904
45905 return true;
45906 }
45907
45908 /* Return true if permutation D can be performed as a VMODE permutation
45909 instead. */
45910
45911 static bool
45912 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45913 {
45914 unsigned int i, j, chunk;
45915
45916 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45917 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45918 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45919 return false;
45920
45921 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45922 return true;
45923
45924 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45925 for (i = 0; i < d->nelt; i += chunk)
45926 if (d->perm[i] & (chunk - 1))
45927 return false;
45928 else
45929 for (j = 1; j < chunk; ++j)
45930 if (d->perm[i] + j != d->perm[i + j])
45931 return false;
45932
45933 return true;
45934 }
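/* For example, the V16QImode permutation
     {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}
   moves bytes in aligned pairs, so it is also valid as the V8HImode
   permutation {1,0, 3,2, 5,4, 7,6} and the wider mode can be used
   instead.  */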
45935
45936 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45937 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45938
45939 static bool
45940 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45941 {
45942 unsigned i, nelt, eltsz, mask;
45943 unsigned char perm[64];
45944 machine_mode vmode = V16QImode;
45945 rtx rperm[64], vperm, target, op0, op1;
45946
45947 nelt = d->nelt;
45948
45949 if (!d->one_operand_p)
45950 {
45951 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45952 {
45953 if (TARGET_AVX2
45954 && valid_perm_using_mode_p (V2TImode, d))
45955 {
45956 if (d->testing_p)
45957 return true;
45958
45959 /* Use vperm2i128 insn. The pattern uses
45960 V4DImode instead of V2TImode. */
45961 target = d->target;
45962 if (d->vmode != V4DImode)
45963 target = gen_reg_rtx (V4DImode);
45964 op0 = gen_lowpart (V4DImode, d->op0);
45965 op1 = gen_lowpart (V4DImode, d->op1);
45966 rperm[0]
45967 = GEN_INT ((d->perm[0] / (nelt / 2))
45968 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45969 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45970 if (target != d->target)
45971 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45972 return true;
45973 }
45974 return false;
45975 }
45976 }
45977 else
45978 {
45979 if (GET_MODE_SIZE (d->vmode) == 16)
45980 {
45981 if (!TARGET_SSSE3)
45982 return false;
45983 }
45984 else if (GET_MODE_SIZE (d->vmode) == 32)
45985 {
45986 if (!TARGET_AVX2)
45987 return false;
45988
45989 /* V4DImode should be already handled through
45990 expand_vselect by vpermq instruction. */
45991 gcc_assert (d->vmode != V4DImode);
45992
45993 vmode = V32QImode;
45994 if (d->vmode == V8SImode
45995 || d->vmode == V16HImode
45996 || d->vmode == V32QImode)
45997 {
45998 /* First see if vpermq can be used for
45999 V8SImode/V16HImode/V32QImode. */
46000 if (valid_perm_using_mode_p (V4DImode, d))
46001 {
46002 for (i = 0; i < 4; i++)
46003 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46004 if (d->testing_p)
46005 return true;
46006 target = gen_reg_rtx (V4DImode);
46007 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46008 perm, 4, false))
46009 {
46010 emit_move_insn (d->target,
46011 gen_lowpart (d->vmode, target));
46012 return true;
46013 }
46014 return false;
46015 }
46016
46017 /* Next see if vpermd can be used. */
46018 if (valid_perm_using_mode_p (V8SImode, d))
46019 vmode = V8SImode;
46020 }
46021 /* Or if vpermps can be used. */
46022 else if (d->vmode == V8SFmode)
46023 vmode = V8SImode;
46024
46025 if (vmode == V32QImode)
46026 {
46027 /* vpshufb only works within 128-bit lanes; it is not
46028 possible to shuffle bytes between the lanes. */
46029 for (i = 0; i < nelt; ++i)
46030 if ((d->perm[i] ^ i) & (nelt / 2))
46031 return false;
46032 }
46033 }
46034 else if (GET_MODE_SIZE (d->vmode) == 64)
46035 {
46036 if (!TARGET_AVX512BW)
46037 return false;
46038
46039 /* If vpermq didn't work, vpshufb won't work either. */
46040 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46041 return false;
46042
46043 vmode = V64QImode;
46044 if (d->vmode == V16SImode
46045 || d->vmode == V32HImode
46046 || d->vmode == V64QImode)
46047 {
46048 /* First see if vpermq can be used for
46049 V16SImode/V32HImode/V64QImode. */
46050 if (valid_perm_using_mode_p (V8DImode, d))
46051 {
46052 for (i = 0; i < 8; i++)
46053 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46054 if (d->testing_p)
46055 return true;
46056 target = gen_reg_rtx (V8DImode);
46057 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46058 perm, 8, false))
46059 {
46060 emit_move_insn (d->target,
46061 gen_lowpart (d->vmode, target));
46062 return true;
46063 }
46064 return false;
46065 }
46066
46067 /* Next see if vpermd can be used. */
46068 if (valid_perm_using_mode_p (V16SImode, d))
46069 vmode = V16SImode;
46070 }
46071 /* Or if vpermps can be used. */
46072 else if (d->vmode == V16SFmode)
46073 vmode = V16SImode;
46074 if (vmode == V64QImode)
46075 {
46076 /* vpshufb only works within 128-bit lanes; it is not
46077 possible to shuffle bytes between the lanes. */
46078 for (i = 0; i < nelt; ++i)
46079 if ((d->perm[i] ^ i) & (nelt / 4))
46080 return false;
46081 }
46082 }
46083 else
46084 return false;
46085 }
46086
46087 if (d->testing_p)
46088 return true;
46089
46090 if (vmode == V8SImode)
46091 for (i = 0; i < 8; ++i)
46092 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46093 else if (vmode == V16SImode)
46094 for (i = 0; i < 16; ++i)
46095 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46096 else
46097 {
46098 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46099 if (!d->one_operand_p)
46100 mask = 2 * nelt - 1;
46101 else if (vmode == V16QImode)
46102 mask = nelt - 1;
46103 else if (vmode == V64QImode)
46104 mask = nelt / 4 - 1;
46105 else
46106 mask = nelt / 2 - 1;
46107
46108 for (i = 0; i < nelt; ++i)
46109 {
46110 unsigned j, e = d->perm[i] & mask;
46111 for (j = 0; j < eltsz; ++j)
46112 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46113 }
46114 }
46115
46116 vperm = gen_rtx_CONST_VECTOR (vmode,
46117 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46118 vperm = force_reg (vmode, vperm);
46119
46120 target = d->target;
46121 if (d->vmode != vmode)
46122 target = gen_reg_rtx (vmode);
46123 op0 = gen_lowpart (vmode, d->op0);
46124 if (d->one_operand_p)
46125 {
46126 if (vmode == V16QImode)
46127 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46128 else if (vmode == V32QImode)
46129 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46130 else if (vmode == V64QImode)
46131 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46132 else if (vmode == V8SFmode)
46133 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46134 else if (vmode == V8SImode)
46135 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46136 else if (vmode == V16SFmode)
46137 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46138 else if (vmode == V16SImode)
46139 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46140 else
46141 gcc_unreachable ();
46142 }
46143 else
46144 {
46145 op1 = gen_lowpart (vmode, d->op1);
46146 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46147 }
46148 if (target != d->target)
46149 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46150
46151 return true;
46152 }
46153
46154 /* For V*[QHS]Imode permutations, check whether the same permutation
46155 can be performed in a 2x, 4x or 8x wider inner mode. */
46156
46157 static bool
46158 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46159 struct expand_vec_perm_d *nd)
46160 {
46161 int i;
46162 enum machine_mode mode = VOIDmode;
46163
46164 switch (d->vmode)
46165 {
46166 case V16QImode: mode = V8HImode; break;
46167 case V32QImode: mode = V16HImode; break;
46168 case V64QImode: mode = V32HImode; break;
46169 case V8HImode: mode = V4SImode; break;
46170 case V16HImode: mode = V8SImode; break;
46171 case V32HImode: mode = V16SImode; break;
46172 case V4SImode: mode = V2DImode; break;
46173 case V8SImode: mode = V4DImode; break;
46174 case V16SImode: mode = V8DImode; break;
46175 default: return false;
46176 }
46177 for (i = 0; i < d->nelt; i += 2)
46178 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46179 return false;
46180 nd->vmode = mode;
46181 nd->nelt = d->nelt / 2;
46182 for (i = 0; i < nd->nelt; i++)
46183 nd->perm[i] = d->perm[2 * i] / 2;
46184 if (GET_MODE_INNER (mode) != DImode)
46185 canonicalize_vector_int_perm (nd, nd);
46186 if (nd != d)
46187 {
46188 nd->one_operand_p = d->one_operand_p;
46189 nd->testing_p = d->testing_p;
46190 if (d->op0 == d->op1)
46191 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46192 else
46193 {
46194 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46195 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46196 }
46197 if (d->testing_p)
46198 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46199 else
46200 nd->target = gen_reg_rtx (nd->vmode);
46201 }
46202 return true;
46203 }
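/* For example, the V16QImode permutation
     {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}
   moves bytes in aligned pairs and quads, so it canonicalizes first to
   the V8HImode permutation {2,3, 0,1, 6,7, 4,5} and then to the
   V4SImode permutation {1,0, 3,2}, where a single pshufd can do the
   job.  */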
46204
46205 /* Try to expand one-operand permutation with constant mask. */
46206
46207 static bool
46208 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46209 {
46210 machine_mode mode = GET_MODE (d->op0);
46211 machine_mode maskmode = mode;
46212 rtx (*gen) (rtx, rtx, rtx) = NULL;
46213 rtx target, op0, mask;
46214 rtx vec[64];
46215
46216 if (!rtx_equal_p (d->op0, d->op1))
46217 return false;
46218
46219 if (!TARGET_AVX512F)
46220 return false;
46221
46222 switch (mode)
46223 {
46224 case V16SImode:
46225 gen = gen_avx512f_permvarv16si;
46226 break;
46227 case V16SFmode:
46228 gen = gen_avx512f_permvarv16sf;
46229 maskmode = V16SImode;
46230 break;
46231 case V8DImode:
46232 gen = gen_avx512f_permvarv8di;
46233 break;
46234 case V8DFmode:
46235 gen = gen_avx512f_permvarv8df;
46236 maskmode = V8DImode;
46237 break;
46238 default:
46239 return false;
46240 }
46241
46242 target = d->target;
46243 op0 = d->op0;
46244 for (int i = 0; i < d->nelt; ++i)
46245 vec[i] = GEN_INT (d->perm[i]);
46246 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46247 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46248 return true;
46249 }
46250
46251 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46252 in a single instruction. */
46253
46254 static bool
46255 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46256 {
46257 unsigned i, nelt = d->nelt;
46258 struct expand_vec_perm_d nd;
46259
46260 /* Check plain VEC_SELECT first, because AVX has instructions that could
46261 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46262 input where SEL+CONCAT may not. */
46263 if (d->one_operand_p)
46264 {
46265 int mask = nelt - 1;
46266 bool identity_perm = true;
46267 bool broadcast_perm = true;
46268
46269 for (i = 0; i < nelt; i++)
46270 {
46271 nd.perm[i] = d->perm[i] & mask;
46272 if (nd.perm[i] != i)
46273 identity_perm = false;
46274 if (nd.perm[i])
46275 broadcast_perm = false;
46276 }
46277
46278 if (identity_perm)
46279 {
46280 if (!d->testing_p)
46281 emit_move_insn (d->target, d->op0);
46282 return true;
46283 }
46284 else if (broadcast_perm && TARGET_AVX2)
46285 {
46286 /* Use vpbroadcast{b,w,d}. */
46287 rtx (*gen) (rtx, rtx) = NULL;
46288 switch (d->vmode)
46289 {
46290 case V64QImode:
46291 if (TARGET_AVX512BW)
46292 gen = gen_avx512bw_vec_dupv64qi_1;
46293 break;
46294 case V32QImode:
46295 gen = gen_avx2_pbroadcastv32qi_1;
46296 break;
46297 case V32HImode:
46298 if (TARGET_AVX512BW)
46299 gen = gen_avx512bw_vec_dupv32hi_1;
46300 break;
46301 case V16HImode:
46302 gen = gen_avx2_pbroadcastv16hi_1;
46303 break;
46304 case V16SImode:
46305 if (TARGET_AVX512F)
46306 gen = gen_avx512f_vec_dupv16si_1;
46307 break;
46308 case V8SImode:
46309 gen = gen_avx2_pbroadcastv8si_1;
46310 break;
46311 case V16QImode:
46312 gen = gen_avx2_pbroadcastv16qi;
46313 break;
46314 case V8HImode:
46315 gen = gen_avx2_pbroadcastv8hi;
46316 break;
46317 case V16SFmode:
46318 if (TARGET_AVX512F)
46319 gen = gen_avx512f_vec_dupv16sf_1;
46320 break;
46321 case V8SFmode:
46322 gen = gen_avx2_vec_dupv8sf_1;
46323 break;
46324 case V8DFmode:
46325 if (TARGET_AVX512F)
46326 gen = gen_avx512f_vec_dupv8df_1;
46327 break;
46328 case V8DImode:
46329 if (TARGET_AVX512F)
46330 gen = gen_avx512f_vec_dupv8di_1;
46331 break;
46332 /* For other modes prefer other shuffles this function creates. */
46333 default: break;
46334 }
46335 if (gen != NULL)
46336 {
46337 if (!d->testing_p)
46338 emit_insn (gen (d->target, d->op0));
46339 return true;
46340 }
46341 }
46342
46343 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46344 return true;
46345
46346 /* There are plenty of patterns in sse.md that are written for
46347 SEL+CONCAT and are not replicated for a single op. Perhaps
46348 that should be changed, to avoid the nastiness here. */
46349
46350 /* Recognize interleave style patterns, which means incrementing
46351 every other permutation operand. */
46352 for (i = 0; i < nelt; i += 2)
46353 {
46354 nd.perm[i] = d->perm[i] & mask;
46355 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46356 }
46357 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46358 d->testing_p))
46359 return true;
46360
46361 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46362 if (nelt >= 4)
46363 {
46364 for (i = 0; i < nelt; i += 4)
46365 {
46366 nd.perm[i + 0] = d->perm[i + 0] & mask;
46367 nd.perm[i + 1] = d->perm[i + 1] & mask;
46368 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46369 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46370 }
46371
46372 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46373 d->testing_p))
46374 return true;
46375 }
46376 }
46377
46378 /* Finally, try the fully general two operand permute. */
46379 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46380 d->testing_p))
46381 return true;
46382
46383 /* Recognize interleave style patterns with reversed operands. */
46384 if (!d->one_operand_p)
46385 {
46386 for (i = 0; i < nelt; ++i)
46387 {
46388 unsigned e = d->perm[i];
46389 if (e >= nelt)
46390 e -= nelt;
46391 else
46392 e += nelt;
46393 nd.perm[i] = e;
46394 }
46395
46396 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46397 d->testing_p))
46398 return true;
46399 }
46400
46401 /* Try the SSE4.1 blend variable merge instructions. */
46402 if (expand_vec_perm_blend (d))
46403 return true;
46404
46405 /* Try one of the AVX vpermil variable permutations. */
46406 if (expand_vec_perm_vpermil (d))
46407 return true;
46408
46409 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46410 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46411 if (expand_vec_perm_pshufb (d))
46412 return true;
46413
46414 /* Try the AVX2 vpalignr instruction. */
46415 if (expand_vec_perm_palignr (d, true))
46416 return true;
46417
46418 /* Try the AVX512F vperm{s,d} instructions. */
46419 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46420 return true;
46421
46422 /* Try the AVX512F vpermi2 instructions. */
46423 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46424 return true;
46425
46426 /* See if we can get the same permutation in different vector integer
46427 mode. */
46428 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46429 {
46430 if (!d->testing_p)
46431 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46432 return true;
46433 }
46434 return false;
46435 }
46436
46437 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46438 in terms of a pair of pshuflw + pshufhw instructions. */
46439
46440 static bool
46441 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46442 {
46443 unsigned char perm2[MAX_VECT_LEN];
46444 unsigned i;
46445 bool ok;
46446
46447 if (d->vmode != V8HImode || !d->one_operand_p)
46448 return false;
46449
46450 /* The two permutations only operate in 64-bit lanes. */
46451 for (i = 0; i < 4; ++i)
46452 if (d->perm[i] >= 4)
46453 return false;
46454 for (i = 4; i < 8; ++i)
46455 if (d->perm[i] < 4)
46456 return false;
46457
46458 if (d->testing_p)
46459 return true;
46460
46461 /* Emit the pshuflw. */
46462 memcpy (perm2, d->perm, 4);
46463 for (i = 4; i < 8; ++i)
46464 perm2[i] = i;
46465 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46466 gcc_assert (ok);
46467
46468 /* Emit the pshufhw. */
46469 memcpy (perm2 + 4, d->perm + 4, 4);
46470 for (i = 0; i < 4; ++i)
46471 perm2[i] = i;
46472 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46473 gcc_assert (ok);
46474
46475 return true;
46476 }
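/* For example, the V8HImode permutation {3,2,1,0, 7,6,5,4} keeps the
   low four words in the low quadword and the high four in the high
   quadword, so it expands to a pshuflw reversing the low quadword
   followed by a pshufhw reversing the high quadword.  */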
46477
46478 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46479 the permutation using the SSSE3 palignr instruction. This succeeds
46480 when all of the elements in PERM fit within one vector and we merely
46481 need to shift them down so that a single vector permutation has a
46482 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
46483 the vpalignr instruction itself can perform the requested permutation. */
46484
46485 static bool
46486 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46487 {
46488 unsigned i, nelt = d->nelt;
46489 unsigned min, max, minswap, maxswap;
46490 bool in_order, ok, swap = false;
46491 rtx shift, target;
46492 struct expand_vec_perm_d dcopy;
46493
46494 /* Even with AVX, palignr only operates on 128-bit vectors;
46495 with AVX2, palignr operates on both 128-bit lanes. */
46496 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46497 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46498 return false;
46499
46500 min = 2 * nelt;
46501 max = 0;
46502 minswap = 2 * nelt;
46503 maxswap = 0;
46504 for (i = 0; i < nelt; ++i)
46505 {
46506 unsigned e = d->perm[i];
46507 unsigned eswap = d->perm[i] ^ nelt;
46508 if (GET_MODE_SIZE (d->vmode) == 32)
46509 {
46510 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46511 eswap = e ^ (nelt / 2);
46512 }
46513 if (e < min)
46514 min = e;
46515 if (e > max)
46516 max = e;
46517 if (eswap < minswap)
46518 minswap = eswap;
46519 if (eswap > maxswap)
46520 maxswap = eswap;
46521 }
46522 if (min == 0
46523 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46524 {
46525 if (d->one_operand_p
46526 || minswap == 0
46527 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46528 ? nelt / 2 : nelt))
46529 return false;
46530 swap = true;
46531 min = minswap;
46532 max = maxswap;
46533 }
46534
46535 /* Given that we have SSSE3, we know we'll be able to implement the
46536 single operand permutation after the palignr with pshufb for
46537 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46538 first. */
46539 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46540 return true;
46541
46542 dcopy = *d;
46543 if (swap)
46544 {
46545 dcopy.op0 = d->op1;
46546 dcopy.op1 = d->op0;
46547 for (i = 0; i < nelt; ++i)
46548 dcopy.perm[i] ^= nelt;
46549 }
46550
46551 in_order = true;
46552 for (i = 0; i < nelt; ++i)
46553 {
46554 unsigned e = dcopy.perm[i];
46555 if (GET_MODE_SIZE (d->vmode) == 32
46556 && e >= nelt
46557 && (e & (nelt / 2 - 1)) < min)
46558 e = e - min - (nelt / 2);
46559 else
46560 e = e - min;
46561 if (e != i)
46562 in_order = false;
46563 dcopy.perm[i] = e;
46564 }
46565 dcopy.one_operand_p = true;
46566
46567 if (single_insn_only_p && !in_order)
46568 return false;
46569
46570 /* For AVX2, test whether we can permute the result in one instruction. */
46571 if (d->testing_p)
46572 {
46573 if (in_order)
46574 return true;
46575 dcopy.op1 = dcopy.op0;
46576 return expand_vec_perm_1 (&dcopy);
46577 }
46578
46579 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46580 if (GET_MODE_SIZE (d->vmode) == 16)
46581 {
46582 target = gen_reg_rtx (TImode);
46583 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46584 gen_lowpart (TImode, dcopy.op0), shift));
46585 }
46586 else
46587 {
46588 target = gen_reg_rtx (V2TImode);
46589 emit_insn (gen_avx2_palignrv2ti (target,
46590 gen_lowpart (V2TImode, dcopy.op1),
46591 gen_lowpart (V2TImode, dcopy.op0),
46592 shift));
46593 }
46594
46595 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46596
46597 /* Test for the degenerate case where the alignment by itself
46598 produces the desired permutation. */
46599 if (in_order)
46600 {
46601 emit_move_insn (d->target, dcopy.op0);
46602 return true;
46603 }
46604
46605 ok = expand_vec_perm_1 (&dcopy);
46606 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46607
46608 return ok;
46609 }
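/* For example, the two-operand V16QImode permutation {3, 4, ..., 18}
   has min == 3 and max - min == 15 < nelt, so a single palignr by
   3 bytes brings the selected window into one vector; the remaining
   permutation is the identity (in_order), so no pshufb is needed.  */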
46610
46611 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46612 the permutation using the SSE4_1 pblendv instruction. Potentially
46613 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
46614
46615 static bool
46616 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46617 {
46618 unsigned i, which, nelt = d->nelt;
46619 struct expand_vec_perm_d dcopy, dcopy1;
46620 machine_mode vmode = d->vmode;
46621 bool ok;
46622
46623 /* Use the same checks as in expand_vec_perm_blend. */
46624 if (d->one_operand_p)
46625 return false;
46626 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46627 ;
46628 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46629 ;
46630 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46631 ;
46632 else
46633 return false;
46634
46635 /* Figure out which permutation elements do not stay in their
46636 respective lanes. */
46637 for (i = 0, which = 0; i < nelt; ++i)
46638 {
46639 unsigned e = d->perm[i];
46640 if (e != i)
46641 which |= (e < nelt ? 1 : 2);
46642 }
46643 /* We can pblend the part where elements do not stay in their
46644 respective lanes only when these elements all come from the same
46645 half of the permutation.
46646 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
46647 lanes, but both are >= 8.
46648 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
46649 respective lanes, and while 8 >= 8, 2 is not. */
46650 if (which != 1 && which != 2)
46651 return false;
46652 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46653 return true;
46654
46655 /* First we apply a one-operand permutation to the part whose
46656 elements do not stay in their respective lanes. */
46657 dcopy = *d;
46658 if (which == 2)
46659 dcopy.op0 = dcopy.op1 = d->op1;
46660 else
46661 dcopy.op0 = dcopy.op1 = d->op0;
46662 if (!d->testing_p)
46663 dcopy.target = gen_reg_rtx (vmode);
46664 dcopy.one_operand_p = true;
46665
46666 for (i = 0; i < nelt; ++i)
46667 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46668
46669 ok = expand_vec_perm_1 (&dcopy);
46670 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46671 return false;
46672 else
46673 gcc_assert (ok);
46674 if (d->testing_p)
46675 return true;
46676
46677 /* Next we put permuted elements into their positions. */
46678 dcopy1 = *d;
46679 if (which == 2)
46680 dcopy1.op1 = dcopy.target;
46681 else
46682 dcopy1.op0 = dcopy.target;
46683
46684 for (i = 0; i < nelt; ++i)
46685 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46686
46687 ok = expand_vec_perm_blend (&dcopy1);
46688 gcc_assert (ok);
46689
46690 return true;
46691 }
46692
46693 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46694
46695 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46696 a two vector permutation into a single vector permutation by using
46697 an interleave operation to merge the vectors. */
46698
46699 static bool
46700 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46701 {
46702 struct expand_vec_perm_d dremap, dfinal;
46703 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46704 unsigned HOST_WIDE_INT contents;
46705 unsigned char remap[2 * MAX_VECT_LEN];
46706 rtx_insn *seq;
46707 bool ok, same_halves = false;
46708
46709 if (GET_MODE_SIZE (d->vmode) == 16)
46710 {
46711 if (d->one_operand_p)
46712 return false;
46713 }
46714 else if (GET_MODE_SIZE (d->vmode) == 32)
46715 {
46716 if (!TARGET_AVX)
46717 return false;
46718 /* For 32-byte modes allow even d->one_operand_p.
46719 The lack of cross-lane shuffling in some instructions
46720 might prevent a single insn shuffle. */
46721 dfinal = *d;
46722 dfinal.testing_p = true;
46723 /* If expand_vec_perm_interleave3 can expand this into
46724 a 3 insn sequence, give up and let it be expanded as
46725 a 3 insn sequence. While that is one insn longer, it
46726 doesn't need a memory operand, and in the common case
46727 that both the interleave low and interleave high
46728 permutations with the same operands are adjacent, it
46729 needs only 4 insns for both after CSE. */
46730 if (expand_vec_perm_interleave3 (&dfinal))
46731 return false;
46732 }
46733 else
46734 return false;
46735
46736 /* Examine from whence the elements come. */
46737 contents = 0;
46738 for (i = 0; i < nelt; ++i)
46739 contents |= HOST_WIDE_INT_1U << d->perm[i];
46740
46741 memset (remap, 0xff, sizeof (remap));
46742 dremap = *d;
46743
46744 if (GET_MODE_SIZE (d->vmode) == 16)
46745 {
46746 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46747
46748 /* Split the two input vectors into 4 halves. */
46749 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46750 h2 = h1 << nelt2;
46751 h3 = h2 << nelt2;
46752 h4 = h3 << nelt2;
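/* For V8HImode, for instance, h1 is 0x000f (low half of op0), h2 is
   0x00f0 (high half of op0), h3 is 0x0f00 (low half of op1) and h4 is
   0xf000 (high half of op1).  */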
46753
46754 /* If the elements all come from the low halves, use interleave low;
46755 similarly use interleave high for the high halves. If the elements are from mis-matched halves, we
46756 can use shufps for V4SF/V4SI or do a DImode shuffle. */
46757 if ((contents & (h1 | h3)) == contents)
46758 {
46759 /* punpckl* */
46760 for (i = 0; i < nelt2; ++i)
46761 {
46762 remap[i] = i * 2;
46763 remap[i + nelt] = i * 2 + 1;
46764 dremap.perm[i * 2] = i;
46765 dremap.perm[i * 2 + 1] = i + nelt;
46766 }
46767 if (!TARGET_SSE2 && d->vmode == V4SImode)
46768 dremap.vmode = V4SFmode;
46769 }
46770 else if ((contents & (h2 | h4)) == contents)
46771 {
46772 /* punpckh* */
46773 for (i = 0; i < nelt2; ++i)
46774 {
46775 remap[i + nelt2] = i * 2;
46776 remap[i + nelt + nelt2] = i * 2 + 1;
46777 dremap.perm[i * 2] = i + nelt2;
46778 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46779 }
46780 if (!TARGET_SSE2 && d->vmode == V4SImode)
46781 dremap.vmode = V4SFmode;
46782 }
46783 else if ((contents & (h1 | h4)) == contents)
46784 {
46785 /* shufps */
46786 for (i = 0; i < nelt2; ++i)
46787 {
46788 remap[i] = i;
46789 remap[i + nelt + nelt2] = i + nelt2;
46790 dremap.perm[i] = i;
46791 dremap.perm[i + nelt2] = i + nelt + nelt2;
46792 }
46793 if (nelt != 4)
46794 {
46795 /* shufpd */
46796 dremap.vmode = V2DImode;
46797 dremap.nelt = 2;
46798 dremap.perm[0] = 0;
46799 dremap.perm[1] = 3;
46800 }
46801 }
46802 else if ((contents & (h2 | h3)) == contents)
46803 {
46804 /* shufps */
46805 for (i = 0; i < nelt2; ++i)
46806 {
46807 remap[i + nelt2] = i;
46808 remap[i + nelt] = i + nelt2;
46809 dremap.perm[i] = i + nelt2;
46810 dremap.perm[i + nelt2] = i + nelt;
46811 }
46812 if (nelt != 4)
46813 {
46814 /* shufpd */
46815 dremap.vmode = V2DImode;
46816 dremap.nelt = 2;
46817 dremap.perm[0] = 1;
46818 dremap.perm[1] = 2;
46819 }
46820 }
46821 else
46822 return false;
46823 }
46824 else
46825 {
46826 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46827 unsigned HOST_WIDE_INT q[8];
46828 unsigned int nonzero_halves[4];
46829
46830 /* Split the two input vectors into 8 quarters. */
46831 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46832 for (i = 1; i < 8; ++i)
46833 q[i] = q[0] << (nelt4 * i);
46834 for (i = 0; i < 4; ++i)
46835 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46836 {
46837 nonzero_halves[nzcnt] = i;
46838 ++nzcnt;
46839 }
46840
46841 if (nzcnt == 1)
46842 {
46843 gcc_assert (d->one_operand_p);
46844 nonzero_halves[1] = nonzero_halves[0];
46845 same_halves = true;
46846 }
46847 else if (d->one_operand_p)
46848 {
46849 gcc_assert (nonzero_halves[0] == 0);
46850 gcc_assert (nonzero_halves[1] == 1);
46851 }
46852
46853 if (nzcnt <= 2)
46854 {
46855 if (d->perm[0] / nelt2 == nonzero_halves[1])
46856 {
46857 /* Attempt to increase the likelihood that dfinal
46858 shuffle will be intra-lane. */
46859 std::swap (nonzero_halves[0], nonzero_halves[1]);
46860 }
46861
46862 /* vperm2f128 or vperm2i128. */
46863 for (i = 0; i < nelt2; ++i)
46864 {
46865 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46866 remap[i + nonzero_halves[0] * nelt2] = i;
46867 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46868 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46869 }
46870
46871 if (d->vmode != V8SFmode
46872 && d->vmode != V4DFmode
46873 && d->vmode != V8SImode)
46874 {
46875 dremap.vmode = V8SImode;
46876 dremap.nelt = 8;
46877 for (i = 0; i < 4; ++i)
46878 {
46879 dremap.perm[i] = i + nonzero_halves[0] * 4;
46880 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46881 }
46882 }
46883 }
46884 else if (d->one_operand_p)
46885 return false;
46886 else if (TARGET_AVX2
46887 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46888 {
46889 /* vpunpckl* */
46890 for (i = 0; i < nelt4; ++i)
46891 {
46892 remap[i] = i * 2;
46893 remap[i + nelt] = i * 2 + 1;
46894 remap[i + nelt2] = i * 2 + nelt2;
46895 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46896 dremap.perm[i * 2] = i;
46897 dremap.perm[i * 2 + 1] = i + nelt;
46898 dremap.perm[i * 2 + nelt2] = i + nelt2;
46899 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46900 }
46901 }
46902 else if (TARGET_AVX2
46903 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46904 {
46905 /* vpunpckh* */
46906 for (i = 0; i < nelt4; ++i)
46907 {
46908 remap[i + nelt4] = i * 2;
46909 remap[i + nelt + nelt4] = i * 2 + 1;
46910 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46911 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46912 dremap.perm[i * 2] = i + nelt4;
46913 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46914 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46915 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46916 }
46917 }
46918 else
46919 return false;
46920 }
46921
46922 /* Use the remapping array set up above to move the elements from their
46923 swizzled locations into their final destinations. */
46924 dfinal = *d;
46925 for (i = 0; i < nelt; ++i)
46926 {
46927 unsigned e = remap[d->perm[i]];
46928 gcc_assert (e < nelt);
46929 /* If same_halves is true, both halves of the remapped vector are the
46930 same. Avoid cross-lane accesses if possible. */
46931 if (same_halves && i >= nelt2)
46932 {
46933 gcc_assert (e < nelt2);
46934 dfinal.perm[i] = e + nelt2;
46935 }
46936 else
46937 dfinal.perm[i] = e;
46938 }
46939 if (!d->testing_p)
46940 {
46941 dremap.target = gen_reg_rtx (dremap.vmode);
46942 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46943 }
46944 dfinal.op1 = dfinal.op0;
46945 dfinal.one_operand_p = true;
46946
46947 /* Test if the final remap can be done with a single insn. For V4SFmode or
46948 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46949 start_sequence ();
46950 ok = expand_vec_perm_1 (&dfinal);
46951 seq = get_insns ();
46952 end_sequence ();
46953
46954 if (!ok)
46955 return false;
46956
46957 if (d->testing_p)
46958 return true;
46959
46960 if (dremap.vmode != dfinal.vmode)
46961 {
46962 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46963 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46964 }
46965
46966 ok = expand_vec_perm_1 (&dremap);
46967 gcc_assert (ok);
46968
46969 emit_insn (seq);
46970 return true;
46971 }
46972
46973 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46974 a single vector cross-lane permutation into vpermq followed
46975 by any of the single insn permutations. */
46976
46977 static bool
46978 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46979 {
46980 struct expand_vec_perm_d dremap, dfinal;
46981 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46982 unsigned contents[2];
46983 bool ok;
46984
46985 if (!(TARGET_AVX2
46986 && (d->vmode == V32QImode || d->vmode == V16HImode)
46987 && d->one_operand_p))
46988 return false;
46989
46990 contents[0] = 0;
46991 contents[1] = 0;
46992 for (i = 0; i < nelt2; ++i)
46993 {
46994 contents[0] |= 1u << (d->perm[i] / nelt4);
46995 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46996 }
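/* Each contents[i] records which of the four 64-bit quarters of the
   source feed half i of the result; the vpermq used below can supply at
   most two distinct quarters per half, so more than two set bits means
   this strategy cannot work.  */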
46997
46998 for (i = 0; i < 2; ++i)
46999 {
47000 unsigned int cnt = 0;
47001 for (j = 0; j < 4; ++j)
47002 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47003 return false;
47004 }
47005
47006 if (d->testing_p)
47007 return true;
47008
47009 dremap = *d;
47010 dremap.vmode = V4DImode;
47011 dremap.nelt = 4;
47012 dremap.target = gen_reg_rtx (V4DImode);
47013 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47014 dremap.op1 = dremap.op0;
47015 dremap.one_operand_p = true;
47016 for (i = 0; i < 2; ++i)
47017 {
47018 unsigned int cnt = 0;
47019 for (j = 0; j < 4; ++j)
47020 if ((contents[i] & (1u << j)) != 0)
47021 dremap.perm[2 * i + cnt++] = j;
47022 for (; cnt < 2; ++cnt)
47023 dremap.perm[2 * i + cnt] = 0;
47024 }
47025
47026 dfinal = *d;
47027 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47028 dfinal.op1 = dfinal.op0;
47029 dfinal.one_operand_p = true;
47030 for (i = 0, j = 0; i < nelt; ++i)
47031 {
47032 if (i == nelt2)
47033 j = 2;
47034 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47035 if ((d->perm[i] / nelt4) == dremap.perm[j])
47036 ;
47037 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47038 dfinal.perm[i] |= nelt4;
47039 else
47040 gcc_unreachable ();
47041 }
47042
47043 ok = expand_vec_perm_1 (&dremap);
47044 gcc_assert (ok);
47045
47046 ok = expand_vec_perm_1 (&dfinal);
47047 gcc_assert (ok);
47048
47049 return true;
47050 }
47051
47052 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
47053 a vector permutation using two instructions, vperm2f128 resp.
47054 vperm2i128 followed by any single in-lane permutation. */
47055
47056 static bool
47057 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47058 {
47059 struct expand_vec_perm_d dfirst, dsecond;
47060 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47061 bool ok;
47062
47063 if (!TARGET_AVX
47064 || GET_MODE_SIZE (d->vmode) != 32
47065 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47066 return false;
47067
47068 dsecond = *d;
47069 dsecond.one_operand_p = false;
47070 dsecond.testing_p = true;
47071
47072 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47073 immediate. For perm < 16 the second permutation uses
47074 d->op0 as first operand, for perm >= 16 it uses d->op1
47075 as first operand. The second operand is the result of
47076 vperm2[fi]128. */
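/* For instance, perm 6 gives the immediate ((6 << 2) | 6) & 0x33 == 0x12,
   a vperm2[fi]128 whose low lane is the low lane of d->op1 and whose
   high lane is the high lane of d->op0.  */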
47077 for (perm = 0; perm < 32; perm++)
47078 {
47079 /* Ignore permutations which do not move anything cross-lane. */
47080 if (perm < 16)
47081 {
47082 /* The second shuffle for e.g. V4DFmode has
47083 0123 and ABCD operands.
47084 Ignore AB23, as 23 is already in the second lane
47085 of the first operand. */
47086 if ((perm & 0xc) == (1 << 2)) continue;
47087 /* And 01CD, as 01 is in the first lane of the first
47088 operand. */
47089 if ((perm & 3) == 0) continue;
47090 /* And 4567, as then the vperm2[fi]128 doesn't change
47091 anything on the original 4567 second operand. */
47092 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47093 }
47094 else
47095 {
47096 /* The second shuffle for e.g. V4DFmode has
47097 4567 and ABCD operands.
47098 Ignore AB67, as 67 is already in the second lane
47099 of the first operand. */
47100 if ((perm & 0xc) == (3 << 2)) continue;
47101 /* And 45CD, as 45 is in the first lane of the first
47102 operand. */
47103 if ((perm & 3) == 2) continue;
47104 /* And 0123, as then the vperm2[fi]128 doesn't change
47105 anything on the original 0123 first operand. */
47106 if ((perm & 0xf) == (1 << 2)) continue;
47107 }
47108
47109 for (i = 0; i < nelt; i++)
47110 {
47111 j = d->perm[i] / nelt2;
47112 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47113 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47114 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47115 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47116 else
47117 break;
47118 }
47119
47120 if (i == nelt)
47121 {
47122 start_sequence ();
47123 ok = expand_vec_perm_1 (&dsecond);
47124 end_sequence ();
47125 }
47126 else
47127 ok = false;
47128
47129 if (ok)
47130 {
47131 if (d->testing_p)
47132 return true;
47133
47134 /* Found a usable second shuffle. dfirst will be
47135 vperm2f128 on d->op0 and d->op1. */
47136 dsecond.testing_p = false;
47137 dfirst = *d;
47138 dfirst.target = gen_reg_rtx (d->vmode);
47139 for (i = 0; i < nelt; i++)
47140 dfirst.perm[i] = (i & (nelt2 - 1))
47141 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47142
47143 canonicalize_perm (&dfirst);
47144 ok = expand_vec_perm_1 (&dfirst);
47145 gcc_assert (ok);
47146
47147 /* And dsecond is some single insn shuffle, taking
47148 d->op0 and result of vperm2f128 (if perm < 16) or
47149 d->op1 and result of vperm2f128 (otherwise). */
47150 if (perm >= 16)
47151 dsecond.op0 = dsecond.op1;
47152 dsecond.op1 = dfirst.target;
47153
47154 ok = expand_vec_perm_1 (&dsecond);
47155 gcc_assert (ok);
47156
47157 return true;
47158 }
47159
47160 /* For one operand, the only useful vperm2f128 permutation is 0x01
47161 i.e. a swap of the two 128-bit lanes. */
47162 if (d->one_operand_p)
47163 return false;
47164 }
47165
47166 return false;
47167 }
47168
47169 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47170 a two vector permutation using 2 intra-lane interleave insns
47171 and cross-lane shuffle for 32-byte vectors. */
47172
47173 static bool
47174 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47175 {
47176 unsigned i, nelt;
47177 rtx (*gen) (rtx, rtx, rtx);
47178
47179 if (d->one_operand_p)
47180 return false;
47181 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47182 ;
47183 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47184 ;
47185 else
47186 return false;
47187
47188 nelt = d->nelt;
47189 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47190 return false;
47191 for (i = 0; i < nelt; i += 2)
47192 if (d->perm[i] != d->perm[0] + i / 2
47193 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47194 return false;
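/* For V8SImode this accepts exactly { 0 8 1 9 2 10 3 11 } (interleave
   low) and { 4 12 5 13 6 14 7 15 } (interleave high).  */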
47195
47196 if (d->testing_p)
47197 return true;
47198
47199 switch (d->vmode)
47200 {
47201 case V32QImode:
47202 if (d->perm[0])
47203 gen = gen_vec_interleave_highv32qi;
47204 else
47205 gen = gen_vec_interleave_lowv32qi;
47206 break;
47207 case V16HImode:
47208 if (d->perm[0])
47209 gen = gen_vec_interleave_highv16hi;
47210 else
47211 gen = gen_vec_interleave_lowv16hi;
47212 break;
47213 case V8SImode:
47214 if (d->perm[0])
47215 gen = gen_vec_interleave_highv8si;
47216 else
47217 gen = gen_vec_interleave_lowv8si;
47218 break;
47219 case V4DImode:
47220 if (d->perm[0])
47221 gen = gen_vec_interleave_highv4di;
47222 else
47223 gen = gen_vec_interleave_lowv4di;
47224 break;
47225 case V8SFmode:
47226 if (d->perm[0])
47227 gen = gen_vec_interleave_highv8sf;
47228 else
47229 gen = gen_vec_interleave_lowv8sf;
47230 break;
47231 case V4DFmode:
47232 if (d->perm[0])
47233 gen = gen_vec_interleave_highv4df;
47234 else
47235 gen = gen_vec_interleave_lowv4df;
47236 break;
47237 default:
47238 gcc_unreachable ();
47239 }
47240
47241 emit_insn (gen (d->target, d->op0, d->op1));
47242 return true;
47243 }
47244
47245 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
47246 a single vector permutation using a single intra-lane vector
47247 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47248 the non-swapped and swapped vectors together. */
47249
47250 static bool
47251 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47252 {
47253 struct expand_vec_perm_d dfirst, dsecond;
47254 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47255 rtx_insn *seq;
47256 bool ok;
47257 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47258
47259 if (!TARGET_AVX
47260 || TARGET_AVX2
47261 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47262 || !d->one_operand_p)
47263 return false;
47264
47265 dfirst = *d;
47266 for (i = 0; i < nelt; i++)
47267 dfirst.perm[i] = 0xff;
47268 for (i = 0, msk = 0; i < nelt; i++)
47269 {
47270 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47271 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47272 return false;
47273 dfirst.perm[j] = d->perm[i];
47274 if (j != i)
47275 msk |= (1 << i);
47276 }
47277 for (i = 0; i < nelt; i++)
47278 if (dfirst.perm[i] == 0xff)
47279 dfirst.perm[i] = i;
47280
47281 if (!d->testing_p)
47282 dfirst.target = gen_reg_rtx (dfirst.vmode);
47283
47284 start_sequence ();
47285 ok = expand_vec_perm_1 (&dfirst);
47286 seq = get_insns ();
47287 end_sequence ();
47288
47289 if (!ok)
47290 return false;
47291
47292 if (d->testing_p)
47293 return true;
47294
47295 emit_insn (seq);
47296
47297 dsecond = *d;
47298 dsecond.op0 = dfirst.target;
47299 dsecond.op1 = dfirst.target;
47300 dsecond.one_operand_p = true;
47301 dsecond.target = gen_reg_rtx (dsecond.vmode);
47302 for (i = 0; i < nelt; i++)
47303 dsecond.perm[i] = i ^ nelt2;
47304
47305 ok = expand_vec_perm_1 (&dsecond);
47306 gcc_assert (ok);
47307
47308 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47309 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47310 return true;
47311 }
47312
47313 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
47314 permutation using two vperm2f128, followed by a vshufpd insn blending
47315 the two vectors together. */
47316
47317 static bool
47318 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47319 {
47320 struct expand_vec_perm_d dfirst, dsecond, dthird;
47321 bool ok;
47322
47323 if (!TARGET_AVX || (d->vmode != V4DFmode))
47324 return false;
47325
47326 if (d->testing_p)
47327 return true;
47328
47329 dfirst = *d;
47330 dsecond = *d;
47331 dthird = *d;
47332
47333 dfirst.perm[0] = (d->perm[0] & ~1);
47334 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47335 dfirst.perm[2] = (d->perm[2] & ~1);
47336 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47337 dsecond.perm[0] = (d->perm[1] & ~1);
47338 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47339 dsecond.perm[2] = (d->perm[3] & ~1);
47340 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47341 dthird.perm[0] = (d->perm[0] % 2);
47342 dthird.perm[1] = (d->perm[1] % 2) + 4;
47343 dthird.perm[2] = (d->perm[2] % 2) + 2;
47344 dthird.perm[3] = (d->perm[3] % 2) + 6;
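/* For example, d->perm {2 6 1 5} yields dfirst.perm {2 3 0 1},
   dsecond.perm {6 7 4 5} and dthird.perm {0 4 3 7}: two lane swaps
   followed by a vshufpd that picks { dfirst[0] dsecond[0] dfirst[3]
   dsecond[3] }.  */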
47345
47346 dfirst.target = gen_reg_rtx (dfirst.vmode);
47347 dsecond.target = gen_reg_rtx (dsecond.vmode);
47348 dthird.op0 = dfirst.target;
47349 dthird.op1 = dsecond.target;
47350 dthird.one_operand_p = false;
47351
47352 canonicalize_perm (&dfirst);
47353 canonicalize_perm (&dsecond);
47354
47355 ok = expand_vec_perm_1 (&dfirst)
47356 && expand_vec_perm_1 (&dsecond)
47357 && expand_vec_perm_1 (&dthird);
47358
47359 gcc_assert (ok);
47360
47361 return true;
47362 }
47363
47364 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47365 permutation with two pshufb insns and an ior. We should have already
47366 failed all two instruction sequences. */
47367
47368 static bool
47369 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47370 {
47371 rtx rperm[2][16], vperm, l, h, op, m128;
47372 unsigned int i, nelt, eltsz;
47373
47374 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47375 return false;
47376 gcc_assert (!d->one_operand_p);
47377
47378 if (d->testing_p)
47379 return true;
47380
47381 nelt = d->nelt;
47382 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47383
47384 /* Generate two permutation masks. If the required element is within
47385 the given vector it is shuffled into the proper lane. If the required
47386 element is in the other vector, force a zero into the lane by setting
47387 bit 7 in the permutation mask. */
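/* For V16QImode, if d->perm[0] is 18, the element comes from byte 2 of
   op1, so rperm[1][0] becomes 2 while rperm[0][0] becomes -128 (zeroing
   that byte in the first pshufb); the final ior merges the two.  */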
47388 m128 = GEN_INT (-128);
47389 for (i = 0; i < nelt; ++i)
47390 {
47391 unsigned j, e = d->perm[i];
47392 unsigned which = (e >= nelt);
47393 if (e >= nelt)
47394 e -= nelt;
47395
47396 for (j = 0; j < eltsz; ++j)
47397 {
47398 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47399 rperm[1-which][i*eltsz + j] = m128;
47400 }
47401 }
47402
47403 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47404 vperm = force_reg (V16QImode, vperm);
47405
47406 l = gen_reg_rtx (V16QImode);
47407 op = gen_lowpart (V16QImode, d->op0);
47408 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47409
47410 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47411 vperm = force_reg (V16QImode, vperm);
47412
47413 h = gen_reg_rtx (V16QImode);
47414 op = gen_lowpart (V16QImode, d->op1);
47415 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47416
47417 op = d->target;
47418 if (d->vmode != V16QImode)
47419 op = gen_reg_rtx (V16QImode);
47420 emit_insn (gen_iorv16qi3 (op, l, h));
47421 if (op != d->target)
47422 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47423
47424 return true;
47425 }
47426
47427 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
47428 with two vpshufb insns, vpermq and vpor. We should have already failed
47429 all two or three instruction sequences. */
47430
47431 static bool
47432 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47433 {
47434 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47435 unsigned int i, nelt, eltsz;
47436
47437 if (!TARGET_AVX2
47438 || !d->one_operand_p
47439 || (d->vmode != V32QImode && d->vmode != V16HImode))
47440 return false;
47441
47442 if (d->testing_p)
47443 return true;
47444
47445 nelt = d->nelt;
47446 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47447
47448 /* Generate two permutation masks. If the required element is within
47449 the same lane, it is shuffled in. If the required element is from the
47450 other lane, force a zero by setting bit 7 in the permutation mask.
47451 The other mask has non-negative elements where an element is
47452 requested from the other lane, but it is also moved to the other lane,
47453 so that the result of vpshufb can have the two V2TImode halves
47454 swapped. */
47455 m128 = GEN_INT (-128);
47456 for (i = 0; i < nelt; ++i)
47457 {
47458 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47459 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47460
47461 for (j = 0; j < eltsz; ++j)
47462 {
47463 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47464 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47465 }
47466 }
47467
47468 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47469 vperm = force_reg (V32QImode, vperm);
47470
47471 h = gen_reg_rtx (V32QImode);
47472 op = gen_lowpart (V32QImode, d->op0);
47473 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47474
47475 /* Swap the 128-bit lanes of h into hp. */
47476 hp = gen_reg_rtx (V4DImode);
47477 op = gen_lowpart (V4DImode, h);
47478 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47479 const1_rtx));
47480
47481 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47482 vperm = force_reg (V32QImode, vperm);
47483
47484 l = gen_reg_rtx (V32QImode);
47485 op = gen_lowpart (V32QImode, d->op0);
47486 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47487
47488 op = d->target;
47489 if (d->vmode != V32QImode)
47490 op = gen_reg_rtx (V32QImode);
47491 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47492 if (op != d->target)
47493 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47494
47495 return true;
47496 }
47497
47498 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47499 and extract-odd permutations of two V32QImode or V16HImode operands
47500 with two vpshufb insns, vpor and vpermq. We should have already
47501 failed all two or three instruction sequences. */
47502
47503 static bool
47504 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47505 {
47506 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47507 unsigned int i, nelt, eltsz;
47508
47509 if (!TARGET_AVX2
47510 || d->one_operand_p
47511 || (d->vmode != V32QImode && d->vmode != V16HImode))
47512 return false;
47513
47514 for (i = 0; i < d->nelt; ++i)
47515 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47516 return false;
47517
47518 if (d->testing_p)
47519 return true;
47520
47521 nelt = d->nelt;
47522 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47523
47524 /* Generate two permutation masks. In the first permutation mask
47525 the first quarter will contain indexes for the first half
47526 of the op0, the second quarter will contain bit 7 set, third quarter
47527 will contain indexes for the second half of the op0 and the
47528 last quarter bit 7 set. In the second permutation mask
47529 the first quarter will contain bit 7 set, the second quarter
47530 indexes for the first half of the op1, the third quarter bit 7 set
47531 and last quarter indexes for the second half of the op1.
47532 I.e. the first mask e.g. for V32QImode extract even will be:
47533 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47534 (all values masked with 0xf except for -128) and second mask
47535 for extract even will be
47536 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47537 m128 = GEN_INT (-128);
47538 for (i = 0; i < nelt; ++i)
47539 {
47540 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47541 unsigned which = d->perm[i] >= nelt;
47542 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
47543
47544 for (j = 0; j < eltsz; ++j)
47545 {
47546 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47547 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47548 }
47549 }
47550
47551 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47552 vperm = force_reg (V32QImode, vperm);
47553
47554 l = gen_reg_rtx (V32QImode);
47555 op = gen_lowpart (V32QImode, d->op0);
47556 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47557
47558 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47559 vperm = force_reg (V32QImode, vperm);
47560
47561 h = gen_reg_rtx (V32QImode);
47562 op = gen_lowpart (V32QImode, d->op1);
47563 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47564
47565 ior = gen_reg_rtx (V32QImode);
47566 emit_insn (gen_iorv32qi3 (ior, l, h));
47567
47568 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47569 op = gen_reg_rtx (V4DImode);
47570 ior = gen_lowpart (V4DImode, ior);
47571 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47572 const1_rtx, GEN_INT (3)));
47573 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47574
47575 return true;
47576 }
47577
47578 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47579 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47580 with two "and" and "pack" or two "shift" and "pack" insns. We should
47581 have already failed all two instruction sequences. */
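/* For the even case each wide element of both operands is masked down to
   its low half (e.g. each V8HImode word is ANDed with 0x00ff for a
   V16QImode permutation) and the two results are packed, giving the even
   narrow elements of op0 followed by those of op1.  For the odd case the
   wide elements are logically shifted right instead, moving the odd
   narrow element into the low half before packing.  */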
47582
47583 static bool
47584 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47585 {
47586 rtx op, dop0, dop1, t, rperm[16];
47587 unsigned i, odd, c, s, nelt = d->nelt;
47588 bool end_perm = false;
47589 machine_mode half_mode;
47590 rtx (*gen_and) (rtx, rtx, rtx);
47591 rtx (*gen_pack) (rtx, rtx, rtx);
47592 rtx (*gen_shift) (rtx, rtx, rtx);
47593
47594 if (d->one_operand_p)
47595 return false;
47596
47597 switch (d->vmode)
47598 {
47599 case V8HImode:
47600 /* Required for "pack". */
47601 if (!TARGET_SSE4_1)
47602 return false;
47603 c = 0xffff;
47604 s = 16;
47605 half_mode = V4SImode;
47606 gen_and = gen_andv4si3;
47607 gen_pack = gen_sse4_1_packusdw;
47608 gen_shift = gen_lshrv4si3;
47609 break;
47610 case V16QImode:
47611 /* No check as all instructions are SSE2. */
47612 c = 0xff;
47613 s = 8;
47614 half_mode = V8HImode;
47615 gen_and = gen_andv8hi3;
47616 gen_pack = gen_sse2_packuswb;
47617 gen_shift = gen_lshrv8hi3;
47618 break;
47619 case V16HImode:
47620 if (!TARGET_AVX2)
47621 return false;
47622 c = 0xffff;
47623 s = 16;
47624 half_mode = V8SImode;
47625 gen_and = gen_andv8si3;
47626 gen_pack = gen_avx2_packusdw;
47627 gen_shift = gen_lshrv8si3;
47628 end_perm = true;
47629 break;
47630 case V32QImode:
47631 if (!TARGET_AVX2)
47632 return false;
47633 c = 0xff;
47634 s = 8;
47635 half_mode = V16HImode;
47636 gen_and = gen_andv16hi3;
47637 gen_pack = gen_avx2_packuswb;
47638 gen_shift = gen_lshrv16hi3;
47639 end_perm = true;
47640 break;
47641 default:
47642 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47643 general shuffles. */
47644 return false;
47645 }
47646
47647 /* Check that permutation is even or odd. */
47648 odd = d->perm[0];
47649 if (odd > 1)
47650 return false;
47651
47652 for (i = 1; i < nelt; ++i)
47653 if (d->perm[i] != 2 * i + odd)
47654 return false;
47655
47656 if (d->testing_p)
47657 return true;
47658
47659 dop0 = gen_reg_rtx (half_mode);
47660 dop1 = gen_reg_rtx (half_mode);
47661 if (odd == 0)
47662 {
47663 for (i = 0; i < nelt / 2; i++)
47664 rperm[i] = GEN_INT (c);
47665 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
47666 t = force_reg (half_mode, t);
47667 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47668 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47669 }
47670 else
47671 {
47672 emit_insn (gen_shift (dop0,
47673 gen_lowpart (half_mode, d->op0),
47674 GEN_INT (s)));
47675 emit_insn (gen_shift (dop1,
47676 gen_lowpart (half_mode, d->op1),
47677 GEN_INT (s)));
47678 }
47679 /* For the AVX2 256-bit case we need to permute the pack result. */
47680 if (TARGET_AVX2 && end_perm)
47681 {
47682 op = gen_reg_rtx (d->vmode);
47683 t = gen_reg_rtx (V4DImode);
47684 emit_insn (gen_pack (op, dop0, dop1));
47685 emit_insn (gen_avx2_permv4di_1 (t,
47686 gen_lowpart (V4DImode, op),
47687 const0_rtx,
47688 const2_rtx,
47689 const1_rtx,
47690 GEN_INT (3)));
47691 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47692 }
47693 else
47694 emit_insn (gen_pack (d->target, dop0, dop1));
47695
47696 return true;
47697 }
47698
47699 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47700 and extract-odd permutations of two V64QI operands
47701 with two "shifts", two "truncs" and one "concat" insns for "odd"
47702 and two "truncs" and one concat insn for "even."
47703 We should have already failed all two-instruction sequences. */
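/* The even case is simply two vpmovwb word-to-byte truncations whose
   results are concatenated; the odd case first shifts each word right by
   8 so that the odd byte becomes the low byte to be truncated.  */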
47704
47705 static bool
47706 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47707 {
47708 rtx t1, t2, t3, t4;
47709 unsigned i, odd, nelt = d->nelt;
47710
47711 if (!TARGET_AVX512BW
47712 || d->one_operand_p
47713 || d->vmode != V64QImode)
47714 return false;
47715
47716 /* Check that permutation is even or odd. */
47717 odd = d->perm[0];
47718 if (odd > 1)
47719 return false;
47720
47721 for (i = 1; i < nelt; ++i)
47722 if (d->perm[i] != 2 * i + odd)
47723 return false;
47724
47725 if (d->testing_p)
47726 return true;
47727
47728
47729 if (odd)
47730 {
47731 t1 = gen_reg_rtx (V32HImode);
47732 t2 = gen_reg_rtx (V32HImode);
47733 emit_insn (gen_lshrv32hi3 (t1,
47734 gen_lowpart (V32HImode, d->op0),
47735 GEN_INT (8)));
47736 emit_insn (gen_lshrv32hi3 (t2,
47737 gen_lowpart (V32HImode, d->op1),
47738 GEN_INT (8)));
47739 }
47740 else
47741 {
47742 t1 = gen_lowpart (V32HImode, d->op0);
47743 t2 = gen_lowpart (V32HImode, d->op1);
47744 }
47745
47746 t3 = gen_reg_rtx (V32QImode);
47747 t4 = gen_reg_rtx (V32QImode);
47748 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47749 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47750 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47751
47752 return true;
47753 }
47754
47755 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
47756 and extract-odd permutations. */
47757
47758 static bool
47759 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47760 {
47761 rtx t1, t2, t3, t4, t5;
47762
47763 switch (d->vmode)
47764 {
47765 case V4DFmode:
47766 if (d->testing_p)
47767 break;
47768 t1 = gen_reg_rtx (V4DFmode);
47769 t2 = gen_reg_rtx (V4DFmode);
47770
47771 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47772 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47773 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47774
47775 /* Now an unpck[lh]pd will produce the result required. */
47776 if (odd)
47777 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47778 else
47779 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47780 emit_insn (t3);
47781 break;
47782
47783 case V8SFmode:
47784 {
47785 int mask = odd ? 0xdd : 0x88;
47786
47787 if (d->testing_p)
47788 break;
47789 t1 = gen_reg_rtx (V8SFmode);
47790 t2 = gen_reg_rtx (V8SFmode);
47791 t3 = gen_reg_rtx (V8SFmode);
47792
47793 /* Shuffle within the 128-bit lanes to produce:
47794 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47795 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47796 GEN_INT (mask)));
47797
47798 /* Shuffle the lanes around to produce:
47799 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47800 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47801 GEN_INT (0x3)));
47802
47803 /* Shuffle within the 128-bit lanes to produce:
47804 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47805 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47806
47807 /* Shuffle within the 128-bit lanes to produce:
47808 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47809 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47810
47811 /* Shuffle the lanes around to produce:
47812 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47813 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47814 GEN_INT (0x20)));
47815 }
47816 break;
47817
47818 case V2DFmode:
47819 case V4SFmode:
47820 case V2DImode:
47821 case V4SImode:
47822 /* These are always directly implementable by expand_vec_perm_1. */
47823 gcc_unreachable ();
47824
47825 case V8HImode:
47826 if (TARGET_SSE4_1)
47827 return expand_vec_perm_even_odd_pack (d);
47828 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47829 return expand_vec_perm_pshufb2 (d);
47830 else
47831 {
47832 if (d->testing_p)
47833 break;
47834 /* We need 2*log2(N)-1 operations to achieve odd/even
47835 with interleave. */
47836 t1 = gen_reg_rtx (V8HImode);
47837 t2 = gen_reg_rtx (V8HImode);
47838 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47839 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47840 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47841 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47842 if (odd)
47843 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47844 else
47845 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47846 emit_insn (t3);
47847 }
47848 break;
47849
47850 case V16QImode:
47851 return expand_vec_perm_even_odd_pack (d);
47852
47853 case V16HImode:
47854 case V32QImode:
47855 return expand_vec_perm_even_odd_pack (d);
47856
47857 case V64QImode:
47858 return expand_vec_perm_even_odd_trunc (d);
47859
47860 case V4DImode:
47861 if (!TARGET_AVX2)
47862 {
47863 struct expand_vec_perm_d d_copy = *d;
47864 d_copy.vmode = V4DFmode;
47865 if (d->testing_p)
47866 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47867 else
47868 d_copy.target = gen_reg_rtx (V4DFmode);
47869 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47870 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47871 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47872 {
47873 if (!d->testing_p)
47874 emit_move_insn (d->target,
47875 gen_lowpart (V4DImode, d_copy.target));
47876 return true;
47877 }
47878 return false;
47879 }
47880
47881 if (d->testing_p)
47882 break;
47883
47884 t1 = gen_reg_rtx (V4DImode);
47885 t2 = gen_reg_rtx (V4DImode);
47886
47887 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47888 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47889 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47890
47891 /* Now a vpunpck[lh]qdq will produce the result required. */
47892 if (odd)
47893 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47894 else
47895 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47896 emit_insn (t3);
47897 break;
47898
47899 case V8SImode:
47900 if (!TARGET_AVX2)
47901 {
47902 struct expand_vec_perm_d d_copy = *d;
47903 d_copy.vmode = V8SFmode;
47904 if (d->testing_p)
47905 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47906 else
47907 d_copy.target = gen_reg_rtx (V8SFmode);
47908 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47909 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47910 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47911 {
47912 if (!d->testing_p)
47913 emit_move_insn (d->target,
47914 gen_lowpart (V8SImode, d_copy.target));
47915 return true;
47916 }
47917 return false;
47918 }
47919
47920 if (d->testing_p)
47921 break;
47922
47923 t1 = gen_reg_rtx (V8SImode);
47924 t2 = gen_reg_rtx (V8SImode);
47925 t3 = gen_reg_rtx (V4DImode);
47926 t4 = gen_reg_rtx (V4DImode);
47927 t5 = gen_reg_rtx (V4DImode);
47928
47929 /* Shuffle the lanes around into
47930 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47931 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47932 gen_lowpart (V4DImode, d->op1),
47933 GEN_INT (0x20)));
47934 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47935 gen_lowpart (V4DImode, d->op1),
47936 GEN_INT (0x31)));
47937
47938 /* Swap the 2nd and 3rd position in each lane into
47939 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47940 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47941 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47942 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47943 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47944
47945 /* Now a vpunpck[lh]qdq will produce
47946 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47947 if (odd)
47948 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47949 gen_lowpart (V4DImode, t2));
47950 else
47951 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47952 gen_lowpart (V4DImode, t2));
47953 emit_insn (t3);
47954 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47955 break;
47956
47957 default:
47958 gcc_unreachable ();
47959 }
47960
47961 return true;
47962 }
47963
47964 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
47965 extract-even and extract-odd permutations. */
47966
47967 static bool
47968 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47969 {
47970 unsigned i, odd, nelt = d->nelt;
47971
47972 odd = d->perm[0];
47973 if (odd != 0 && odd != 1)
47974 return false;
47975
47976 for (i = 1; i < nelt; ++i)
47977 if (d->perm[i] != 2 * i + odd)
47978 return false;
47979
47980 return expand_vec_perm_even_odd_1 (d, odd);
47981 }
47982
47983 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
47984 permutations. We assume that expand_vec_perm_1 has already failed. */
47985
47986 static bool
47987 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47988 {
47989 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47990 machine_mode vmode = d->vmode;
47991 unsigned char perm2[4];
47992 rtx op0 = d->op0, dest;
47993 bool ok;
47994
47995 switch (vmode)
47996 {
47997 case V4DFmode:
47998 case V8SFmode:
47999 /* These are special-cased in sse.md so that we can optionally
48000 use the vbroadcast instruction. They expand to two insns
48001 if the input happens to be in a register. */
48002 gcc_unreachable ();
48003
48004 case V2DFmode:
48005 case V2DImode:
48006 case V4SFmode:
48007 case V4SImode:
48008 /* These are always implementable using standard shuffle patterns. */
48009 gcc_unreachable ();
48010
48011 case V8HImode:
48012 case V16QImode:
48013 /* These can be implemented via interleave. We save one insn by
48014 stopping once we have promoted to V4SImode and then use pshufd. */
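/* For instance, to broadcast element 5 of a V8HImode vector, one
   interleave-high produces { 4 4 5 5 6 6 7 7 }; viewed as V4SImode, a
   pshufd replicating element 1 (the { 5 5 } pair) then completes the
   broadcast.  */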
48015 if (d->testing_p)
48016 return true;
48017 do
48018 {
48019 rtx dest;
48020 rtx (*gen) (rtx, rtx, rtx)
48021 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48022 : gen_vec_interleave_lowv8hi;
48023
48024 if (elt >= nelt2)
48025 {
48026 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48027 : gen_vec_interleave_highv8hi;
48028 elt -= nelt2;
48029 }
48030 nelt2 /= 2;
48031
48032 dest = gen_reg_rtx (vmode);
48033 emit_insn (gen (dest, op0, op0));
48034 vmode = get_mode_wider_vector (vmode);
48035 op0 = gen_lowpart (vmode, dest);
48036 }
48037 while (vmode != V4SImode);
48038
48039 memset (perm2, elt, 4);
48040 dest = gen_reg_rtx (V4SImode);
48041 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48042 gcc_assert (ok);
48043 if (!d->testing_p)
48044 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48045 return true;
48046
48047 case V64QImode:
48048 case V32QImode:
48049 case V16HImode:
48050 case V8SImode:
48051 case V4DImode:
48052 /* For AVX2 broadcasts of the first element vpbroadcast* or
48053 vpermq should be used by expand_vec_perm_1. */
48054 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48055 return false;
48056
48057 default:
48058 gcc_unreachable ();
48059 }
48060 }
48061
48062 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
48063 broadcast permutations. */
48064
48065 static bool
48066 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48067 {
48068 unsigned i, elt, nelt = d->nelt;
48069
48070 if (!d->one_operand_p)
48071 return false;
48072
48073 elt = d->perm[0];
48074 for (i = 1; i < nelt; ++i)
48075 if (d->perm[i] != elt)
48076 return false;
48077
48078 return expand_vec_perm_broadcast_1 (d);
48079 }
48080
48081 /* Implement arbitrary permutations of two V64QImode operands
48082 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
48083 static bool
48084 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
48085 {
48086 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48087 return false;
48088
48089 if (d->testing_p)
48090 return true;
48091
48092 struct expand_vec_perm_d ds[2];
48093 rtx rperm[128], vperm, target0, target1;
48094 unsigned int i, nelt;
48095 machine_mode vmode;
48096
48097 nelt = d->nelt;
48098 vmode = V64QImode;
48099
48100 for (i = 0; i < 2; i++)
48101 {
48102 ds[i] = *d;
48103 ds[i].vmode = V32HImode;
48104 ds[i].nelt = 32;
48105 ds[i].target = gen_reg_rtx (V32HImode);
48106 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48107 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48108 }
48109
48110 /* Prepare permutations such that the first one takes care of
48111 putting the even bytes into the right positions or one position
48112 higher (ds[0]) and the second one takes care of putting the odd
48113 bytes into the right positions or one position lower (ds[1]);
48114 the vpshufb masks built below then pick the byte within each word. */
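/* For example, if d->perm[0] is 121 (the high byte of word 60), then
   ds[0].perm[0] becomes 60, so the word permutation brings that word into
   word 0, and byte 0 of the first vpshufb mask is (0 & 14) + 1 == 1,
   selecting the high byte of that word; the second mask holds -1 there,
   so the final vpor keeps the byte from the first result.  */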
48115
48116 for (i = 0; i < nelt; i++)
48117 {
48118 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48119 if (i & 1)
48120 {
48121 rperm[i] = constm1_rtx;
48122 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48123 }
48124 else
48125 {
48126 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48127 rperm[i + 64] = constm1_rtx;
48128 }
48129 }
48130
48131 bool ok = expand_vec_perm_1 (&ds[0]);
48132 gcc_assert (ok);
48133 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48134
48135 ok = expand_vec_perm_1 (&ds[1]);
48136 gcc_assert (ok);
48137 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48138
48139 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48140 vperm = force_reg (vmode, vperm);
48141 target0 = gen_reg_rtx (V64QImode);
48142 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48143
48144 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48145 vperm = force_reg (vmode, vperm);
48146 target1 = gen_reg_rtx (V64QImode);
48147 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48148
48149 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48150 return true;
48151 }
48152
48153 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
48154 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48155 all the shorter instruction sequences. */
48156
48157 static bool
48158 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48159 {
48160 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48161 unsigned int i, nelt, eltsz;
48162 bool used[4];
48163
48164 if (!TARGET_AVX2
48165 || d->one_operand_p
48166 || (d->vmode != V32QImode && d->vmode != V16HImode))
48167 return false;
48168
48169 if (d->testing_p)
48170 return true;
48171
48172 nelt = d->nelt;
48173 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48174
48175 /* Generate 4 permutation masks. If the required element is within
48176 the same lane, it is shuffled in. If the required element is from the
48177 other lane, force a zero by setting bit 7 in the permutation mask.
48178 In the cross-lane masks the entries are non-negative where an element
48179 is requested from the other lane, but it is also moved to the other lane,
48180 so that the result of vpshufb can have the two V2TImode halves
48181 swapped. */
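/* Mask 0 covers op0 bytes that stay within their 128-bit lane, mask 1
   covers op0 bytes that must cross lanes (their vpshufb result is lane
   swapped by the vpermq below), and masks 2 and 3 do the same for op1.  */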
48182 m128 = GEN_INT (-128);
48183 for (i = 0; i < 32; ++i)
48184 {
48185 rperm[0][i] = m128;
48186 rperm[1][i] = m128;
48187 rperm[2][i] = m128;
48188 rperm[3][i] = m128;
48189 }
48190 used[0] = false;
48191 used[1] = false;
48192 used[2] = false;
48193 used[3] = false;
48194 for (i = 0; i < nelt; ++i)
48195 {
48196 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48197 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48198 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48199
48200 for (j = 0; j < eltsz; ++j)
48201 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48202 used[which] = true;
48203 }
48204
48205 for (i = 0; i < 2; ++i)
48206 {
48207 if (!used[2 * i + 1])
48208 {
48209 h[i] = NULL_RTX;
48210 continue;
48211 }
48212 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48213 gen_rtvec_v (32, rperm[2 * i + 1]));
48214 vperm = force_reg (V32QImode, vperm);
48215 h[i] = gen_reg_rtx (V32QImode);
48216 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48217 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48218 }
48219
48220 /* Swap the 128-bit lanes of h[X]. */
48221 for (i = 0; i < 2; ++i)
48222 {
48223 if (h[i] == NULL_RTX)
48224 continue;
48225 op = gen_reg_rtx (V4DImode);
48226 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48227 const2_rtx, GEN_INT (3), const0_rtx,
48228 const1_rtx));
48229 h[i] = gen_lowpart (V32QImode, op);
48230 }
48231
48232 for (i = 0; i < 2; ++i)
48233 {
48234 if (!used[2 * i])
48235 {
48236 l[i] = NULL_RTX;
48237 continue;
48238 }
48239 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48240 vperm = force_reg (V32QImode, vperm);
48241 l[i] = gen_reg_rtx (V32QImode);
48242 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48243 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48244 }
48245
48246 for (i = 0; i < 2; ++i)
48247 {
48248 if (h[i] && l[i])
48249 {
48250 op = gen_reg_rtx (V32QImode);
48251 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48252 l[i] = op;
48253 }
48254 else if (h[i])
48255 l[i] = h[i];
48256 }
48257
48258 gcc_assert (l[0] && l[1]);
48259 op = d->target;
48260 if (d->vmode != V32QImode)
48261 op = gen_reg_rtx (V32QImode);
48262 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48263 if (op != d->target)
48264 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48265 return true;
48266 }
48267
48268 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
48269 With all of the interface bits taken care of, perform the expansion
48270 in D and return true on success. */
48271
48272 static bool
48273 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48274 {
48275 /* Try a single instruction expansion. */
48276 if (expand_vec_perm_1 (d))
48277 return true;
48278
48279 /* Try sequences of two instructions. */
48280
48281 if (expand_vec_perm_pshuflw_pshufhw (d))
48282 return true;
48283
48284 if (expand_vec_perm_palignr (d, false))
48285 return true;
48286
48287 if (expand_vec_perm_interleave2 (d))
48288 return true;
48289
48290 if (expand_vec_perm_broadcast (d))
48291 return true;
48292
48293 if (expand_vec_perm_vpermq_perm_1 (d))
48294 return true;
48295
48296 if (expand_vec_perm_vperm2f128 (d))
48297 return true;
48298
48299 if (expand_vec_perm_pblendv (d))
48300 return true;
48301
48302 /* Try sequences of three instructions. */
48303
48304 if (expand_vec_perm_even_odd_pack (d))
48305 return true;
48306
48307 if (expand_vec_perm_2vperm2f128_vshuf (d))
48308 return true;
48309
48310 if (expand_vec_perm_pshufb2 (d))
48311 return true;
48312
48313 if (expand_vec_perm_interleave3 (d))
48314 return true;
48315
48316 if (expand_vec_perm_vperm2f128_vblend (d))
48317 return true;
48318
48319 /* Try sequences of four instructions. */
48320
48321 if (expand_vec_perm_even_odd_trunc (d))
48322 return true;
48323 if (expand_vec_perm_vpshufb2_vpermq (d))
48324 return true;
48325
48326 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48327 return true;
48328
48329 if (expand_vec_perm_vpermi2_vpshub2 (d))
48330 return true;
48331
48332 /* ??? Look for narrow permutations whose element orderings would
48333 allow the promotion to a wider mode. */
48334
48335 /* ??? Look for sequences of interleave or a wider permute that place
48336 the data into the correct lanes for a half-vector shuffle like
48337 pshuf[lh]w or vpermilps. */
48338
48339 /* ??? Look for sequences of interleave that produce the desired results.
48340 The combinatorics of punpck[lh] get pretty ugly... */
48341
48342 if (expand_vec_perm_even_odd (d))
48343 return true;
48344
48345 /* Even longer sequences. */
48346 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48347 return true;
48348
48349 /* See if we can get the same permutation in different vector integer
48350 mode. */
48351 struct expand_vec_perm_d nd;
48352 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48353 {
48354 if (!d->testing_p)
48355 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48356 return true;
48357 }
48358
48359 return false;
48360 }
48361
48362 /* If a permutation only uses one operand, make it clear. Returns true
48363 if the permutation references both operands. */
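/* For example, with nelt 4 a perm of {4 5 6 7} references only op1, so it
   is rewritten as {0 1 2 3} with op0 = op1 and false is returned, while
   {0 5 2 7} with distinct operands references both and is left as is.  */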
48364
48365 static bool
48366 canonicalize_perm (struct expand_vec_perm_d *d)
48367 {
48368 int i, which, nelt = d->nelt;
48369
48370 for (i = which = 0; i < nelt; ++i)
48371 which |= (d->perm[i] < nelt ? 1 : 2);
48372
48373 d->one_operand_p = true;
48374 switch (which)
48375 {
48376 default:
48377 gcc_unreachable ();
48378
48379 case 3:
48380 if (!rtx_equal_p (d->op0, d->op1))
48381 {
48382 d->one_operand_p = false;
48383 break;
48384 }
48385 /* The elements of PERM do not suggest that only the first operand
48386 is used, but both operands are identical. Allow easier matching
48387 of the permutation by folding the permutation into the single
48388 input vector. */
48389 /* FALLTHRU */
48390
48391 case 2:
48392 for (i = 0; i < nelt; ++i)
48393 d->perm[i] &= nelt - 1;
48394 d->op0 = d->op1;
48395 break;
48396
48397 case 1:
48398 d->op1 = d->op0;
48399 break;
48400 }
48401
48402 return (which == 3);
48403 }
48404
48405 bool
48406 ix86_expand_vec_perm_const (rtx operands[4])
48407 {
48408 struct expand_vec_perm_d d;
48409 unsigned char perm[MAX_VECT_LEN];
48410 int i, nelt;
48411 bool two_args;
48412 rtx sel;
48413
48414 d.target = operands[0];
48415 d.op0 = operands[1];
48416 d.op1 = operands[2];
48417 sel = operands[3];
48418
48419 d.vmode = GET_MODE (d.target);
48420 gcc_assert (VECTOR_MODE_P (d.vmode));
48421 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48422 d.testing_p = false;
48423
48424 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
48425 gcc_assert (XVECLEN (sel, 0) == nelt);
48426 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48427
48428 for (i = 0; i < nelt; ++i)
48429 {
48430 rtx e = XVECEXP (sel, 0, i);
48431 int ei = INTVAL (e) & (2 * nelt - 1);
48432 d.perm[i] = ei;
48433 perm[i] = ei;
48434 }
48435
48436 two_args = canonicalize_perm (&d);
48437
48438 if (ix86_expand_vec_perm_const_1 (&d))
48439 return true;
48440
48441 /* If the selector says both arguments are needed, but the operands are the
48442 same, the above tried to expand with one_operand_p and flattened selector.
48443 If that didn't work, retry without one_operand_p; we succeeded with that
48444 during testing. */
48445 if (two_args && d.one_operand_p)
48446 {
48447 d.one_operand_p = false;
48448 memcpy (d.perm, perm, sizeof (perm));
48449 return ix86_expand_vec_perm_const_1 (&d);
48450 }
48451
48452 return false;
48453 }
48454
48455 /* Implement targetm.vectorize.vec_perm_const_ok. */
48456
48457 static bool
48458 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
48459 const unsigned char *sel)
48460 {
48461 struct expand_vec_perm_d d;
48462 unsigned int i, nelt, which;
48463 bool ret;
48464
48465 d.vmode = vmode;
48466 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48467 d.testing_p = true;
48468
48469 /* Given sufficient ISA support we can just return true here
48470 for selected vector modes. */
48471 switch (d.vmode)
48472 {
48473 case V16SFmode:
48474 case V16SImode:
48475 case V8DImode:
48476 case V8DFmode:
48477 if (TARGET_AVX512F)
48478 /* All implementable with a single vpermi2 insn. */
48479 return true;
48480 break;
48481 case V32HImode:
48482 if (TARGET_AVX512BW)
48483 /* All implementable with a single vpermi2 insn. */
48484 return true;
48485 break;
48486 case V64QImode:
48487 if (TARGET_AVX512BW)
48488 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
48489 return true;
48490 break;
48491 case V8SImode:
48492 case V8SFmode:
48493 case V4DFmode:
48494 case V4DImode:
48495 if (TARGET_AVX512VL)
48496 /* All implementable with a single vpermi2 insn. */
48497 return true;
48498 break;
48499 case V16HImode:
48500 if (TARGET_AVX2)
48501 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48502 return true;
48503 break;
48504 case V32QImode:
48505 if (TARGET_AVX2)
48506 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48507 return true;
48508 break;
48509 case V4SImode:
48510 case V4SFmode:
48511 case V8HImode:
48512 case V16QImode:
48513 /* All implementable with a single vpperm insn. */
48514 if (TARGET_XOP)
48515 return true;
48516 /* All implementable with 2 pshufb + 1 ior. */
48517 if (TARGET_SSSE3)
48518 return true;
48519 break;
48520 case V2DImode:
48521 case V2DFmode:
48522 /* All implementable with shufpd or unpck[lh]pd. */
48523 return true;
48524 default:
48525 return false;
48526 }
48527
48528 /* Extract the values from the vector CST into the permutation
48529 array in D. */
48530 memcpy (d.perm, sel, nelt);
48531 for (i = which = 0; i < nelt; ++i)
48532 {
48533 unsigned char e = d.perm[i];
48534 gcc_assert (e < 2 * nelt);
48535 which |= (e < nelt ? 1 : 2);
48536 }
48537
48538 /* For all elements from second vector, fold the elements to first. */
48539 if (which == 2)
48540 for (i = 0; i < nelt; ++i)
48541 d.perm[i] -= nelt;
48542
48543 /* Check whether the mask can be applied to the vector type. */
48544 d.one_operand_p = (which != 3);
48545
48546 /* Implementable with shufps or pshufd. */
48547 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48548 return true;
48549
48550 /* Otherwise we have to go through the motions and see if we can
48551 figure out how to generate the requested permutation. */
48552 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48553 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48554 if (!d.one_operand_p)
48555 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48556
48557 start_sequence ();
48558 ret = ix86_expand_vec_perm_const_1 (&d);
48559 end_sequence ();
48560
48561 return ret;
48562 }
48563
48564 void
48565 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48566 {
48567 struct expand_vec_perm_d d;
48568 unsigned i, nelt;
48569
48570 d.target = targ;
48571 d.op0 = op0;
48572 d.op1 = op1;
48573 d.vmode = GET_MODE (targ);
48574 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48575 d.one_operand_p = false;
48576 d.testing_p = false;
48577
48578 for (i = 0; i < nelt; ++i)
48579 d.perm[i] = i * 2 + odd;
48580
48581 /* We'll either be able to implement the permutation directly... */
48582 if (expand_vec_perm_1 (&d))
48583 return;
48584
48585 /* ... or we use the special-case patterns. */
48586 expand_vec_perm_even_odd_1 (&d, odd);
48587 }
48588
48589 static void
48590 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48591 {
48592 struct expand_vec_perm_d d;
48593 unsigned i, nelt, base;
48594 bool ok;
48595
48596 d.target = targ;
48597 d.op0 = op0;
48598 d.op1 = op1;
48599 d.vmode = GET_MODE (targ);
48600 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48601 d.one_operand_p = false;
48602 d.testing_p = false;
48603
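  /* Fill D.PERM with the interleave pattern; e.g. for nelt == 4 this is
     {0,4,1,5} for the low half and {2,6,3,7} for the high half.  */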
48604 base = high_p ? nelt / 2 : 0;
48605 for (i = 0; i < nelt / 2; ++i)
48606 {
48607 d.perm[i * 2] = i + base;
48608 d.perm[i * 2 + 1] = i + base + nelt;
48609 }
48610
48611 /* Note that for AVX this isn't one instruction. */
48612 ok = ix86_expand_vec_perm_const_1 (&d);
48613 gcc_assert (ok);
48614 }
48615
48616
48617 /* Expand a vector operation CODE for a V*QImode in terms of the
48618 same operation on V*HImode. */
48619
48620 void
48621 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48622 {
48623 machine_mode qimode = GET_MODE (dest);
48624 machine_mode himode;
48625 rtx (*gen_il) (rtx, rtx, rtx);
48626 rtx (*gen_ih) (rtx, rtx, rtx);
48627 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48628 struct expand_vec_perm_d d;
48629 bool ok, full_interleave;
48630 bool uns_p = false;
48631 int i;
48632
48633 switch (qimode)
48634 {
48635 case V16QImode:
48636 himode = V8HImode;
48637 gen_il = gen_vec_interleave_lowv16qi;
48638 gen_ih = gen_vec_interleave_highv16qi;
48639 break;
48640 case V32QImode:
48641 himode = V16HImode;
48642 gen_il = gen_avx2_interleave_lowv32qi;
48643 gen_ih = gen_avx2_interleave_highv32qi;
48644 break;
48645 case V64QImode:
48646 himode = V32HImode;
48647 gen_il = gen_avx512bw_interleave_lowv64qi;
48648 gen_ih = gen_avx512bw_interleave_highv64qi;
48649 break;
48650 default:
48651 gcc_unreachable ();
48652 }
48653
48654 op2_l = op2_h = op2;
48655 switch (code)
48656 {
48657 case MULT:
48658 /* Unpack data such that we've got a source byte in each low byte of
48659 each word. We don't care what goes into the high byte of each word.
48660 Rather than trying to get zero in there, most convenient is to let
48661 it be a copy of the low byte. */
48662 op2_l = gen_reg_rtx (qimode);
48663 op2_h = gen_reg_rtx (qimode);
48664 emit_insn (gen_il (op2_l, op2, op2));
48665 emit_insn (gen_ih (op2_h, op2, op2));
48666 /* FALLTHRU */
48667
48668 op1_l = gen_reg_rtx (qimode);
48669 op1_h = gen_reg_rtx (qimode);
48670 emit_insn (gen_il (op1_l, op1, op1));
48671 emit_insn (gen_ih (op1_h, op1, op1));
48672 full_interleave = qimode == V16QImode;
48673 break;
48674
48675 case ASHIFT:
48676 case LSHIFTRT:
48677 uns_p = true;
48678 /* FALLTHRU */
48679 case ASHIFTRT:
48680 op1_l = gen_reg_rtx (himode);
48681 op1_h = gen_reg_rtx (himode);
48682 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48683 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48684 full_interleave = true;
48685 break;
48686 default:
48687 gcc_unreachable ();
48688 }
48689
48690 /* Perform the operation. */
48691 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48692 1, OPTAB_DIRECT);
48693 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48694 1, OPTAB_DIRECT);
48695 gcc_assert (res_l && res_h);
48696
48697 /* Merge the data back into the right place. */
48698 d.target = dest;
48699 d.op0 = gen_lowpart (qimode, res_l);
48700 d.op1 = gen_lowpart (qimode, res_h);
48701 d.vmode = qimode;
48702 d.nelt = GET_MODE_NUNITS (qimode);
48703 d.one_operand_p = false;
48704 d.testing_p = false;
48705
48706 if (full_interleave)
48707 {
48708 /* For SSE2, we used a full interleave, so the desired
48709 results are in the even elements. */
48710 for (i = 0; i < d.nelt; ++i)
48711 d.perm[i] = i * 2;
48712 }
48713 else
48714 {
48715 /* For AVX, the interleave used above was not cross-lane. So the
48716 extraction is of the even elements, but with the second and third quarters swapped.
48717 Happily, that is even one insn shorter than even extraction.
48718 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48719 always first from the first and then from the second source operand,
48720 the index bits above the low 4 bits remain the same.
48721 Thus, for d.nelt == 32 we want permutation
48722 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48723 and for d.nelt == 64 we want permutation
48724 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48725 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
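	 Spot check of the formula below for d.nelt == 32: i == 8 gives
	 (16 & 14) + 32 + 0 == 32 and i == 16 gives 0 + 0 + 16 == 16,
	 matching the permutation quoted above.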
48726 for (i = 0; i < d.nelt; ++i)
48727 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48728 }
48729
48730 ok = ix86_expand_vec_perm_const_1 (&d);
48731 gcc_assert (ok);
48732
48733 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48734 gen_rtx_fmt_ee (code, qimode, op1, op2));
48735 }
48736
48737 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48738 if op is CONST_VECTOR with all odd elements equal to their
48739 preceding element. */
48740
48741 static bool
48742 const_vector_equal_evenodd_p (rtx op)
48743 {
48744 machine_mode mode = GET_MODE (op);
48745 int i, nunits = GET_MODE_NUNITS (mode);
48746 if (GET_CODE (op) != CONST_VECTOR
48747 || nunits != CONST_VECTOR_NUNITS (op))
48748 return false;
48749 for (i = 0; i < nunits; i += 2)
48750 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48751 return false;
48752 return true;
48753 }
48754
48755 void
48756 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48757 bool uns_p, bool odd_p)
48758 {
48759 machine_mode mode = GET_MODE (op1);
48760 machine_mode wmode = GET_MODE (dest);
48761 rtx x;
48762 rtx orig_op1 = op1, orig_op2 = op2;
48763
48764 if (!nonimmediate_operand (op1, mode))
48765 op1 = force_reg (mode, op1);
48766 if (!nonimmediate_operand (op2, mode))
48767 op2 = force_reg (mode, op2);
48768
48769 /* We only play even/odd games with vectors of SImode. */
48770 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48771
48772 /* If we're looking for the odd results, shift those members down to
48773 the even slots. For some cpus this is faster than a PSHUFD. */
48774 if (odd_p)
48775 {
48776 /* For XOP use vpmacsdqh, but only for smult, as it is only
48777 signed. */
48778 if (TARGET_XOP && mode == V4SImode && !uns_p)
48779 {
48780 x = force_reg (wmode, CONST0_RTX (wmode));
48781 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48782 return;
48783 }
48784
48785 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48786 if (!const_vector_equal_evenodd_p (orig_op1))
48787 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48788 x, NULL, 1, OPTAB_DIRECT);
48789 if (!const_vector_equal_evenodd_p (orig_op2))
48790 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48791 x, NULL, 1, OPTAB_DIRECT);
48792 op1 = gen_lowpart (mode, op1);
48793 op2 = gen_lowpart (mode, op2);
48794 }
48795
48796 if (mode == V16SImode)
48797 {
48798 if (uns_p)
48799 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48800 else
48801 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48802 }
48803 else if (mode == V8SImode)
48804 {
48805 if (uns_p)
48806 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48807 else
48808 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48809 }
48810 else if (uns_p)
48811 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48812 else if (TARGET_SSE4_1)
48813 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48814 else
48815 {
48816 rtx s1, s2, t0, t1, t2;
48817
48818 /* The easiest way to implement this without PMULDQ is to go through
48819 the motions as if we are performing a full 64-bit multiply. With
48820 the exception that we need to do less shuffling of the elements. */
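      /* Rough sketch of the identity used, with all arithmetic mod 2^64:
	   a * b == LO(a)*LO(b) - 2^32 * (b * [a < 0] + a * [b < 0]).
	 The all-ones/zero masks S1 and S2 computed below are [a < 0] and
	 [b < 0]; feeding them to the unsigned widening multiply and shifting
	 left by 32 supplies exactly those correction terms.  */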
48821
48822 /* Compute the sign-extension, aka highparts, of the two operands. */
48823 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48824 op1, pc_rtx, pc_rtx);
48825 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48826 op2, pc_rtx, pc_rtx);
48827
48828 /* Multiply LO(A) * HI(B), and vice-versa. */
48829 t1 = gen_reg_rtx (wmode);
48830 t2 = gen_reg_rtx (wmode);
48831 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48832 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48833
48834 /* Multiply LO(A) * LO(B). */
48835 t0 = gen_reg_rtx (wmode);
48836 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48837
48838 /* Combine and shift the highparts into place. */
48839 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48840 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48841 1, OPTAB_DIRECT);
48842
48843 /* Combine high and low parts. */
48844 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48845 return;
48846 }
48847 emit_insn (x);
48848 }
48849
48850 void
48851 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48852 bool uns_p, bool high_p)
48853 {
48854 machine_mode wmode = GET_MODE (dest);
48855 machine_mode mode = GET_MODE (op1);
48856 rtx t1, t2, t3, t4, mask;
48857
48858 switch (mode)
48859 {
48860 case V4SImode:
48861 t1 = gen_reg_rtx (mode);
48862 t2 = gen_reg_rtx (mode);
48863 if (TARGET_XOP && !uns_p)
48864 {
48865 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48866 shuffle the elements once so that all elements are in the right
48867 place for immediate use: { A C B D }. */
48868 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48869 const1_rtx, GEN_INT (3)));
48870 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48871 const1_rtx, GEN_INT (3)));
48872 }
48873 else
48874 {
48875 /* Put the elements into place for the multiply. */
48876 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48877 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48878 high_p = false;
48879 }
48880 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48881 break;
48882
48883 case V8SImode:
48884 /* Shuffle the elements between the lanes. After this we
48885 have { A B E F | C D G H } for each operand. */
48886 t1 = gen_reg_rtx (V4DImode);
48887 t2 = gen_reg_rtx (V4DImode);
48888 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48889 const0_rtx, const2_rtx,
48890 const1_rtx, GEN_INT (3)));
48891 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48892 const0_rtx, const2_rtx,
48893 const1_rtx, GEN_INT (3)));
48894
48895 /* Shuffle the elements within the lanes. After this we
48896 have { A A B B | C C D D } or { E E F F | G G H H }. */
48897 t3 = gen_reg_rtx (V8SImode);
48898 t4 = gen_reg_rtx (V8SImode);
48899 mask = GEN_INT (high_p
48900 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48901 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
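      /* These are the pshufd immediates 0xfa (elements 2,2,3,3) for the
	 high half and 0x50 (elements 0,0,1,1) for the low half.  */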
48902 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48903 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48904
48905 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48906 break;
48907
48908 case V8HImode:
48909 case V16HImode:
48910 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48911 uns_p, OPTAB_DIRECT);
48912 t2 = expand_binop (mode,
48913 uns_p ? umul_highpart_optab : smul_highpart_optab,
48914 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48915 gcc_assert (t1 && t2);
48916
48917 t3 = gen_reg_rtx (mode);
48918 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48919 emit_move_insn (dest, gen_lowpart (wmode, t3));
48920 break;
48921
48922 case V16QImode:
48923 case V32QImode:
48924 case V32HImode:
48925 case V16SImode:
48926 case V64QImode:
48927 t1 = gen_reg_rtx (wmode);
48928 t2 = gen_reg_rtx (wmode);
48929 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48930 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48931
48932 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48933 break;
48934
48935 default:
48936 gcc_unreachable ();
48937 }
48938 }
48939
48940 void
48941 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48942 {
48943 rtx res_1, res_2, res_3, res_4;
48944
48945 res_1 = gen_reg_rtx (V4SImode);
48946 res_2 = gen_reg_rtx (V4SImode);
48947 res_3 = gen_reg_rtx (V2DImode);
48948 res_4 = gen_reg_rtx (V2DImode);
48949 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48950 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48951
48952 /* Move the results in element 2 down to element 1; we don't care
48953 what goes in elements 2 and 3. Then we can merge the parts
48954 back together with an interleave.
48955
48956 Note that two other sequences were tried:
48957 (1) Use interleaves at the start instead of psrldq, which allows
48958 us to use a single shufps to merge things back at the end.
48959 (2) Use shufps here to combine the two vectors, then pshufd to
48960 put the elements in the correct order.
48961 In both cases the cost of the reformatting stall was too high
48962 and the overall sequence slower. */
48963
48964 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48965 const0_rtx, const2_rtx,
48966 const0_rtx, const0_rtx));
48967 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48968 const0_rtx, const2_rtx,
48969 const0_rtx, const0_rtx));
48970 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48971
48972 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48973 }
48974
48975 void
48976 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48977 {
48978 machine_mode mode = GET_MODE (op0);
48979 rtx t1, t2, t3, t4, t5, t6;
48980
48981 if (TARGET_AVX512DQ && mode == V8DImode)
48982 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48983 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48984 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48985 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48986 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48987 else if (TARGET_XOP && mode == V2DImode)
48988 {
48989 /* op1: A,B,C,D, op2: E,F,G,H */
48990 op1 = gen_lowpart (V4SImode, op1);
48991 op2 = gen_lowpart (V4SImode, op2);
48992
48993 t1 = gen_reg_rtx (V4SImode);
48994 t2 = gen_reg_rtx (V4SImode);
48995 t3 = gen_reg_rtx (V2DImode);
48996 t4 = gen_reg_rtx (V2DImode);
48997
48998 /* t1: B,A,D,C */
48999 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49000 GEN_INT (1),
49001 GEN_INT (0),
49002 GEN_INT (3),
49003 GEN_INT (2)));
49004
49005 /* t2: (B*E),(A*F),(D*G),(C*H) */
49006 emit_insn (gen_mulv4si3 (t2, t1, op2));
49007
49008 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49009 emit_insn (gen_xop_phadddq (t3, t2));
49010
49011 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49012 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49013
49014 /* Multiply lower parts and add all */
49015 t5 = gen_reg_rtx (V2DImode);
49016 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49017 gen_lowpart (V4SImode, op1),
49018 gen_lowpart (V4SImode, op2)));
49019 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49020
49021 }
49022 else
49023 {
49024 machine_mode nmode;
49025 rtx (*umul) (rtx, rtx, rtx);
49026
49027 if (mode == V2DImode)
49028 {
49029 umul = gen_vec_widen_umult_even_v4si;
49030 nmode = V4SImode;
49031 }
49032 else if (mode == V4DImode)
49033 {
49034 umul = gen_vec_widen_umult_even_v8si;
49035 nmode = V8SImode;
49036 }
49037 else if (mode == V8DImode)
49038 {
49039 umul = gen_vec_widen_umult_even_v16si;
49040 nmode = V16SImode;
49041 }
49042 else
49043 gcc_unreachable ();
49044
49045
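      /* The expansion below follows the usual decomposition, mod 2^64:
	   a * b == LO(a)*LO(b) + ((HI(a)*LO(b) + HI(b)*LO(a)) << 32).  */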
49046 /* Multiply low parts. */
49047 t1 = gen_reg_rtx (mode);
49048 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49049
49050 /* Shift input vectors right 32 bits so we can multiply high parts. */
49051 t6 = GEN_INT (32);
49052 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49053 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49054
49055 /* Multiply high parts by low parts. */
49056 t4 = gen_reg_rtx (mode);
49057 t5 = gen_reg_rtx (mode);
49058 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49059 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49060
49061 /* Combine and shift the highparts back. */
49062 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49063 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49064
49065 /* Combine high and low parts. */
49066 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49067 }
49068
49069 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49070 gen_rtx_MULT (mode, op1, op2));
49071 }
49072
49073 /* Return 1 if control transfer instruction INSN
49074 should be encoded with bnd prefix.
49075 If insn is NULL then return 1 when control
49076 transfer instructions should be prefixed with
49077 bnd by default for current function. */
49078
49079 bool
49080 ix86_bnd_prefixed_insn_p (rtx insn)
49081 {
49082 /* For call insns check special flag. */
49083 if (insn && CALL_P (insn))
49084 {
49085 rtx call = get_call_rtx_from (insn);
49086 if (call)
49087 return CALL_EXPR_WITH_BOUNDS_P (call);
49088 }
49089
49090 /* All other insns are prefixed only if function is instrumented. */
49091 return chkp_function_instrumented_p (current_function_decl);
49092 }
49093
49094 /* Calculate integer abs() using only SSE2 instructions. */
49095
49096 void
49097 ix86_expand_sse2_abs (rtx target, rtx input)
49098 {
49099 machine_mode mode = GET_MODE (target);
49100 rtx tmp0, tmp1, x;
49101
49102 switch (mode)
49103 {
49104 /* For 32-bit signed integer X, the best way to calculate the absolute
49105 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
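    /* E.g. for X == -5 the shift yields -1 (all ones), (-1 ^ -5) == 4
       and 4 - (-1) == 5.  */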
49106 case V4SImode:
49107 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49108 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49109 NULL, 0, OPTAB_DIRECT);
49110 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49111 NULL, 0, OPTAB_DIRECT);
49112 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49113 target, 0, OPTAB_DIRECT);
49114 break;
49115
49116 /* For 16-bit signed integer X, the best way to calculate the absolute
49117 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49118 case V8HImode:
49119 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49120
49121 x = expand_simple_binop (mode, SMAX, tmp0, input,
49122 target, 0, OPTAB_DIRECT);
49123 break;
49124
49125 /* For 8-bit signed integer X, the best way to calculate the absolute
49126 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49127 as SSE2 provides the PMINUB insn. */
49128 case V16QImode:
49129 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49130
49131 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49132 target, 0, OPTAB_DIRECT);
49133 break;
49134
49135 default:
49136 gcc_unreachable ();
49137 }
49138
49139 if (x != target)
49140 emit_move_insn (target, x);
49141 }
49142
49143 /* Expand an extract from a vector register through pextr insn.
49144 Return true if successful. */
49145
49146 bool
49147 ix86_expand_pextr (rtx *operands)
49148 {
49149 rtx dst = operands[0];
49150 rtx src = operands[1];
49151
49152 unsigned int size = INTVAL (operands[2]);
49153 unsigned int pos = INTVAL (operands[3]);
49154
49155 if (SUBREG_P (dst))
49156 {
49157 /* Reject non-lowpart subregs. */
49158 if (SUBREG_BYTE (dst) > 0)
49159 return false;
49160 dst = SUBREG_REG (dst);
49161 }
49162
49163 if (SUBREG_P (src))
49164 {
49165 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49166 src = SUBREG_REG (src);
49167 }
49168
49169 switch (GET_MODE (src))
49170 {
49171 case V16QImode:
49172 case V8HImode:
49173 case V4SImode:
49174 case V2DImode:
49175 case V1TImode:
49176 case TImode:
49177 {
49178 machine_mode srcmode, dstmode;
49179 rtx d, pat;
49180
49181 dstmode = mode_for_size (size, MODE_INT, 0);
49182
49183 switch (dstmode)
49184 {
49185 case QImode:
49186 if (!TARGET_SSE4_1)
49187 return false;
49188 srcmode = V16QImode;
49189 break;
49190
49191 case HImode:
49192 if (!TARGET_SSE2)
49193 return false;
49194 srcmode = V8HImode;
49195 break;
49196
49197 case SImode:
49198 if (!TARGET_SSE4_1)
49199 return false;
49200 srcmode = V4SImode;
49201 break;
49202
49203 case DImode:
49204 gcc_assert (TARGET_64BIT);
49205 if (!TARGET_SSE4_1)
49206 return false;
49207 srcmode = V2DImode;
49208 break;
49209
49210 default:
49211 return false;
49212 }
49213
49214 /* Reject extractions from misaligned positions. */
49215 if (pos & (size-1))
49216 return false;
49217
49218 if (GET_MODE (dst) == dstmode)
49219 d = dst;
49220 else
49221 d = gen_reg_rtx (dstmode);
49222
49223 /* Construct insn pattern. */
49224 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49225 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
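	/* E.g. extracting a 16-bit value at bit position 48 from a V8HImode
	   source selects vector element 48 / 16 == 3.  */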
49226
49227 /* Let the rtl optimizers know about the zero extension performed. */
49228 if (dstmode == QImode || dstmode == HImode)
49229 {
49230 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49231 d = gen_lowpart (SImode, d);
49232 }
49233
49234 emit_insn (gen_rtx_SET (d, pat));
49235
49236 if (d != dst)
49237 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49238 return true;
49239 }
49240
49241 default:
49242 return false;
49243 }
49244 }
49245
49246 /* Expand an insert into a vector register through pinsr insn.
49247 Return true if successful. */
49248
49249 bool
49250 ix86_expand_pinsr (rtx *operands)
49251 {
49252 rtx dst = operands[0];
49253 rtx src = operands[3];
49254
49255 unsigned int size = INTVAL (operands[1]);
49256 unsigned int pos = INTVAL (operands[2]);
49257
49258 if (SUBREG_P (dst))
49259 {
49260 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49261 dst = SUBREG_REG (dst);
49262 }
49263
49264 switch (GET_MODE (dst))
49265 {
49266 case V16QImode:
49267 case V8HImode:
49268 case V4SImode:
49269 case V2DImode:
49270 case V1TImode:
49271 case TImode:
49272 {
49273 machine_mode srcmode, dstmode;
49274 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49275 rtx d;
49276
49277 srcmode = mode_for_size (size, MODE_INT, 0);
49278
49279 switch (srcmode)
49280 {
49281 case QImode:
49282 if (!TARGET_SSE4_1)
49283 return false;
49284 dstmode = V16QImode;
49285 pinsr = gen_sse4_1_pinsrb;
49286 break;
49287
49288 case HImode:
49289 if (!TARGET_SSE2)
49290 return false;
49291 dstmode = V8HImode;
49292 pinsr = gen_sse2_pinsrw;
49293 break;
49294
49295 case SImode:
49296 if (!TARGET_SSE4_1)
49297 return false;
49298 dstmode = V4SImode;
49299 pinsr = gen_sse4_1_pinsrd;
49300 break;
49301
49302 case DImode:
49303 gcc_assert (TARGET_64BIT);
49304 if (!TARGET_SSE4_1)
49305 return false;
49306 dstmode = V2DImode;
49307 pinsr = gen_sse4_1_pinsrq;
49308 break;
49309
49310 default:
49311 return false;
49312 }
49313
49314 /* Reject insertions to misaligned positions. */
49315 if (pos & (size-1))
49316 return false;
49317
49318 if (SUBREG_P (src))
49319 {
49320 unsigned int srcpos = SUBREG_BYTE (src);
49321
49322 if (srcpos > 0)
49323 {
49324 rtx extr_ops[4];
49325
49326 extr_ops[0] = gen_reg_rtx (srcmode);
49327 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49328 extr_ops[2] = GEN_INT (size);
49329 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49330
49331 if (!ix86_expand_pextr (extr_ops))
49332 return false;
49333
49334 src = extr_ops[0];
49335 }
49336 else
49337 src = gen_lowpart (srcmode, SUBREG_REG (src));
49338 }
49339
49340 if (GET_MODE (dst) == dstmode)
49341 d = dst;
49342 else
49343 d = gen_reg_rtx (dstmode);
49344
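	/* The element position is passed to the pinsr expander as a one-hot
	   selector, hence the 1 << (pos / size) below.  */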
49345 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49346 gen_lowpart (srcmode, src),
49347 GEN_INT (1 << (pos / size))));
49348 if (d != dst)
49349 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49350 return true;
49351 }
49352
49353 default:
49354 return false;
49355 }
49356 }
49357 \f
49358 /* This function returns the calling abi specific va_list type node.
49359 It returns the FNDECL specific va_list type. */
49360
49361 static tree
49362 ix86_fn_abi_va_list (tree fndecl)
49363 {
49364 if (!TARGET_64BIT)
49365 return va_list_type_node;
49366 gcc_assert (fndecl != NULL_TREE);
49367
49368 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49369 return ms_va_list_type_node;
49370 else
49371 return sysv_va_list_type_node;
49372 }
49373
49374 /* Returns the canonical va_list type specified by TYPE. If there
49375 is no valid TYPE provided, it returns NULL_TREE. */
49376
49377 static tree
49378 ix86_canonical_va_list_type (tree type)
49379 {
49380 if (TARGET_64BIT)
49381 {
49382 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49383 return ms_va_list_type_node;
49384
49385 if ((TREE_CODE (type) == ARRAY_TYPE
49386 && integer_zerop (array_type_nelts (type)))
49387 || POINTER_TYPE_P (type))
49388 {
49389 tree elem_type = TREE_TYPE (type);
49390 if (TREE_CODE (elem_type) == RECORD_TYPE
49391 && lookup_attribute ("sysv_abi va_list",
49392 TYPE_ATTRIBUTES (elem_type)))
49393 return sysv_va_list_type_node;
49394 }
49395
49396 return NULL_TREE;
49397 }
49398
49399 return std_canonical_va_list_type (type);
49400 }
49401
49402 /* Iterate through the target-specific builtin types for va_list.
49403 IDX denotes the iterator, *PTREE is set to the result type of
49404 the va_list builtin, and *PNAME to its internal name.
49405 Returns zero if there is no element for this index, otherwise
49406 IDX should be increased upon the next call.
49407 Note, do not iterate a base builtin's name like __builtin_va_list.
49408 Used from c_common_nodes_and_builtins. */
49409
49410 static int
49411 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49412 {
49413 if (TARGET_64BIT)
49414 {
49415 switch (idx)
49416 {
49417 default:
49418 break;
49419
49420 case 0:
49421 *ptree = ms_va_list_type_node;
49422 *pname = "__builtin_ms_va_list";
49423 return 1;
49424
49425 case 1:
49426 *ptree = sysv_va_list_type_node;
49427 *pname = "__builtin_sysv_va_list";
49428 return 1;
49429 }
49430 }
49431
49432 return 0;
49433 }
49434
49435 #undef TARGET_SCHED_DISPATCH
49436 #define TARGET_SCHED_DISPATCH has_dispatch
49437 #undef TARGET_SCHED_DISPATCH_DO
49438 #define TARGET_SCHED_DISPATCH_DO do_dispatch
49439 #undef TARGET_SCHED_REASSOCIATION_WIDTH
49440 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
49441 #undef TARGET_SCHED_REORDER
49442 #define TARGET_SCHED_REORDER ix86_sched_reorder
49443 #undef TARGET_SCHED_ADJUST_PRIORITY
49444 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
49445 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
49446 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
49447 ix86_dependencies_evaluation_hook
49448
49449 /* The size of the dispatch window is the total number of bytes of
49450 object code allowed in a window. */
49451 #define DISPATCH_WINDOW_SIZE 16
49452
49453 /* Number of dispatch windows considered for scheduling. */
49454 #define MAX_DISPATCH_WINDOWS 3
49455
49456 /* Maximum number of instructions in a window. */
49457 #define MAX_INSN 4
49458
49459 /* Maximum number of immediate operands in a window. */
49460 #define MAX_IMM 4
49461
49462 /* Maximum number of immediate bits allowed in a window. */
49463 #define MAX_IMM_SIZE 128
49464
49465 /* Maximum number of 32 bit immediates allowed in a window. */
49466 #define MAX_IMM_32 4
49467
49468 /* Maximum number of 64 bit immediates allowed in a window. */
49469 #define MAX_IMM_64 2
49470
49471 /* Maximum total of loads or prefetches allowed in a window. */
49472 #define MAX_LOAD 2
49473
49474 /* Maximum total of stores allowed in a window. */
49475 #define MAX_STORE 1
49476
49477 #undef BIG
49478 #define BIG 100
49479
49480
49481 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
49482 enum dispatch_group {
49483 disp_no_group = 0,
49484 disp_load,
49485 disp_store,
49486 disp_load_store,
49487 disp_prefetch,
49488 disp_imm,
49489 disp_imm_32,
49490 disp_imm_64,
49491 disp_branch,
49492 disp_cmp,
49493 disp_jcc,
49494 disp_last
49495 };
49496
49497 /* Number of allowable groups in a dispatch window. It is an array
49498 indexed by dispatch_group enum. 100 is used as a big number,
49499 because the number of these kinds of operations does not have any
49500 effect on the dispatch window, but we need them for other reasons in
49501 the table. */
49502 static unsigned int num_allowable_groups[disp_last] = {
49503 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
49504 };
49505
49506 char group_name[disp_last + 1][16] = {
49507 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
49508 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
49509 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
49510 };
49511
49512 /* Instruction path. */
49513 enum insn_path {
49514 no_path = 0,
49515 path_single, /* Single micro op. */
49516 path_double, /* Double micro op. */
49517 path_multi, /* Instructions with more than 2 micro ops. */
49518 last_path
49519 };
49520
49521 /* sched_insn_info defines a window to the instructions scheduled in
49522 the basic block. It contains a pointer to the insn_info table and
49523 the instruction scheduled.
49524
49525 Windows are allocated for each basic block and are linked
49526 together. */
49527 typedef struct sched_insn_info_s {
49528 rtx insn;
49529 enum dispatch_group group;
49530 enum insn_path path;
49531 int byte_len;
49532 int imm_bytes;
49533 } sched_insn_info;
49534
49535 /* Linked list of dispatch windows. This is a two way list of
49536 dispatch windows of a basic block. It contains information about
49537 the number of uops in the window and the total number of
49538 instructions and of bytes in the object code for this dispatch
49539 window. */
49540 typedef struct dispatch_windows_s {
49541 int num_insn; /* Number of insn in the window. */
49542 int num_uops; /* Number of uops in the window. */
49543 int window_size; /* Number of bytes in the window. */
49544 int window_num; /* Window number, either 0 or 1. */
49545 int num_imm; /* Number of immediates in the window. */
49546 int num_imm_32; /* Number of 32 bit immediates in the window. */
49547 int num_imm_64; /* Number of 64 bit immediates in the window. */
49548 int imm_size; /* Total size in bytes of immediates in the window. */
49549 int num_loads; /* Total memory loads in the window. */
49550 int num_stores; /* Total memory stores in the window. */
49551 int violation; /* Violation exists in window. */
49552 sched_insn_info *window; /* Pointer to the window. */
49553 struct dispatch_windows_s *next;
49554 struct dispatch_windows_s *prev;
49555 } dispatch_windows;
49556
49557 /* Immediate values used in an insn. */
49558 typedef struct imm_info_s
49559 {
49560 int imm;
49561 int imm32;
49562 int imm64;
49563 } imm_info;
49564
49565 static dispatch_windows *dispatch_window_list;
49566 static dispatch_windows *dispatch_window_list1;
49567
49568 /* Get dispatch group of insn. */
49569
49570 static enum dispatch_group
49571 get_mem_group (rtx_insn *insn)
49572 {
49573 enum attr_memory memory;
49574
49575 if (INSN_CODE (insn) < 0)
49576 return disp_no_group;
49577 memory = get_attr_memory (insn);
49578 if (memory == MEMORY_STORE)
49579 return disp_store;
49580
49581 if (memory == MEMORY_LOAD)
49582 return disp_load;
49583
49584 if (memory == MEMORY_BOTH)
49585 return disp_load_store;
49586
49587 return disp_no_group;
49588 }
49589
49590 /* Return true if insn is a compare instruction. */
49591
49592 static bool
49593 is_cmp (rtx_insn *insn)
49594 {
49595 enum attr_type type;
49596
49597 type = get_attr_type (insn);
49598 return (type == TYPE_TEST
49599 || type == TYPE_ICMP
49600 || type == TYPE_FCMP
49601 || GET_CODE (PATTERN (insn)) == COMPARE);
49602 }
49603
49604 /* Return true if a dispatch violation encountered. */
49605
49606 static bool
49607 dispatch_violation (void)
49608 {
49609 if (dispatch_window_list->next)
49610 return dispatch_window_list->next->violation;
49611 return dispatch_window_list->violation;
49612 }
49613
49614 /* Return true if insn is a branch instruction. */
49615
49616 static bool
49617 is_branch (rtx_insn *insn)
49618 {
49619 return (CALL_P (insn) || JUMP_P (insn));
49620 }
49621
49622 /* Return true if insn is a prefetch instruction. */
49623
49624 static bool
49625 is_prefetch (rtx_insn *insn)
49626 {
49627 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
49628 }
49629
49630 /* This function initializes a dispatch window and the list container holding a
49631 pointer to the window. */
49632
49633 static void
49634 init_window (int window_num)
49635 {
49636 int i;
49637 dispatch_windows *new_list;
49638
49639 if (window_num == 0)
49640 new_list = dispatch_window_list;
49641 else
49642 new_list = dispatch_window_list1;
49643
49644 new_list->num_insn = 0;
49645 new_list->num_uops = 0;
49646 new_list->window_size = 0;
49647 new_list->next = NULL;
49648 new_list->prev = NULL;
49649 new_list->window_num = window_num;
49650 new_list->num_imm = 0;
49651 new_list->num_imm_32 = 0;
49652 new_list->num_imm_64 = 0;
49653 new_list->imm_size = 0;
49654 new_list->num_loads = 0;
49655 new_list->num_stores = 0;
49656 new_list->violation = false;
49657
49658 for (i = 0; i < MAX_INSN; i++)
49659 {
49660 new_list->window[i].insn = NULL;
49661 new_list->window[i].group = disp_no_group;
49662 new_list->window[i].path = no_path;
49663 new_list->window[i].byte_len = 0;
49664 new_list->window[i].imm_bytes = 0;
49665 }
49666 return;
49667 }
49668
49669 /* This function allocates and initializes a dispatch window and the
49670 list container holding a pointer to the window. */
49671
49672 static dispatch_windows *
49673 allocate_window (void)
49674 {
49675 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
49676 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
49677
49678 return new_list;
49679 }
49680
49681 /* This routine initializes the dispatch scheduling information. It
49682 initiates building dispatch scheduler tables and constructs the
49683 first dispatch window. */
49684
49685 static void
49686 init_dispatch_sched (void)
49687 {
49688 /* Allocate a dispatch list and a window. */
49689 dispatch_window_list = allocate_window ();
49690 dispatch_window_list1 = allocate_window ();
49691 init_window (0);
49692 init_window (1);
49693 }
49694
49695 /* This function returns true if a branch is detected. End of a basic block
49696 does not have to be a branch, but here we assume only branches end a
49697 window. */
49698
49699 static bool
49700 is_end_basic_block (enum dispatch_group group)
49701 {
49702 return group == disp_branch;
49703 }
49704
49705 /* This function is called when the end of a window processing is reached. */
49706
49707 static void
49708 process_end_window (void)
49709 {
49710 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
49711 if (dispatch_window_list->next)
49712 {
49713 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
49714 gcc_assert (dispatch_window_list->window_size
49715 + dispatch_window_list1->window_size <= 48);
49716 init_window (1);
49717 }
49718 init_window (0);
49719 }
49720
49721 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
49722 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
49723 for 48 bytes of instructions. Note that these windows are not dispatch
49724 windows of size DISPATCH_WINDOW_SIZE. */
49725
49726 static dispatch_windows *
49727 allocate_next_window (int window_num)
49728 {
49729 if (window_num == 0)
49730 {
49731 if (dispatch_window_list->next)
49732 init_window (1);
49733 init_window (0);
49734 return dispatch_window_list;
49735 }
49736
49737 dispatch_window_list->next = dispatch_window_list1;
49738 dispatch_window_list1->prev = dispatch_window_list;
49739
49740 return dispatch_window_list1;
49741 }
49742
49743 /* Compute number of immediate operands of an instruction. */
49744
49745 static void
49746 find_constant (rtx in_rtx, imm_info *imm_values)
49747 {
49748 if (INSN_P (in_rtx))
49749 in_rtx = PATTERN (in_rtx);
49750 subrtx_iterator::array_type array;
49751 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49752 if (const_rtx x = *iter)
49753 switch (GET_CODE (x))
49754 {
49755 case CONST:
49756 case SYMBOL_REF:
49757 case CONST_INT:
49758 (imm_values->imm)++;
49759 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49760 (imm_values->imm32)++;
49761 else
49762 (imm_values->imm64)++;
49763 break;
49764
49765 case CONST_DOUBLE:
49766 case CONST_WIDE_INT:
49767 (imm_values->imm)++;
49768 (imm_values->imm64)++;
49769 break;
49770
49771 case CODE_LABEL:
49772 if (LABEL_KIND (x) == LABEL_NORMAL)
49773 {
49774 (imm_values->imm)++;
49775 (imm_values->imm32)++;
49776 }
49777 break;
49778
49779 default:
49780 break;
49781 }
49782 }
49783
49784 /* Return total size of immediate operands of an instruction along with number
49785 of corresponding immediate operands. It initializes its parameters to zero
49786 before calling FIND_CONSTANT.
49787 INSN is the input instruction. IMM is the total of immediates.
49788 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49789 bit immediates. */
49790
49791 static int
49792 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49793 {
49794 imm_info imm_values = {0, 0, 0};
49795
49796 find_constant (insn, &imm_values);
49797 *imm = imm_values.imm;
49798 *imm32 = imm_values.imm32;
49799 *imm64 = imm_values.imm64;
49800 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49801 }
49802
49803 /* This function indicates whether an instruction has any
49804 immediate operands. */
49805
49806 static bool
49807 has_immediate (rtx_insn *insn)
49808 {
49809 int num_imm_operand;
49810 int num_imm32_operand;
49811 int num_imm64_operand;
49812
49813 if (insn)
49814 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49815 &num_imm64_operand);
49816 return false;
49817 }
49818
49819 /* Return single or double path for instructions. */
49820
49821 static enum insn_path
49822 get_insn_path (rtx_insn *insn)
49823 {
49824 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49825
49826 if ((int)path == 0)
49827 return path_single;
49828
49829 if ((int)path == 1)
49830 return path_double;
49831
49832 return path_multi;
49833 }
49834
49835 /* Return insn dispatch group. */
49836
49837 static enum dispatch_group
49838 get_insn_group (rtx_insn *insn)
49839 {
49840 enum dispatch_group group = get_mem_group (insn);
49841 if (group)
49842 return group;
49843
49844 if (is_branch (insn))
49845 return disp_branch;
49846
49847 if (is_cmp (insn))
49848 return disp_cmp;
49849
49850 if (has_immediate (insn))
49851 return disp_imm;
49852
49853 if (is_prefetch (insn))
49854 return disp_prefetch;
49855
49856 return disp_no_group;
49857 }
49858
49859 /* Count number of GROUP restricted instructions in a dispatch
49860 window WINDOW_LIST. */
49861
49862 static int
49863 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
49864 {
49865 enum dispatch_group group = get_insn_group (insn);
49866 int imm_size;
49867 int num_imm_operand;
49868 int num_imm32_operand;
49869 int num_imm64_operand;
49870
49871 if (group == disp_no_group)
49872 return 0;
49873
49874 if (group == disp_imm)
49875 {
49876 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49877 &num_imm64_operand);
49878 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
49879 || num_imm_operand + window_list->num_imm > MAX_IMM
49880 || (num_imm32_operand > 0
49881 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
49882 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
49883 || (num_imm64_operand > 0
49884 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
49885 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
49886 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
49887 && num_imm64_operand > 0
49888 && ((window_list->num_imm_64 > 0
49889 && window_list->num_insn >= 2)
49890 || window_list->num_insn >= 3)))
49891 return BIG;
49892
49893 return 1;
49894 }
49895
49896 if ((group == disp_load_store
49897 && (window_list->num_loads >= MAX_LOAD
49898 || window_list->num_stores >= MAX_STORE))
49899 || ((group == disp_load
49900 || group == disp_prefetch)
49901 && window_list->num_loads >= MAX_LOAD)
49902 || (group == disp_store
49903 && window_list->num_stores >= MAX_STORE))
49904 return BIG;
49905
49906 return 1;
49907 }
49908
49909 /* This function returns true if insn satisfies dispatch rules on the
49910 last window scheduled. */
49911
49912 static bool
49913 fits_dispatch_window (rtx_insn *insn)
49914 {
49915 dispatch_windows *window_list = dispatch_window_list;
49916 dispatch_windows *window_list_next = dispatch_window_list->next;
49917 unsigned int num_restrict;
49918 enum dispatch_group group = get_insn_group (insn);
49919 enum insn_path path = get_insn_path (insn);
49920 int sum;
49921
49922 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
49923 instructions should be given the lowest priority in the
49924 scheduling process in Haifa scheduler to make sure they will be
49925 scheduled in the same dispatch window as the reference to them. */
49926 if (group == disp_jcc || group == disp_cmp)
49927 return false;
49928
49929 /* Check nonrestricted. */
49930 if (group == disp_no_group || group == disp_branch)
49931 return true;
49932
49933 /* Get last dispatch window. */
49934 if (window_list_next)
49935 window_list = window_list_next;
49936
49937 if (window_list->window_num == 1)
49938 {
49939 sum = window_list->prev->window_size + window_list->window_size;
49940
49941 if (sum == 32
49942 || (min_insn_size (insn) + sum) >= 48)
49943 /* Window 1 is full. Go for next window. */
49944 return true;
49945 }
49946
49947 num_restrict = count_num_restricted (insn, window_list);
49948
49949 if (num_restrict > num_allowable_groups[group])
49950 return false;
49951
49952 /* See if it fits in the first window. */
49953 if (window_list->window_num == 0)
49954 {
49955 /* The first window should have only single and double path
49956 uops. */
49957 if (path == path_double
49958 && (window_list->num_uops + 2) > MAX_INSN)
49959 return false;
49960 else if (path != path_single)
49961 return false;
49962 }
49963 return true;
49964 }
49965
49966 /* Add an instruction INSN with NUM_UOPS micro-operations to the
49967 dispatch window WINDOW_LIST. */
49968
49969 static void
49970 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
49971 {
49972 int byte_len = min_insn_size (insn);
49973 int num_insn = window_list->num_insn;
49974 int imm_size;
49975 sched_insn_info *window = window_list->window;
49976 enum dispatch_group group = get_insn_group (insn);
49977 enum insn_path path = get_insn_path (insn);
49978 int num_imm_operand;
49979 int num_imm32_operand;
49980 int num_imm64_operand;
49981
49982 if (!window_list->violation && group != disp_cmp
49983 && !fits_dispatch_window (insn))
49984 window_list->violation = true;
49985
49986 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49987 &num_imm64_operand);
49988
49989 /* Initialize window with new instruction. */
49990 window[num_insn].insn = insn;
49991 window[num_insn].byte_len = byte_len;
49992 window[num_insn].group = group;
49993 window[num_insn].path = path;
49994 window[num_insn].imm_bytes = imm_size;
49995
49996 window_list->window_size += byte_len;
49997 window_list->num_insn = num_insn + 1;
49998 window_list->num_uops = window_list->num_uops + num_uops;
49999 window_list->imm_size += imm_size;
50000 window_list->num_imm += num_imm_operand;
50001 window_list->num_imm_32 += num_imm32_operand;
50002 window_list->num_imm_64 += num_imm64_operand;
50003
50004 if (group == disp_store)
50005 window_list->num_stores += 1;
50006 else if (group == disp_load
50007 || group == disp_prefetch)
50008 window_list->num_loads += 1;
50009 else if (group == disp_load_store)
50010 {
50011 window_list->num_stores += 1;
50012 window_list->num_loads += 1;
50013 }
50014 }
50015
50016 /* Adds a scheduled instruction, INSN, to the current dispatch window.
50017 If the total bytes of instructions or the number of instructions in
50018 the window exceeds the allowed maximum, it allocates a new window. */
50019
50020 static void
50021 add_to_dispatch_window (rtx_insn *insn)
50022 {
50023 int byte_len;
50024 dispatch_windows *window_list;
50025 dispatch_windows *next_list;
50026 dispatch_windows *window0_list;
50027 enum insn_path path;
50028 enum dispatch_group insn_group;
50029 bool insn_fits;
50030 int num_insn;
50031 int num_uops;
50032 int window_num;
50033 int insn_num_uops;
50034 int sum;
50035
50036 if (INSN_CODE (insn) < 0)
50037 return;
50038
50039 byte_len = min_insn_size (insn);
50040 window_list = dispatch_window_list;
50041 next_list = window_list->next;
50042 path = get_insn_path (insn);
50043 insn_group = get_insn_group (insn);
50044
50045 /* Get the last dispatch window. */
50046 if (next_list)
50047 window_list = dispatch_window_list->next;
50048
50049 if (path == path_single)
50050 insn_num_uops = 1;
50051 else if (path == path_double)
50052 insn_num_uops = 2;
50053 else
50054 insn_num_uops = (int) path;
50055
50056 /* If current window is full, get a new window.
50057 Window number zero is full if MAX_INSN uops are scheduled in it.
50058 Window number one is full if window zero's bytes plus window
50059 one's bytes is 32, or if adding the bytes of the new instruction
50060 makes the total 48 or more, or it already has MAX_INSN
50061 instructions in it. */
50062 num_insn = window_list->num_insn;
50063 num_uops = window_list->num_uops;
50064 window_num = window_list->window_num;
50065 insn_fits = fits_dispatch_window (insn);
50066
50067 if (num_insn >= MAX_INSN
50068 || num_uops + insn_num_uops > MAX_INSN
50069 || !(insn_fits))
50070 {
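      /* The current window is full or the insn does not fit; toggle to the
	 other window (0 <-> 1) and allocate it.  */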
50071 window_num = ~window_num & 1;
50072 window_list = allocate_next_window (window_num);
50073 }
50074
50075 if (window_num == 0)
50076 {
50077 add_insn_window (insn, window_list, insn_num_uops);
50078 if (window_list->num_insn >= MAX_INSN
50079 && insn_group == disp_branch)
50080 {
50081 process_end_window ();
50082 return;
50083 }
50084 }
50085 else if (window_num == 1)
50086 {
50087 window0_list = window_list->prev;
50088 sum = window0_list->window_size + window_list->window_size;
50089 if (sum == 32
50090 || (byte_len + sum) >= 48)
50091 {
50092 process_end_window ();
50093 window_list = dispatch_window_list;
50094 }
50095
50096 add_insn_window (insn, window_list, insn_num_uops);
50097 }
50098 else
50099 gcc_unreachable ();
50100
50101 if (is_end_basic_block (insn_group))
50102 {
50103 /* End of basic block is reached; do end-of-basic-block processing. */
50104 process_end_window ();
50105 return;
50106 }
50107 }
50108
50109 /* Print the dispatch window, WINDOW_NUM, to FILE. */
50110
50111 DEBUG_FUNCTION static void
50112 debug_dispatch_window_file (FILE *file, int window_num)
50113 {
50114 dispatch_windows *list;
50115 int i;
50116
50117 if (window_num == 0)
50118 list = dispatch_window_list;
50119 else
50120 list = dispatch_window_list1;
50121
50122 fprintf (file, "Window #%d:\n", list->window_num);
50123 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
50124 list->num_insn, list->num_uops, list->window_size);
50125 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50126 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
50127
50128 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
50129 list->num_stores);
50130 fprintf (file, " insn info:\n");
50131
50132 for (i = 0; i < MAX_INSN; i++)
50133 {
50134 if (!list->window[i].insn)
50135 break;
50136 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
50137 i, group_name[list->window[i].group],
50138 i, (void *)list->window[i].insn,
50139 i, list->window[i].path,
50140 i, list->window[i].byte_len,
50141 i, list->window[i].imm_bytes);
50142 }
50143 }
50144
50145 /* Print to stdout a dispatch window. */
50146
50147 DEBUG_FUNCTION void
50148 debug_dispatch_window (int window_num)
50149 {
50150 debug_dispatch_window_file (stdout, window_num);
50151 }
50152
50153 /* Print INSN dispatch information to FILE. */
50154
50155 DEBUG_FUNCTION static void
50156 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
50157 {
50158 int byte_len;
50159 enum insn_path path;
50160 enum dispatch_group group;
50161 int imm_size;
50162 int num_imm_operand;
50163 int num_imm32_operand;
50164 int num_imm64_operand;
50165
50166 if (INSN_CODE (insn) < 0)
50167 return;
50168
50169 byte_len = min_insn_size (insn);
50170 path = get_insn_path (insn);
50171 group = get_insn_group (insn);
50172 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50173 &num_imm64_operand);
50174
50175 fprintf (file, " insn info:\n");
50176 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
50177 group_name[group], path, byte_len);
50178 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50179 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
50180 }
50181
50182 /* Print to STDERR the status of the ready list with respect to
50183 dispatch windows. */
50184
50185 DEBUG_FUNCTION void
50186 debug_ready_dispatch (void)
50187 {
50188 int i;
50189 int no_ready = number_in_ready ();
50190
50191 fprintf (stdout, "Number of ready: %d\n", no_ready);
50192
50193 for (i = 0; i < no_ready; i++)
50194 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
50195 }
50196
50197 /* This routine is the driver of the dispatch scheduler. */
50198
50199 static void
50200 do_dispatch (rtx_insn *insn, int mode)
50201 {
50202 if (mode == DISPATCH_INIT)
50203 init_dispatch_sched ();
50204 else if (mode == ADD_TO_DISPATCH_WINDOW)
50205 add_to_dispatch_window (insn);
50206 }
50207
50208 /* Return TRUE if Dispatch Scheduling is supported. */
50209
50210 static bool
50211 has_dispatch (rtx_insn *insn, int action)
50212 {
50213 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
50214 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
50215 switch (action)
50216 {
50217 default:
50218 return false;
50219
50220 case IS_DISPATCH_ON:
50221 return true;
50222
50223 case IS_CMP:
50224 return is_cmp (insn);
50225
50226 case DISPATCH_VIOLATION:
50227 return dispatch_violation ();
50228
50229 case FITS_DISPATCH_WINDOW:
50230 return fits_dispatch_window (insn);
50231 }
50232
50233 return false;
50234 }
50235
50236 /* Implementation of reassociation_width target hook used by
50237 reassoc phase to identify parallelism level in reassociated
50238 tree. Statements tree_code is passed in OPC. Arguments type
50239 is passed in MODE.
50240
50241 Currently parallel reassociation is enabled for Atom
50242 processors only and we set reassociation width to be 2
50243 because Atom may issue up to 2 instructions per cycle.
50244
50245 Return value should be fixed if parallel reassociation is
50246 enabled for other processors. */
50247
50248 static int
50249 ix86_reassociation_width (unsigned int, machine_mode mode)
50250 {
50251 /* Vector part. */
50252 if (VECTOR_MODE_P (mode))
50253 {
50254 if (TARGET_VECTOR_PARALLEL_EXECUTION)
50255 return 2;
50256 else
50257 return 1;
50258 }
50259
50260 /* Scalar part. */
50261 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
50262 return 2;
50263 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
50264 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
50265 else
50266 return 1;
50267 }
50268
50269 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50270 place emms and femms instructions. */
50271
50272 static machine_mode
50273 ix86_preferred_simd_mode (machine_mode mode)
50274 {
50275 if (!TARGET_SSE)
50276 return word_mode;
50277
50278 switch (mode)
50279 {
50280 case QImode:
50281 return TARGET_AVX512BW ? V64QImode :
50282 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
50283 case HImode:
50284 return TARGET_AVX512BW ? V32HImode :
50285 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
50286 case SImode:
50287 return TARGET_AVX512F ? V16SImode :
50288 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
50289 case DImode:
50290 return TARGET_AVX512F ? V8DImode :
50291 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
50292
50293 case SFmode:
50294 if (TARGET_AVX512F)
50295 return V16SFmode;
50296 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50297 return V8SFmode;
50298 else
50299 return V4SFmode;
50300
50301 case DFmode:
50302 if (TARGET_AVX512F)
50303 return V8DFmode;
50304 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50305 return V4DFmode;
50306 else if (TARGET_SSE2)
50307 return V2DFmode;
50308 /* FALLTHRU */
50309
50310 default:
50311 return word_mode;
50312 }
50313 }
50314
50315 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50316 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50317 256bit and 128bit vectors. */
50318
50319 static unsigned int
50320 ix86_autovectorize_vector_sizes (void)
50321 {
50322 return TARGET_AVX512F ? 64 | 32 | 16 :
50323 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
50324 }
50325
50326 /* Implementation of targetm.vectorize.get_mask_mode. */
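/* For example: a 64-byte vector of 16 SImode elements with AVX512F gets a
   16-bit scalar mask (HImode), while a 16-byte V16QImode vector without
   AVX512VL falls back to a V16QImode vector mask.  */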
50327
50328 static machine_mode
50329 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
50330 {
50331 unsigned elem_size = vector_size / nunits;
50332
50333 /* Scalar mask case. */
50334 if ((TARGET_AVX512F && vector_size == 64)
50335 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50336 {
50337 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50338 return smallest_mode_for_size (nunits, MODE_INT);
50339 }
50340
50341 machine_mode elem_mode
50342 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
50343
50344 gcc_assert (elem_size * nunits == vector_size);
50345
50346 return mode_for_vector (elem_mode, nunits);
50347 }
50348
50349 \f
50350
50351 /* Return class of registers which could be used for pseudo of MODE
50352 and of class RCLASS for spilling instead of memory. Return NO_REGS
50353 if it is not possible or non-profitable. */
50354
50355 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50356
50357 static reg_class_t
50358 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50359 {
50360 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50361 && TARGET_SSE2
50362 && TARGET_INTER_UNIT_MOVES_TO_VEC
50363 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50364 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50365 && INTEGER_CLASS_P (rclass))
50366 return ALL_SSE_REGS;
50367 return NO_REGS;
50368 }
50369
50370 /* Implement targetm.vectorize.init_cost. */
50371
50372 static void *
50373 ix86_init_cost (struct loop *)
50374 {
50375 unsigned *cost = XNEWVEC (unsigned, 3);
50376 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50377 return cost;
50378 }
50379
50380 /* Implement targetm.vectorize.add_stmt_cost. */
50381
50382 static unsigned
50383 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50384 struct _stmt_vec_info *stmt_info, int misalign,
50385 enum vect_cost_model_location where)
50386 {
50387 unsigned *cost = (unsigned *) data;
50388 unsigned retval = 0;
50389
50390 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50391 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50392
50393 /* Penalize DFmode vector operations for Bonnell. */
50394 if (TARGET_BONNELL && kind == vector_stmt
50395 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50396 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50397
50398 /* Statements in an inner loop relative to the loop being
50399 vectorized are weighted more heavily. The value here is
50400 arbitrary and could potentially be improved with analysis. */
50401 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50402 count *= 50; /* FIXME. */
50403
50404 retval = (unsigned) (count * stmt_cost);
50405
50406 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
50407 for Silvermont as it has an out-of-order integer pipeline and can execute
50408 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50409 if ((TARGET_SILVERMONT || TARGET_INTEL)
50410 && stmt_info && stmt_info->stmt)
50411 {
50412 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50413 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50414 retval = (retval * 17) / 10;
50415 }
50416
50417 cost[where] += retval;
50418
50419 return retval;
50420 }
50421
50422 /* Implement targetm.vectorize.finish_cost. */
50423
50424 static void
50425 ix86_finish_cost (void *data, unsigned *prologue_cost,
50426 unsigned *body_cost, unsigned *epilogue_cost)
50427 {
50428 unsigned *cost = (unsigned *) data;
50429 *prologue_cost = cost[vect_prologue];
50430 *body_cost = cost[vect_body];
50431 *epilogue_cost = cost[vect_epilogue];
50432 }
50433
50434 /* Implement targetm.vectorize.destroy_cost_data. */
50435
50436 static void
50437 ix86_destroy_cost_data (void *data)
50438 {
50439 free (data);
50440 }
50441
50442 /* Validate target specific memory model bits in VAL. */
50443
50444 static unsigned HOST_WIDE_INT
50445 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50446 {
50447 enum memmodel model = memmodel_from_int (val);
50448 bool strong;
50449
50450 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50451 |MEMMODEL_MASK)
50452 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50453 {
50454 warning (OPT_Winvalid_memory_model,
50455 "Unknown architecture specific memory model");
50456 return MEMMODEL_SEQ_CST;
50457 }
50458 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50459 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50460 {
50461 warning (OPT_Winvalid_memory_model,
50462 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50463 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50464 }
50465 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50466 {
50467 warning (OPT_Winvalid_memory_model,
50468 "HLE_RELEASE not used with RELEASE or stronger memory model");
50469 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50470 }
50471 return val;
50472 }
50473
50474 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50475 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50476 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50477 or number of vecsize_mangle variants that should be emitted. */
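/* As an illustration (a sketch based on the vector function ABI, not a
   guarantee of exact output): for a simd clone of "int foo (int)", the
   mangle letters chosen below roughly correspond to clones such as
   _ZGVbN4_foo (SSE2, 128-bit), _ZGVcN4_foo (AVX), _ZGVdN8_foo (AVX2,
   256-bit) and _ZGVeN16_foo (AVX512F, 512-bit).  */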
50478
50479 static int
50480 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50481 struct cgraph_simd_clone *clonei,
50482 tree base_type, int num)
50483 {
50484 int ret = 1;
50485
50486 if (clonei->simdlen
50487 && (clonei->simdlen < 2
50488 || clonei->simdlen > 1024
50489 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50490 {
50491 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50492 "unsupported simdlen %d", clonei->simdlen);
50493 return 0;
50494 }
50495
50496 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50497 if (TREE_CODE (ret_type) != VOID_TYPE)
50498 switch (TYPE_MODE (ret_type))
50499 {
50500 case QImode:
50501 case HImode:
50502 case SImode:
50503 case DImode:
50504 case SFmode:
50505 case DFmode:
50506 /* case SCmode: */
50507 /* case DCmode: */
50508 break;
50509 default:
50510 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50511 "unsupported return type %qT for simd\n", ret_type);
50512 return 0;
50513 }
50514
50515 tree t;
50516 int i;
50517
50518 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50519 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50520 switch (TYPE_MODE (TREE_TYPE (t)))
50521 {
50522 case QImode:
50523 case HImode:
50524 case SImode:
50525 case DImode:
50526 case SFmode:
50527 case DFmode:
50528 /* case SCmode: */
50529 /* case DCmode: */
50530 break;
50531 default:
50532 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50533 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
50534 return 0;
50535 }
50536
50537 if (clonei->cilk_elemental)
50538 {
50539 /* Parse the processor clause here. If not present, default to 'b'. */
50540 clonei->vecsize_mangle = 'b';
50541 }
50542 else if (!TREE_PUBLIC (node->decl))
50543 {
50544 /* If the function isn't exported, we can pick just one ISA
50545 for the clones. */
50546 if (TARGET_AVX512F)
50547 clonei->vecsize_mangle = 'e';
50548 else if (TARGET_AVX2)
50549 clonei->vecsize_mangle = 'd';
50550 else if (TARGET_AVX)
50551 clonei->vecsize_mangle = 'c';
50552 else
50553 clonei->vecsize_mangle = 'b';
50554 ret = 1;
50555 }
50556 else
50557 {
50558 clonei->vecsize_mangle = "bcde"[num];
50559 ret = 4;
50560 }
50561 clonei->mask_mode = VOIDmode;
50562 switch (clonei->vecsize_mangle)
50563 {
50564 case 'b':
50565 clonei->vecsize_int = 128;
50566 clonei->vecsize_float = 128;
50567 break;
50568 case 'c':
50569 clonei->vecsize_int = 128;
50570 clonei->vecsize_float = 256;
50571 break;
50572 case 'd':
50573 clonei->vecsize_int = 256;
50574 clonei->vecsize_float = 256;
50575 break;
50576 case 'e':
50577 clonei->vecsize_int = 512;
50578 clonei->vecsize_float = 512;
50579 if (TYPE_MODE (base_type) == QImode)
50580 clonei->mask_mode = DImode;
50581 else
50582 clonei->mask_mode = SImode;
50583 break;
50584 }
50585 if (clonei->simdlen == 0)
50586 {
50587 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50588 clonei->simdlen = clonei->vecsize_int;
50589 else
50590 clonei->simdlen = clonei->vecsize_float;
50591 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50592 }
50593 else if (clonei->simdlen > 16)
50594 {
50595 /* For compatibility with ICC, use the same upper bounds
50596 for simdlen. In particular, for CTYPE below, use the return type,
50597 unless the function returns void, in which case use the
50598 characteristic type. If it is possible for the given SIMDLEN
50599 to pass a CTYPE value in registers (8 [XYZ]MM* regs for 32-bit
50600 code, 16 [XYZ]MM* regs for 64-bit code), accept that SIMDLEN,
50601 otherwise warn and don't emit the corresponding clone. */
50602 tree ctype = ret_type;
50603 if (TREE_CODE (ret_type) == VOID_TYPE)
50604 ctype = base_type;
50605 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50606 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50607 cnt /= clonei->vecsize_int;
50608 else
50609 cnt /= clonei->vecsize_float;
50610 if (cnt > (TARGET_64BIT ? 16 : 8))
50611 {
50612 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50613 "unsupported simdlen %d", clonei->simdlen);
50614 return 0;
50615 }
50616 }
50617 return ret;
50618 }
50619
50620 /* Add target attribute to SIMD clone NODE if needed. */
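/* E.g. the 'd' (AVX2) clone of a function compiled without -mavx2 is
   given the equivalent of __attribute__ ((target ("avx2"))) via the
   calls below, so the clone body is compiled with the ISA its mangling
   promises.  */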
50621
50622 static void
50623 ix86_simd_clone_adjust (struct cgraph_node *node)
50624 {
50625 const char *str = NULL;
50626 gcc_assert (node->decl == cfun->decl);
50627 switch (node->simdclone->vecsize_mangle)
50628 {
50629 case 'b':
50630 if (!TARGET_SSE2)
50631 str = "sse2";
50632 break;
50633 case 'c':
50634 if (!TARGET_AVX)
50635 str = "avx";
50636 break;
50637 case 'd':
50638 if (!TARGET_AVX2)
50639 str = "avx2";
50640 break;
50641 case 'e':
50642 if (!TARGET_AVX512F)
50643 str = "avx512f";
50644 break;
50645 default:
50646 gcc_unreachable ();
50647 }
50648 if (str == NULL)
50649 return;
50650 push_cfun (NULL);
50651 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50652 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50653 gcc_assert (ok);
50654 pop_cfun ();
50655 ix86_reset_previous_fndecl ();
50656 ix86_set_current_function (node->decl);
50657 }
50658
50659 /* If SIMD clone NODE can't be used in a vectorized loop
50660 in the current function, return -1; otherwise return the badness of
50661 using it (0 if it is the most desirable from the vecsize_mangle point
50662 of view, 1 if slightly less desirable, etc.). */
50663
50664 static int
50665 ix86_simd_clone_usable (struct cgraph_node *node)
50666 {
50667 switch (node->simdclone->vecsize_mangle)
50668 {
50669 case 'b':
50670 if (!TARGET_SSE2)
50671 return -1;
50672 if (!TARGET_AVX)
50673 return 0;
50674 return TARGET_AVX2 ? 2 : 1;
50675 case 'c':
50676 if (!TARGET_AVX)
50677 return -1;
50678 return TARGET_AVX2 ? 1 : 0;
50679 case 'd':
50680 if (!TARGET_AVX2)
50681 return -1;
50682 return 0;
50683 case 'e':
50684 if (!TARGET_AVX512F)
50685 return -1;
50686 return 0;
50687 default:
50688 gcc_unreachable ();
50689 }
50690 }
50691
50692 /* This function adjusts the unroll factor based on
50693 the hardware capabilities. For example, bdver3 has
50694 a loop buffer which makes unrolling smaller
50695 loops less important. This function decides the
50696 unroll factor using the number of memory references
50697 (the value 32 is used) as a heuristic. */
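/* For instance, a loop body containing 8 counted memory references
   would get an unroll factor of 32 / 8 = 4 on these targets.  */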
50698
50699 static unsigned
50700 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50701 {
50702 basic_block *bbs;
50703 rtx_insn *insn;
50704 unsigned i;
50705 unsigned mem_count = 0;
50706
50707 if (!TARGET_ADJUST_UNROLL)
50708 return nunroll;
50709
50710 /* Count the number of memory references within the loop body.
50711 This value determines the unrolling factor for bdver3 and bdver4
50712 architectures. */
50713 subrtx_iterator::array_type array;
50714 bbs = get_loop_body (loop);
50715 for (i = 0; i < loop->num_nodes; i++)
50716 FOR_BB_INSNS (bbs[i], insn)
50717 if (NONDEBUG_INSN_P (insn))
50718 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50719 if (const_rtx x = *iter)
50720 if (MEM_P (x))
50721 {
50722 machine_mode mode = GET_MODE (x);
50723 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50724 if (n_words > 4)
50725 mem_count += 2;
50726 else
50727 mem_count += 1;
50728 }
50729 free (bbs);
50730
50731 if (mem_count && mem_count <= 32)
50732 return 32 / mem_count;
50733
50734 return nunroll;
50735 }
50736
50737
50738 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50739
50740 static bool
50741 ix86_float_exceptions_rounding_supported_p (void)
50742 {
50743 /* For x87 floating point with standard excess precision handling,
50744 there is no adddf3 pattern (since x87 floating point only has
50745 XFmode operations) so the default hook implementation gets this
50746 wrong. */
50747 return TARGET_80387 || TARGET_SSE_MATH;
50748 }
50749
50750 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
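/* Roughly speaking (a simplified sketch of the trees built below), the
   expansion behaves like:

     hold:    fnstenv (&fenv); fnclex ();                      // x87
              mxcsr_orig = stmxcsr ();                         // SSE
              ldmxcsr ((mxcsr_orig | 0x1f80) & 0xffffffc0);    // mask all,
                                                               // clear flags
     clear:   fnclex (); ldmxcsr (mxcsr_mod);
     update:  exceptions = fnstsw () | stmxcsr ();
              fldenv (&fenv); ldmxcsr (mxcsr_orig);
              __atomic_feraiseexcept (exceptions);  */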
50751
50752 static void
50753 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50754 {
50755 if (!TARGET_80387 && !TARGET_SSE_MATH)
50756 return;
50757 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50758 if (TARGET_80387)
50759 {
50760 tree fenv_index_type = build_index_type (size_int (6));
50761 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50762 tree fenv_var = create_tmp_var_raw (fenv_type);
50763 TREE_ADDRESSABLE (fenv_var) = 1;
50764 tree fenv_ptr = build_pointer_type (fenv_type);
50765 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50766 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50767 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50768 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50769 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50770 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50771 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50772 tree hold_fnclex = build_call_expr (fnclex, 0);
50773 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50774 NULL_TREE, NULL_TREE);
50775 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50776 hold_fnclex);
50777 *clear = build_call_expr (fnclex, 0);
50778 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50779 tree fnstsw_call = build_call_expr (fnstsw, 0);
50780 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50781 sw_var, fnstsw_call);
50782 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50783 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50784 exceptions_var, exceptions_x87);
50785 *update = build2 (COMPOUND_EXPR, integer_type_node,
50786 sw_mod, update_mod);
50787 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50788 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50789 }
50790 if (TARGET_SSE_MATH)
50791 {
50792 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50793 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50794 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50795 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50796 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50797 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50798 mxcsr_orig_var, stmxcsr_hold_call);
50799 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50800 mxcsr_orig_var,
50801 build_int_cst (unsigned_type_node, 0x1f80));
50802 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50803 build_int_cst (unsigned_type_node, 0xffffffc0));
50804 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50805 mxcsr_mod_var, hold_mod_val);
50806 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50807 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50808 hold_assign_orig, hold_assign_mod);
50809 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50810 ldmxcsr_hold_call);
50811 if (*hold)
50812 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50813 else
50814 *hold = hold_all;
50815 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50816 if (*clear)
50817 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50818 ldmxcsr_clear_call);
50819 else
50820 *clear = ldmxcsr_clear_call;
50821 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50822 tree exceptions_sse = fold_convert (integer_type_node,
50823 stxmcsr_update_call);
50824 if (*update)
50825 {
50826 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50827 exceptions_var, exceptions_sse);
50828 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50829 exceptions_var, exceptions_mod);
50830 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50831 exceptions_assign);
50832 }
50833 else
50834 *update = build2 (MODIFY_EXPR, integer_type_node,
50835 exceptions_var, exceptions_sse);
50836 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50837 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50838 ldmxcsr_update_call);
50839 }
50840 tree atomic_feraiseexcept
50841 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50842 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50843 1, exceptions_var);
50844 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50845 atomic_feraiseexcept_call);
50846 }
50847
50848 /* Return mode to be used for bounds or VOIDmode
50849 if bounds are not supported. */
50850
50851 static enum machine_mode
50852 ix86_mpx_bound_mode ()
50853 {
50854 /* Do not support pointer checker if MPX
50855 is not enabled. */
50856 if (!TARGET_MPX)
50857 {
50858 if (flag_check_pointer_bounds)
50859 warning (0, "Pointer Checker requires MPX support on this target."
50860 " Use the -mmpx option to enable MPX.");
50861 return VOIDmode;
50862 }
50863
50864 return BNDmode;
50865 }
50866
50867 /* Return constant used to statically initialize constant bounds.
50868
50869 This function is used to create special bound values. For now
50870 only INIT bounds and NONE bounds are expected. More special
50871 values may be added later. */
50872
50873 static tree
50874 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50875 {
50876 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50877 : build_zero_cst (pointer_sized_int_node);
50878 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50879 : build_minus_one_cst (pointer_sized_int_node);
50880
50881 /* This function is supposed to be used to create INIT and
50882 NONE bounds only. */
50883 gcc_assert ((lb == 0 && ub == -1)
50884 || (lb == -1 && ub == 0));
50885
50886 return build_complex (NULL, low, high);
50887 }
50888
50889 /* Generate a list of statements STMTS to initialize pointer bounds
50890 variable VAR with bounds LB and UB. Return the number of generated
50891 statements. */
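/* A sketch of the two statements generated (C-like notation, not the
   literal trees):

     ((uintptr_t *) &VAR)[0] = LB;
     ((uintptr_t *) &VAR)[1] = ~UB;

   i.e. the upper bound is stored in one's complement form.  */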
50892
50893 static int
50894 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50895 {
50896 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50897 tree lhs, modify, var_p;
50898
50899 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50900 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50901
50902 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50903 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50904 append_to_statement_list (modify, stmts);
50905
50906 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50907 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50908 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50909 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50910 append_to_statement_list (modify, stmts);
50911
50912 return 2;
50913 }
50914
50915 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50916 /* For i386, a common symbol is local only in non-PIE binaries. For
50917 x86-64, a common symbol is local only in non-PIE binaries or when the
50918 linker supports copy relocations in PIE binaries. */
50919
50920 static bool
50921 ix86_binds_local_p (const_tree exp)
50922 {
50923 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50924 (!flag_pic
50925 || (TARGET_64BIT
50926 && HAVE_LD_PIE_COPYRELOC != 0)));
50927 }
50928 #endif
50929
50930 /* If MEM is in the form of [base+offset], extract the two parts
50931 of address and set to BASE and OFFSET, otherwise return false. */
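/* For example, an address of the form (plus (reg R) (const_int 8))
   yields *BASE = (reg R) and *OFFSET = (const_int 8), while a bare
   (reg R) or (symbol_ref S) yields that rtx with a zero offset.  */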
50932
50933 static bool
50934 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50935 {
50936 rtx addr;
50937
50938 gcc_assert (MEM_P (mem));
50939
50940 addr = XEXP (mem, 0);
50941
50942 if (GET_CODE (addr) == CONST)
50943 addr = XEXP (addr, 0);
50944
50945 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50946 {
50947 *base = addr;
50948 *offset = const0_rtx;
50949 return true;
50950 }
50951
50952 if (GET_CODE (addr) == PLUS
50953 && (REG_P (XEXP (addr, 0))
50954 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50955 && CONST_INT_P (XEXP (addr, 1)))
50956 {
50957 *base = XEXP (addr, 0);
50958 *offset = XEXP (addr, 1);
50959 return true;
50960 }
50961
50962 return false;
50963 }
50964
50965 /* Given OPERANDS of consecutive load/store, check if we can merge
50966 them into move multiple. LOAD is true if they are load instructions.
50967 MODE is the mode of memory operands. */
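/* For instance (an illustrative case), two DImode loads from
   [base + 0] and [base + 8] with the same base register, where both
   destination operands name the same register, satisfy these checks.  */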
50968
50969 bool
50970 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50971 enum machine_mode mode)
50972 {
50973 HOST_WIDE_INT offval_1, offval_2, msize;
50974 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50975
50976 if (load)
50977 {
50978 mem_1 = operands[1];
50979 mem_2 = operands[3];
50980 reg_1 = operands[0];
50981 reg_2 = operands[2];
50982 }
50983 else
50984 {
50985 mem_1 = operands[0];
50986 mem_2 = operands[2];
50987 reg_1 = operands[1];
50988 reg_2 = operands[3];
50989 }
50990
50991 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50992
50993 if (REGNO (reg_1) != REGNO (reg_2))
50994 return false;
50995
50996 /* Check if the addresses are in the form of [base+offset]. */
50997 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50998 return false;
50999 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51000 return false;
51001
51002 /* Check if the bases are the same. */
51003 if (!rtx_equal_p (base_1, base_2))
51004 return false;
51005
51006 offval_1 = INTVAL (offset_1);
51007 offval_2 = INTVAL (offset_2);
51008 msize = GET_MODE_SIZE (mode);
51009 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
51010 if (offval_1 + msize != offval_2)
51011 return false;
51012
51013 return true;
51014 }
51015
51016 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51017
51018 static bool
51019 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51020 optimization_type opt_type)
51021 {
51022 switch (op)
51023 {
51024 case asin_optab:
51025 case acos_optab:
51026 case log1p_optab:
51027 case exp_optab:
51028 case exp10_optab:
51029 case exp2_optab:
51030 case expm1_optab:
51031 case ldexp_optab:
51032 case scalb_optab:
51033 case round_optab:
51034 return opt_type == OPTIMIZE_FOR_SPEED;
51035
51036 case rint_optab:
51037 if (SSE_FLOAT_MODE_P (mode1)
51038 && TARGET_SSE_MATH
51039 && !flag_trapping_math
51040 && !TARGET_ROUND)
51041 return opt_type == OPTIMIZE_FOR_SPEED;
51042 return true;
51043
51044 case floor_optab:
51045 case ceil_optab:
51046 case btrunc_optab:
51047 if (SSE_FLOAT_MODE_P (mode1)
51048 && TARGET_SSE_MATH
51049 && !flag_trapping_math
51050 && TARGET_ROUND)
51051 return true;
51052 return opt_type == OPTIMIZE_FOR_SPEED;
51053
51054 case rsqrt_optab:
51055 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51056
51057 default:
51058 return true;
51059 }
51060 }
51061
51062 /* Address space support.
51063
51064 This is not "far pointers" in the 16-bit sense, but an easy way
51065 to use %fs and %gs segment prefixes. Therefore:
51066
51067 (a) All address spaces have the same modes,
51068 (b) All address spaces have the same address forms,
51069 (c) While %fs and %gs are technically subsets of the generic
51070 address space, they are probably not subsets of each other.
51071 (d) Since we have no access to the segment base register values
51072 without resorting to a system call, we cannot convert a
51073 non-default address space to a default address space.
51074 Therefore we do not claim %fs or %gs are subsets of generic.
51075
51076 Therefore we can (mostly) use the default hooks. */
51077
51078 /* All use of segmentation is assumed to make address 0 valid. */
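/* For example (assuming the usual GNU/Linux TLS layout), the %fs or %gs
   segment base points at the thread control block, whose first word is
   typically a self-pointer, so code using the __seg_fs / __seg_gs named
   address spaces may legitimately dereference address 0 and must not be
   optimized as if that access trapped.  */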
51079
51080 static bool
51081 ix86_addr_space_zero_address_valid (addr_space_t as)
51082 {
51083 return as != ADDR_SPACE_GENERIC;
51084 }
51085
51086 static void
51087 ix86_init_libfuncs (void)
51088 {
51089 if (TARGET_64BIT)
51090 {
51091 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51092 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51093 }
51094 else
51095 {
51096 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51097 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51098 }
51099
51100 #if TARGET_MACHO
51101 darwin_rename_builtins ();
51102 #endif
51103 }
51104
51105 /* Generate a call to a divmod libfunc such as __divmoddi4. */
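/* The libgcc helpers have roughly this shape (shown for DImode; the
   TImode variants are analogous):

     long long __divmoddi4 (long long a, long long b, long long *rem);

   returning the quotient and storing the remainder through REM, which is
   why a stack slot is passed by address below.  */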
51106
51107 static void
51108 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51109 rtx op0, rtx op1,
51110 rtx *quot_p, rtx *rem_p)
51111 {
51112 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51113
51114 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51115 mode, 3,
51116 op0, GET_MODE (op0),
51117 op1, GET_MODE (op1),
51118 XEXP (rem, 0), Pmode);
51119 *quot_p = quot;
51120 *rem_p = rem;
51121 }
51122
51123 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51124 FPU, assume that the fpcw is set to extended precision; when using
51125 only SSE, rounding is correct; when using both SSE and the FPU,
51126 the rounding precision is indeterminate, since either may be chosen
51127 apparently at random. */
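/* In rough summary (restating the logic below): without x87 the answer
   is FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; with x87-only math it is
   FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE; with SSE2 math it is
   FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; and with mixed x87/SSE math the
   result is unpredictable unless a standards-compliant mode forces
   FLT_EVAL_METHOD_PROMOTE_TO_FLOAT.  */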
51128
51129 static enum flt_eval_method
51130 ix86_excess_precision (enum excess_precision_type type)
51131 {
51132 switch (type)
51133 {
51134 case EXCESS_PRECISION_TYPE_FAST:
51135 /* The fastest type to promote to will always be the native type,
51136 whether that occurs with implicit excess precision or
51137 otherwise. */
51138 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51139 case EXCESS_PRECISION_TYPE_STANDARD:
51140 case EXCESS_PRECISION_TYPE_IMPLICIT:
51141 /* Otherwise, the excess precision we want when we are
51142 in a standards compliant mode, and the implicit precision we
51143 provide would be identical were it not for the unpredictable
51144 cases. */
51145 if (!TARGET_80387)
51146 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51147 else if (!TARGET_MIX_SSE_I387)
51148 {
51149 if (!TARGET_SSE_MATH)
51150 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51151 else if (TARGET_SSE2)
51152 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51153 }
51154
51155 /* If we are in standards compliant mode, but we know we will
51156 calculate in unpredictable precision, return
51157 FLT_EVAL_METHOD_PROMOTE_TO_FLOAT. There is no reason to introduce explicit
51158 excess precision if the target can't guarantee it will honor
51159 it. */
51160 return (type == EXCESS_PRECISION_TYPE_STANDARD
51161 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51162 : FLT_EVAL_METHOD_UNPREDICTABLE);
51163 default:
51164 gcc_unreachable ();
51165 }
51166
51167 return FLT_EVAL_METHOD_UNPREDICTABLE;
51168 }
51169
51170 /* Target-specific selftests. */
51171
51172 #if CHECKING_P
51173
51174 namespace selftest {
51175
51176 /* Verify that hard regs are dumped as expected (in compact mode). */
51177
51178 static void
51179 ix86_test_dumping_hard_regs ()
51180 {
51181 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51182 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51183 }
51184
51185 /* Test dumping an insn with repeated references to the same SCRATCH,
51186 to verify the rtx_reuse code. */
51187
51188 static void
51189 ix86_test_dumping_memory_blockage ()
51190 {
51191 set_new_first_and_last_insn (NULL, NULL);
51192
51193 rtx pat = gen_memory_blockage ();
51194 rtx_reuse_manager r;
51195 r.preprocess (pat);
51196
51197 /* Verify that the repeated references to the SCRATCH show the use of
51198 reuse IDs. The first should be prefixed with a reuse ID,
51199 and the second should be dumped as a "reuse_rtx" of that ID.
51200 The expected string assumes Pmode == DImode. */
51201 if (Pmode == DImode)
51202 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51203 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51204 " (unspec:BLK [\n"
51205 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51206 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51207 }
51208
51209 /* Verify loading an RTL dump; specifically a dump of copying
51210 a param on x86_64 from a hard reg into the frame.
51211 This test is target-specific since the dump contains target-specific
51212 hard reg names. */
51213
51214 static void
51215 ix86_test_loading_dump_fragment_1 ()
51216 {
51217 rtl_dump_test t (SELFTEST_LOCATION,
51218 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51219
51220 rtx_insn *insn = get_insn_by_uid (1);
51221
51222 /* The block structure and indentation here is purely for
51223 readability; it mirrors the structure of the rtx. */
51224 tree mem_expr;
51225 {
51226 rtx pat = PATTERN (insn);
51227 ASSERT_EQ (SET, GET_CODE (pat));
51228 {
51229 rtx dest = SET_DEST (pat);
51230 ASSERT_EQ (MEM, GET_CODE (dest));
51231 /* Verify the "/c" was parsed. */
51232 ASSERT_TRUE (RTX_FLAG (dest, call));
51233 ASSERT_EQ (SImode, GET_MODE (dest));
51234 {
51235 rtx addr = XEXP (dest, 0);
51236 ASSERT_EQ (PLUS, GET_CODE (addr));
51237 ASSERT_EQ (DImode, GET_MODE (addr));
51238 {
51239 rtx lhs = XEXP (addr, 0);
51240 /* Verify that the "frame" REG was consolidated. */
51241 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51242 }
51243 {
51244 rtx rhs = XEXP (addr, 1);
51245 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51246 ASSERT_EQ (-4, INTVAL (rhs));
51247 }
51248 }
51249 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51250 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51251 /* "i" should have been handled by synthesizing a global int
51252 variable named "i". */
51253 mem_expr = MEM_EXPR (dest);
51254 ASSERT_NE (mem_expr, NULL);
51255 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51256 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51257 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51258 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51259 /* "+0". */
51260 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51261 ASSERT_EQ (0, MEM_OFFSET (dest));
51262 /* "S4". */
51263 ASSERT_EQ (4, MEM_SIZE (dest));
51264 /* "A32". */
51265 ASSERT_EQ (32, MEM_ALIGN (dest));
51266 }
51267 {
51268 rtx src = SET_SRC (pat);
51269 ASSERT_EQ (REG, GET_CODE (src));
51270 ASSERT_EQ (SImode, GET_MODE (src));
51271 ASSERT_EQ (5, REGNO (src));
51272 tree reg_expr = REG_EXPR (src);
51273 /* "i" here should point to the same var as for the MEM_EXPR. */
51274 ASSERT_EQ (reg_expr, mem_expr);
51275 }
51276 }
51277 }
51278
51279 /* Verify that the RTL loader copes with a call_insn dump.
51280 This test is target-specific since the dump contains a target-specific
51281 hard reg name. */
51282
51283 static void
51284 ix86_test_loading_call_insn ()
51285 {
51286 /* The test dump includes register "xmm0", which requires TARGET_SSE
51287 to exist. */
51288 if (!TARGET_SSE)
51289 return;
51290
51291 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51292
51293 rtx_insn *insn = get_insns ();
51294 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51295
51296 /* "/j". */
51297 ASSERT_TRUE (RTX_FLAG (insn, jump));
51298
51299 rtx pat = PATTERN (insn);
51300 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51301
51302 /* Verify REG_NOTES. */
51303 {
51304 /* "(expr_list:REG_CALL_DECL". */
51305 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51306 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51307 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51308
51309 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51310 rtx_expr_list *note1 = note0->next ();
51311 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51312
51313 ASSERT_EQ (NULL, note1->next ());
51314 }
51315
51316 /* Verify CALL_INSN_FUNCTION_USAGE. */
51317 {
51318 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51319 rtx_expr_list *usage
51320 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51321 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51322 ASSERT_EQ (DFmode, GET_MODE (usage));
51323 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51324 ASSERT_EQ (NULL, usage->next ());
51325 }
51326 }
51327
51328 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51329 This test is target-specific since the dump contains target-specific
51330 hard reg names. */
51331
51332 static void
51333 ix86_test_loading_full_dump ()
51334 {
51335 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51336
51337 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51338
51339 rtx_insn *insn_1 = get_insn_by_uid (1);
51340 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51341
51342 rtx_insn *insn_7 = get_insn_by_uid (7);
51343 ASSERT_EQ (INSN, GET_CODE (insn_7));
51344 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51345
51346 rtx_insn *insn_15 = get_insn_by_uid (15);
51347 ASSERT_EQ (INSN, GET_CODE (insn_15));
51348 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51349
51350 /* Verify crtl->return_rtx. */
51351 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51352 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51353 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51354 }
51355
51356 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51357 In particular, verify that it correctly loads the 2nd operand.
51358 This test is target-specific since these are machine-specific
51359 operands (and enums). */
51360
51361 static void
51362 ix86_test_loading_unspec ()
51363 {
51364 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51365
51366 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51367
51368 ASSERT_TRUE (cfun);
51369
51370 /* Test of an UNSPEC. */
51371 rtx_insn *insn = get_insns ();
51372 ASSERT_EQ (INSN, GET_CODE (insn));
51373 rtx set = single_set (insn);
51374 ASSERT_NE (NULL, set);
51375 rtx dst = SET_DEST (set);
51376 ASSERT_EQ (MEM, GET_CODE (dst));
51377 rtx src = SET_SRC (set);
51378 ASSERT_EQ (UNSPEC, GET_CODE (src));
51379 ASSERT_EQ (BLKmode, GET_MODE (src));
51380 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51381
51382 rtx v0 = XVECEXP (src, 0, 0);
51383
51384 /* Verify that the two uses of the first SCRATCH have pointer
51385 equality. */
51386 rtx scratch_a = XEXP (dst, 0);
51387 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51388
51389 rtx scratch_b = XEXP (v0, 0);
51390 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51391
51392 ASSERT_EQ (scratch_a, scratch_b);
51393
51394 /* Verify that the two mems are thus treated as equal. */
51395 ASSERT_TRUE (rtx_equal_p (dst, v0));
51396
51397 /* Verify that the insn is recognized. */
51398 ASSERT_NE (-1, recog_memoized (insn));
51399
51400 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51401 insn = NEXT_INSN (insn);
51402 ASSERT_EQ (INSN, GET_CODE (insn));
51403
51404 set = single_set (insn);
51405 ASSERT_NE (NULL, set);
51406
51407 src = SET_SRC (set);
51408 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51409 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51410 }
51411
51412 /* Run all target-specific selftests. */
51413
51414 static void
51415 ix86_run_selftests (void)
51416 {
51417 ix86_test_dumping_hard_regs ();
51418 ix86_test_dumping_memory_blockage ();
51419
51420 /* Various tests of loading RTL dumps, here because they contain
51421 ix86-isms (e.g. names of hard regs). */
51422 ix86_test_loading_dump_fragment_1 ();
51423 ix86_test_loading_call_insn ();
51424 ix86_test_loading_full_dump ();
51425 ix86_test_loading_unspec ();
51426 }
51427
51428 } // namespace selftest
51429
51430 #endif /* CHECKING_P */
51431
51432 /* Initialize the GCC target structure. */
51433 #undef TARGET_RETURN_IN_MEMORY
51434 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51435
51436 #undef TARGET_LEGITIMIZE_ADDRESS
51437 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51438
51439 #undef TARGET_ATTRIBUTE_TABLE
51440 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51441 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51442 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51443 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51444 # undef TARGET_MERGE_DECL_ATTRIBUTES
51445 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51446 #endif
51447
51448 #undef TARGET_COMP_TYPE_ATTRIBUTES
51449 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51450
51451 #undef TARGET_INIT_BUILTINS
51452 #define TARGET_INIT_BUILTINS ix86_init_builtins
51453 #undef TARGET_BUILTIN_DECL
51454 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51455 #undef TARGET_EXPAND_BUILTIN
51456 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51457
51458 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51459 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51460 ix86_builtin_vectorized_function
51461
51462 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51463 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51464
51465 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51466 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51467
51468 #undef TARGET_BUILTIN_RECIPROCAL
51469 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51470
51471 #undef TARGET_ASM_FUNCTION_EPILOGUE
51472 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51473
51474 #undef TARGET_ENCODE_SECTION_INFO
51475 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51476 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51477 #else
51478 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51479 #endif
51480
51481 #undef TARGET_ASM_OPEN_PAREN
51482 #define TARGET_ASM_OPEN_PAREN ""
51483 #undef TARGET_ASM_CLOSE_PAREN
51484 #define TARGET_ASM_CLOSE_PAREN ""
51485
51486 #undef TARGET_ASM_BYTE_OP
51487 #define TARGET_ASM_BYTE_OP ASM_BYTE
51488
51489 #undef TARGET_ASM_ALIGNED_HI_OP
51490 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51491 #undef TARGET_ASM_ALIGNED_SI_OP
51492 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51493 #ifdef ASM_QUAD
51494 #undef TARGET_ASM_ALIGNED_DI_OP
51495 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51496 #endif
51497
51498 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51499 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51500
51501 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51502 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51503
51504 #undef TARGET_ASM_UNALIGNED_HI_OP
51505 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51506 #undef TARGET_ASM_UNALIGNED_SI_OP
51507 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51508 #undef TARGET_ASM_UNALIGNED_DI_OP
51509 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51510
51511 #undef TARGET_PRINT_OPERAND
51512 #define TARGET_PRINT_OPERAND ix86_print_operand
51513 #undef TARGET_PRINT_OPERAND_ADDRESS
51514 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51515 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51516 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51517 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51518 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51519
51520 #undef TARGET_SCHED_INIT_GLOBAL
51521 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51522 #undef TARGET_SCHED_ADJUST_COST
51523 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51524 #undef TARGET_SCHED_ISSUE_RATE
51525 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51526 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51527 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51528 ia32_multipass_dfa_lookahead
51529 #undef TARGET_SCHED_MACRO_FUSION_P
51530 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51531 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51532 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51533
51534 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51535 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51536
51537 #undef TARGET_MEMMODEL_CHECK
51538 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51539
51540 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51541 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51542
51543 #ifdef HAVE_AS_TLS
51544 #undef TARGET_HAVE_TLS
51545 #define TARGET_HAVE_TLS true
51546 #endif
51547 #undef TARGET_CANNOT_FORCE_CONST_MEM
51548 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51549 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51550 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51551
51552 #undef TARGET_DELEGITIMIZE_ADDRESS
51553 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51554
51555 #undef TARGET_MS_BITFIELD_LAYOUT_P
51556 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51557
51558 #if TARGET_MACHO
51559 #undef TARGET_BINDS_LOCAL_P
51560 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51561 #else
51562 #undef TARGET_BINDS_LOCAL_P
51563 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51564 #endif
51565 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51566 #undef TARGET_BINDS_LOCAL_P
51567 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51568 #endif
51569
51570 #undef TARGET_ASM_OUTPUT_MI_THUNK
51571 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51572 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51573 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51574
51575 #undef TARGET_ASM_FILE_START
51576 #define TARGET_ASM_FILE_START x86_file_start
51577
51578 #undef TARGET_OPTION_OVERRIDE
51579 #define TARGET_OPTION_OVERRIDE ix86_option_override
51580
51581 #undef TARGET_REGISTER_MOVE_COST
51582 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51583 #undef TARGET_MEMORY_MOVE_COST
51584 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51585 #undef TARGET_RTX_COSTS
51586 #define TARGET_RTX_COSTS ix86_rtx_costs
51587 #undef TARGET_ADDRESS_COST
51588 #define TARGET_ADDRESS_COST ix86_address_cost
51589
51590 #undef TARGET_FIXED_CONDITION_CODE_REGS
51591 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51592 #undef TARGET_CC_MODES_COMPATIBLE
51593 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51594
51595 #undef TARGET_MACHINE_DEPENDENT_REORG
51596 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51597
51598 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51599 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51600
51601 #undef TARGET_BUILD_BUILTIN_VA_LIST
51602 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51603
51604 #undef TARGET_FOLD_BUILTIN
51605 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51606
51607 #undef TARGET_GIMPLE_FOLD_BUILTIN
51608 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51609
51610 #undef TARGET_COMPARE_VERSION_PRIORITY
51611 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51612
51613 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51614 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51615 ix86_generate_version_dispatcher_body
51616
51617 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51618 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51619 ix86_get_function_versions_dispatcher
51620
51621 #undef TARGET_ENUM_VA_LIST_P
51622 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51623
51624 #undef TARGET_FN_ABI_VA_LIST
51625 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51626
51627 #undef TARGET_CANONICAL_VA_LIST_TYPE
51628 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51629
51630 #undef TARGET_EXPAND_BUILTIN_VA_START
51631 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51632
51633 #undef TARGET_MD_ASM_ADJUST
51634 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51635
51636 #undef TARGET_C_EXCESS_PRECISION
51637 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51638 #undef TARGET_PROMOTE_PROTOTYPES
51639 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51640 #undef TARGET_SETUP_INCOMING_VARARGS
51641 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51642 #undef TARGET_MUST_PASS_IN_STACK
51643 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51644 #undef TARGET_FUNCTION_ARG_ADVANCE
51645 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51646 #undef TARGET_FUNCTION_ARG
51647 #define TARGET_FUNCTION_ARG ix86_function_arg
51648 #undef TARGET_INIT_PIC_REG
51649 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51650 #undef TARGET_USE_PSEUDO_PIC_REG
51651 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51652 #undef TARGET_FUNCTION_ARG_BOUNDARY
51653 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51654 #undef TARGET_PASS_BY_REFERENCE
51655 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51656 #undef TARGET_INTERNAL_ARG_POINTER
51657 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51658 #undef TARGET_UPDATE_STACK_BOUNDARY
51659 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51660 #undef TARGET_GET_DRAP_RTX
51661 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51662 #undef TARGET_STRICT_ARGUMENT_NAMING
51663 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51664 #undef TARGET_STATIC_CHAIN
51665 #define TARGET_STATIC_CHAIN ix86_static_chain
51666 #undef TARGET_TRAMPOLINE_INIT
51667 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51668 #undef TARGET_RETURN_POPS_ARGS
51669 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51670
51671 #undef TARGET_LEGITIMATE_COMBINED_INSN
51672 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51673
51674 #undef TARGET_ASAN_SHADOW_OFFSET
51675 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51676
51677 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51678 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51679
51680 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51681 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51682
51683 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51684 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51685
51686 #undef TARGET_C_MODE_FOR_SUFFIX
51687 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51688
51689 #ifdef HAVE_AS_TLS
51690 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51691 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51692 #endif
51693
51694 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51695 #undef TARGET_INSERT_ATTRIBUTES
51696 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51697 #endif
51698
51699 #undef TARGET_MANGLE_TYPE
51700 #define TARGET_MANGLE_TYPE ix86_mangle_type
51701
51702 #ifdef TARGET_THREAD_SSP_OFFSET
51703 #undef TARGET_STACK_PROTECT_GUARD
51704 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51705 #endif
51706
51707 #if !TARGET_MACHO
51708 #undef TARGET_STACK_PROTECT_FAIL
51709 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51710 #endif
51711
51712 #undef TARGET_FUNCTION_VALUE
51713 #define TARGET_FUNCTION_VALUE ix86_function_value
51714
51715 #undef TARGET_FUNCTION_VALUE_REGNO_P
51716 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51717
51718 #undef TARGET_PROMOTE_FUNCTION_MODE
51719 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51720
51721 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51722 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51723
51724 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51725 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51726
51727 #undef TARGET_INSTANTIATE_DECLS
51728 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51729
51730 #undef TARGET_SECONDARY_RELOAD
51731 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51732
51733 #undef TARGET_CLASS_MAX_NREGS
51734 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51735
51736 #undef TARGET_PREFERRED_RELOAD_CLASS
51737 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51738 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51739 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51740 #undef TARGET_CLASS_LIKELY_SPILLED_P
51741 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51742
51743 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
51744 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
51745 ix86_builtin_vectorization_cost
51746 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
51747 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
51748 ix86_vectorize_vec_perm_const_ok
51749 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
51750 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
51751 ix86_preferred_simd_mode
51752 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
51753 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
51754 ix86_autovectorize_vector_sizes
51755 #undef TARGET_VECTORIZE_GET_MASK_MODE
51756 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
51757 #undef TARGET_VECTORIZE_INIT_COST
51758 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
51759 #undef TARGET_VECTORIZE_ADD_STMT_COST
51760 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
51761 #undef TARGET_VECTORIZE_FINISH_COST
51762 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
51763 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
51764 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
51765
51766 #undef TARGET_SET_CURRENT_FUNCTION
51767 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
51768
51769 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51770 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51771
51772 #undef TARGET_OPTION_SAVE
51773 #define TARGET_OPTION_SAVE ix86_function_specific_save
51774
51775 #undef TARGET_OPTION_RESTORE
51776 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51777
51778 #undef TARGET_OPTION_POST_STREAM_IN
51779 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51780
51781 #undef TARGET_OPTION_PRINT
51782 #define TARGET_OPTION_PRINT ix86_function_specific_print
51783
51784 #undef TARGET_OPTION_FUNCTION_VERSIONS
51785 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
51786
51787 #undef TARGET_CAN_INLINE_P
51788 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51789
51790 #undef TARGET_LEGITIMATE_ADDRESS_P
51791 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51792
51793 #undef TARGET_REGISTER_PRIORITY
51794 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51795
51796 #undef TARGET_REGISTER_USAGE_LEVELING_P
51797 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51798
51799 #undef TARGET_LEGITIMATE_CONSTANT_P
51800 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
51801
51802 #undef TARGET_FRAME_POINTER_REQUIRED
51803 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
51804
51805 #undef TARGET_CAN_ELIMINATE
51806 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
51807
51808 #undef TARGET_EXTRA_LIVE_ON_ENTRY
51809 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
51810
51811 #undef TARGET_ASM_CODE_END
51812 #define TARGET_ASM_CODE_END ix86_code_end
51813
51814 #undef TARGET_CONDITIONAL_REGISTER_USAGE
51815 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
51816
51817 #undef TARGET_LOOP_UNROLL_ADJUST
51818 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
51819
51820 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51821 #undef TARGET_SPILL_CLASS
51822 #define TARGET_SPILL_CLASS ix86_spill_class
51823
51824 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
51825 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
51826 ix86_simd_clone_compute_vecsize_and_simdlen
51827
51828 #undef TARGET_SIMD_CLONE_ADJUST
51829 #define TARGET_SIMD_CLONE_ADJUST \
51830 ix86_simd_clone_adjust
51831
51832 #undef TARGET_SIMD_CLONE_USABLE
51833 #define TARGET_SIMD_CLONE_USABLE \
51834 ix86_simd_clone_usable
51835
51836 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
51837 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
51838 ix86_float_exceptions_rounding_supported_p
51839
51840 #undef TARGET_MODE_EMIT
51841 #define TARGET_MODE_EMIT ix86_emit_mode_set
51842
51843 #undef TARGET_MODE_NEEDED
51844 #define TARGET_MODE_NEEDED ix86_mode_needed
51845
51846 #undef TARGET_MODE_AFTER
51847 #define TARGET_MODE_AFTER ix86_mode_after
51848
51849 #undef TARGET_MODE_ENTRY
51850 #define TARGET_MODE_ENTRY ix86_mode_entry
51851
51852 #undef TARGET_MODE_EXIT
51853 #define TARGET_MODE_EXIT ix86_mode_exit
51854
51855 #undef TARGET_MODE_PRIORITY
51856 #define TARGET_MODE_PRIORITY ix86_mode_priority
51857
51858 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
51859 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
51860
51861 #undef TARGET_LOAD_BOUNDS_FOR_ARG
51862 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
51863
51864 #undef TARGET_STORE_BOUNDS_FOR_ARG
51865 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
51866
51867 #undef TARGET_LOAD_RETURNED_BOUNDS
51868 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
51869
51870 #undef TARGET_STORE_RETURNED_BOUNDS
51871 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
51872
51873 #undef TARGET_CHKP_BOUND_MODE
51874 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
51875
51876 #undef TARGET_BUILTIN_CHKP_FUNCTION
51877 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
51878
51879 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
51880 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
51881
51882 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
51883 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
51884
51885 #undef TARGET_CHKP_INITIALIZE_BOUNDS
51886 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
51887
51888 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
51889 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
51890
51891 #undef TARGET_OFFLOAD_OPTIONS
51892 #define TARGET_OFFLOAD_OPTIONS \
51893 ix86_offload_options
51894
51895 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
51896 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
51897
51898 #undef TARGET_OPTAB_SUPPORTED_P
51899 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
51900
51901 #undef TARGET_HARD_REGNO_SCRATCH_OK
51902 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
51903
51904 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
51905 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
51906
51907 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
51908 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
51909
51910 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
51911 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
51912
51913 #undef TARGET_INIT_LIBFUNCS
51914 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
51915
51916 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
51917 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
51918
51919 #if CHECKING_P
51920 #undef TARGET_RUN_TARGET_SELFTESTS
51921 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
51922 #endif /* #if CHECKING_P */
51923
51924 struct gcc_target targetm = TARGET_INITIALIZER;
51925 \f
51926 #include "gt-i386.h"