1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
65
66 static rtx legitimize_dllimport_symbol (rtx, bool);
67
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
71
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
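/* For example, MODE_INDEX (QImode) is 0 and MODE_INDEX (DImode) is 3;
   any other mode (XFmode, vector modes, ...) falls through to index 4,
   which is the "other" slot of the multiply and divide cost arrays
   below.  */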
79
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
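/* For instance, an add is assumed to be 2 bytes, so its size cost is
   COSTS_N_BYTES (2) == 4, which under the assumption above equals
   COSTS_N_INSNS (1) -- byte counts and instruction counts end up on a
   comparable scale.  */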
83
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
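/* Each memcpy/memset descriptor in the cost tables below is a pair of
   stringop_algs, presumably one for 32-bit and one for 64-bit code;
   DUMMY_STRINGOP_ALGS simply falls back to a libcall where no separate
   tuning is provided.  */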
85
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar_load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
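/* The instruction costs above are expressed with COSTS_N_BYTES, i.e. they
   estimate code size rather than latency; presumably this table is used
   when optimizing for size rather than for a particular processor.  */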
155
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar_load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
226
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. The 486 has an 8kB cache
270 shared between code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar_load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
298
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar_load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
368
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win; for bigger blocks either rep movsl or rep movsb is the
425 way to go. Rep movsb apparently has a more expensive startup time in the
426 CPU, but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
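/* Reading the memcpy entry above: known-size copies of at most 128 bytes
   appear to use an inline loop, at most 1024 bytes an unrolled loop, at
   most 8192 bytes rep movsl, and anything larger rep movsb; the leading
   rep_prefix_4_byte is the choice when the size is not known at compile
   time.  */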
435 1, /* scalar_stmt_cost. */
436 1, /* scalar_load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
447
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
479
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar_load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
518
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have an integrated l2 cache, but
564 optimizing for the K6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar_load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
591
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with the REP prefix (relative to
646 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
647 and 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar_load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
664
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set the number of simultaneous prefetches
712 to a large constant to reflect this (it is probably not a good idea to
713 leave the number of prefetches completely unlimited, as their execution
714 also takes some time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do non-temporal accesses and beat the inline expansion considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
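/* For instance, in the first (presumably 32-bit) memcpy descriptor above,
   known copies of at most 6 bytes use a loop, at most 14 bytes an unrolled
   loop, and everything larger rep movsl; the second descriptor keeps a loop
   up to 16 bytes, uses rep movsq up to 8192 bytes, and a libcall beyond
   that.  */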
734 4, /* scalar_stmt_cost. */
735 2, /* scalar_load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
746
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set the number of simultaneous prefetches
801 to a large constant to reflect this (it is probably not a good idea to
802 leave the number of prefetches completely unlimited, as their execution
803 also takes some time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
812
813 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
814 very small blocks it is better to use a loop. For large blocks, a libcall can
815 do non-temporal accesses and beat the inline expansion considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar_load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
836
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set the number of simultaneous prefetches
891 to a large constant to reflect this (it is probably not a good idea to
892 leave the number of prefetches completely unlimited, as their execution
893 also takes some time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
902
903 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
904 very small blocks it is better to use a loop. For large blocks, a libcall
905 can do non-temporal accesses and beat the inline expansion considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar_load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
926
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set the number of simultaneous prefetches
981 to a large constant to reflect this (it is probably not a good idea to
982 leave the number of prefetches completely unlimited, as their execution
983 also takes some time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
992
993 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
994 very small blocks it is better to use a loop. For large blocks, a libcall
995 can do non-temporal accesses and beat the inline expansion considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar_load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set the number of simultaneous prefetches
1063 to a large constant to reflect this (it is probably not a good idea to
1064 leave the number of prefetches completely unlimited, as their execution
1065 also takes some time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1074
1075 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do non-temporal accesses and beat the inline expansion considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar_load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1098
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1159
1160 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1161 very small blocks it is better to use a loop. For large blocks, a libcall can
1162 do non-temporal accesses and beat the inline expansion considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar_load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1183
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1244
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar_load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1265
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar_load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1336
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1408 };
1409
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1481 };
1482
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration lea is 2 cycles or more. With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing regressions on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1560 };
1561
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration lea is 2 cycles or more. With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing regressions on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME perhaps more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1641 };
1642
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1715 };
1716
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1719
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
1722
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_CORE2I7 (m_CORE2 | m_COREI7)
1734 #define m_ATOM (1<<PROCESSOR_ATOM)
1735
1736 #define m_GEODE (1<<PROCESSOR_GEODE)
1737 #define m_K6 (1<<PROCESSOR_K6)
1738 #define m_K6_GEODE (m_K6 | m_GEODE)
1739 #define m_K8 (1<<PROCESSOR_K8)
1740 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1741 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1742 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1743 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1744 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1745 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1746 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BTVER (m_BTVER1 | m_BTVER2)
1750 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1751
1752 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1753 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1754
1755 /* Generic instruction choice should be a common subset of the supported CPUs
1756 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1757 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1758
1759 /* Feature tests against the various tunings. */
1760 unsigned char ix86_tune_features[X86_TUNE_LAST];
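/* Illustrative example (assumption about the wrappers in i386.h, shown here
   only for orientation): a tuning feature is normally tested through a
   TARGET_* macro that indexes this array, e.g. something like

     #define TARGET_USE_LEAVE ix86_tune_features[X86_TUNE_USE_LEAVE]

   so the per-CPU masks below only need to be folded into booleans once.  */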
1761
1762 /* Feature tests against the various tunings used to create ix86_tune_features
1763 based on the processor mask. */
1764 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1765 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1766 negatively, so enabling it for Generic64 seems like a good code-size
1767 tradeoff. We can't enable it for 32-bit generic because it does not
1768 work well with PPro-based chips. */
1769 m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1770
1771 /* X86_TUNE_PUSH_MEMORY */
1772 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1773
1774 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1775 m_486 | m_PENT,
1776
1777 /* X86_TUNE_UNROLL_STRLEN */
1778 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1779
1780 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1781 on simulation results. But after P4 shipped, no performance benefit
1782 was observed with branch hints, and they also increase code size.
1783 As a result, icc never generates branch hints. */
1784 0,
1785
1786 /* X86_TUNE_DOUBLE_WITH_ADD */
1787 ~m_386,
1788
1789 /* X86_TUNE_USE_SAHF */
1790 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1791
1792 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1793 partial dependencies. */
1794 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1795
1796 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1797 register stalls on the Generic32 compilation setting as well. However,
1798 in the current implementation the partial register stalls are not eliminated
1799 very well - they can be introduced via subregs synthesized by combine
1800 and can happen in caller/callee saving sequences. Because this option
1801 pays back little on PPro-based chips and conflicts with the partial reg
1802 dependencies used by Athlon/P4-based chips, it is better to leave it off
1803 for generic32 for now. */
1804 m_PPRO,
1805
1806 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1807 m_CORE2I7 | m_GENERIC,
1808
1809 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1810 on 16-bit immediate moves into memory on Core2 and Corei7. */
1811 m_CORE2I7 | m_GENERIC,
1812
1813 /* X86_TUNE_USE_HIMODE_FIOP */
1814 m_386 | m_486 | m_K6_GEODE,
1815
1816 /* X86_TUNE_USE_SIMODE_FIOP */
1817 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1818
1819 /* X86_TUNE_USE_MOV0 */
1820 m_K6,
1821
1822 /* X86_TUNE_USE_CLTD */
1823 ~(m_PENT | m_ATOM | m_K6),
1824
1825 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1826 m_PENT4,
1827
1828 /* X86_TUNE_SPLIT_LONG_MOVES */
1829 m_PPRO,
1830
1831 /* X86_TUNE_READ_MODIFY_WRITE */
1832 ~m_PENT,
1833
1834 /* X86_TUNE_READ_MODIFY */
1835 ~(m_PENT | m_PPRO),
1836
1837 /* X86_TUNE_PROMOTE_QIMODE */
1838 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1839
1840 /* X86_TUNE_FAST_PREFIX */
1841 ~(m_386 | m_486 | m_PENT),
1842
1843 /* X86_TUNE_SINGLE_STRINGOP */
1844 m_386 | m_P4_NOCONA,
1845
1846 /* X86_TUNE_QIMODE_MATH */
1847 ~0,
1848
1849 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1850 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1851 might be considered for Generic32 if our scheme for avoiding partial
1852 stalls were more effective. */
1853 ~m_PPRO,
1854
1855 /* X86_TUNE_PROMOTE_QI_REGS */
1856 0,
1857
1858 /* X86_TUNE_PROMOTE_HI_REGS */
1859 m_PPRO,
1860
1861 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1862 over esp addition. */
1863 m_386 | m_486 | m_PENT | m_PPRO,
1864
1865 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1866 over esp addition. */
1867 m_PENT,
1868
1869 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1870 over esp subtraction. */
1871 m_386 | m_486 | m_PENT | m_K6_GEODE,
1872
1873 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1874 over esp subtraction. */
1875 m_PENT | m_K6_GEODE,
1876
1877 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1878 for DFmode copies */
1879 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1880
1881 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1882 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1883
1884 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1885 conflict here between PPro/Pentium4-based chips that treat 128-bit
1886 SSE registers as single units and K8-based chips that split SSE
1887 registers into two 64-bit halves. This knob promotes all store destinations
1888 to be 128-bit to allow register renaming on 128-bit SSE units, but usually
1889 results in one extra micro-op on 64-bit SSE units. Experimental results
1890 show that disabling this option on P4 brings over a 20% SPECfp regression,
1891 while enabling it on K8 brings roughly a 2.4% regression that can be partly
1892 masked by careful scheduling of moves. */
1893 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1894
1895 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1896 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1897
1898 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1899 m_COREI7 | m_BDVER,
1900
1901 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1902 m_BDVER,
1903
1904 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1905 are resolved on SSE register parts instead of whole registers, so we may
1906 maintain just the lower part of scalar values in the proper format, leaving
1907 the upper part undefined. */
1908 m_ATHLON_K8,
1909
1910 /* X86_TUNE_SSE_TYPELESS_STORES */
1911 m_AMD_MULTIPLE,
1912
1913 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1914 m_PPRO | m_P4_NOCONA,
1915
1916 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1917 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1918
1919 /* X86_TUNE_PROLOGUE_USING_MOVE */
1920 m_PPRO | m_ATHLON_K8,
1921
1922 /* X86_TUNE_EPILOGUE_USING_MOVE */
1923 m_PPRO | m_ATHLON_K8,
1924
1925 /* X86_TUNE_SHIFT1 */
1926 ~m_486,
1927
1928 /* X86_TUNE_USE_FFREEP */
1929 m_AMD_MULTIPLE,
1930
1931 /* X86_TUNE_INTER_UNIT_MOVES */
1932 ~(m_AMD_MULTIPLE | m_GENERIC),
1933
1934 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1935 ~(m_AMDFAM10 | m_BDVER),
1936
1937 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1938 than 4 branch instructions in the 16 byte window. */
1939 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1940
1941 /* X86_TUNE_SCHEDULE */
1942 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1943
1944 /* X86_TUNE_USE_BT */
1945 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1946
1947 /* X86_TUNE_USE_INCDEC */
1948 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
1949
1950 /* X86_TUNE_PAD_RETURNS */
1951 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1954 m_ATOM,
1955
1956 /* X86_TUNE_EXT_80387_CONSTANTS */
1957 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1958
1959 /* X86_TUNE_AVOID_VECTOR_DECODE */
1960 m_CORE2I7 | m_K8 | m_GENERIC64,
1961
1962 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1963 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
1964 ~(m_386 | m_486),
1965
1966 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1967 vector path on AMD machines. */
1968 m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1969
1970 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1971 machines. */
1972 m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1973
1974 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1975 than a MOV. */
1976 m_PENT,
1977
1978 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1979 but one byte longer. */
1980 m_PENT,
1981
1982 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
1983 operand that cannot be represented using a modRM byte. The XOR
1984 replacement is long decoded, so this split helps here as well. */
1985 m_K6,
1986
1987 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1988 from FP to FP. */
1989 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
1990
1991 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1992 from integer to FP. */
1993 m_AMDFAM10,
1994
1995 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1996 with a subsequent conditional jump instruction into a single
1997 compare-and-branch uop. */
1998 m_BDVER,
1999
2000 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2001 will impact LEA instruction selection. */
2002 m_ATOM,
2003
2004 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2005 instructions. */
2006 ~m_ATOM,
2007
2008 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2009 at -O3. For the moment, the prefetching seems badly tuned for Intel
2010 chips. */
2011 m_K6_GEODE | m_AMD_MULTIPLE,
2012
2013 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2014 the auto-vectorizer. */
2015 m_BDVER | m_BTVER2,
2016
2017 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2018 during reassociation of integer computation. */
2019 m_ATOM,
2020
2021 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2022 during reassociation of fp computation. */
2023 m_ATOM,
2024
2025 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2026 regs instead of memory. */
2027 m_COREI7 | m_CORE2I7,
2028
2029 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2030 a conditional move. */
2031 m_ATOM
2032 };
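/* Sketch (paraphrasing the option-override code further below, not a
   verbatim excerpt): the masks above are reduced to booleans for the
   selected -mtune CPU roughly as follows:

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
*/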
2033
2034 /* Feature tests against the various architecture variations. */
2035 unsigned char ix86_arch_features[X86_ARCH_LAST];
2036
2037 /* Feature tests against the various architecture variations, used to create
2038 ix86_arch_features based on the processor mask. */
2039 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2040 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2041 ~(m_386 | m_486 | m_PENT | m_K6),
2042
2043 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2044 ~m_386,
2045
2046 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2047 ~(m_386 | m_486),
2048
2049 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2050 ~m_386,
2051
2052 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2053 ~m_386,
2054 };
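/* The architecture masks are folded the same way, but against the -march
   CPU (roughly ix86_arch_mask = 1u << ix86_arch), so that e.g.
   ix86_arch_features[X86_ARCH_CMOV] becomes true for pentiumpro and every
   later architecture.  */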
2055
2056 static const unsigned int x86_accumulate_outgoing_args
2057 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2058
2059 static const unsigned int x86_arch_always_fancy_math_387
2060 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2061
2062 static const unsigned int x86_avx256_split_unaligned_load
2063 = m_COREI7 | m_GENERIC;
2064
2065 static const unsigned int x86_avx256_split_unaligned_store
2066 = m_COREI7 | m_BDVER | m_GENERIC;
2067
2068 /* If the average insn count for a single function invocation is
2069 lower than this constant, emit fast (but longer) prologue and
2070 epilogue code. */
2071 #define FAST_PROLOGUE_INSN_COUNT 20
2072
2073 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2074 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2075 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2076 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2077
2078 /* Array of the smallest class containing reg number REGNO, indexed by
2079 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2080
2081 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2082 {
2083 /* ax, dx, cx, bx */
2084 AREG, DREG, CREG, BREG,
2085 /* si, di, bp, sp */
2086 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2087 /* FP registers */
2088 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2089 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2090 /* arg pointer */
2091 NON_Q_REGS,
2092 /* flags, fpsr, fpcr, frame */
2093 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2094 /* SSE registers */
2095 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2096 SSE_REGS, SSE_REGS,
2097 /* MMX registers */
2098 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2099 MMX_REGS, MMX_REGS,
2100 /* REX registers */
2101 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 /* SSE REX registers */
2104 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2105 SSE_REGS, SSE_REGS,
2106 };
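/* Reading example (editorial): regclass_map[0] is AREG, i.e. %eax forms its
   own class; regclass_map[7] is NON_Q_REGS because %esp has no QImode
   parts; and, counting the entries above, the first SSE slot maps %xmm0 to
   SSE_FIRST_REG while the remaining XMM registers get SSE_REGS.  */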
2107
2108 /* The "default" register map used in 32bit mode. */
2109
2110 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2111 {
2112 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2113 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2114 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2115 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2116 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2117 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2119 };
2120
2121 /* The "default" register map used in 64bit mode. */
2122
2123 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2124 {
2125 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2126 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2127 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2128 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2129 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2130 8,9,10,11,12,13,14,15, /* extended integer registers */
2131 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2132 };
2133
2134 /* Define the register numbers to be used in Dwarf debugging information.
2135 The SVR4 reference port C compiler uses the following register numbers
2136 in its Dwarf output code:
2137 0 for %eax (gcc regno = 0)
2138 1 for %ecx (gcc regno = 2)
2139 2 for %edx (gcc regno = 1)
2140 3 for %ebx (gcc regno = 3)
2141 4 for %esp (gcc regno = 7)
2142 5 for %ebp (gcc regno = 6)
2143 6 for %esi (gcc regno = 4)
2144 7 for %edi (gcc regno = 5)
2145 The following three DWARF register numbers are never generated by
2146 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2147 believes these numbers have these meanings.
2148 8 for %eip (no gcc equivalent)
2149 9 for %eflags (gcc regno = 17)
2150 10 for %trapno (no gcc equivalent)
2151 It is not at all clear how we should number the FP stack registers
2152 for the x86 architecture. If the version of SDB on x86/svr4 were
2153 a bit less brain dead with respect to floating-point then we would
2154 have a precedent to follow with respect to DWARF register numbers
2155 for x86 FP registers, but the SDB on x86/svr4 is so completely
2156 broken with respect to FP registers that it is hardly worth thinking
2157 of it as something to strive for compatibility with.
2158 The version of x86/svr4 SDB I have at the moment does (partially)
2159 seem to believe that DWARF register number 11 is associated with
2160 the x86 register %st(0), but that's about all. Higher DWARF
2161 register numbers don't seem to be associated with anything in
2162 particular, and even for DWARF regno 11, SDB only seems to under-
2163 stand that it should say that a variable lives in %st(0) (when
2164 asked via an `=' command) if we said it was in DWARF regno 11,
2165 but SDB still prints garbage when asked for the value of the
2166 variable in question (via a `/' command).
2167 (Also note that the labels SDB prints for various FP stack regs
2168 when doing an `x' command are all wrong.)
2169 Note that these problems generally don't affect the native SVR4
2170 C compiler because it doesn't allow the use of -O with -g and
2171 because when it is *not* optimizing, it allocates a memory
2172 location for each floating-point variable, and the memory
2173 location is what gets described in the DWARF AT_location
2174 attribute for the variable in question.
2175 Regardless of the severe mental illness of the x86/svr4 SDB, we
2176 do something sensible here and we use the following DWARF
2177 register numbers. Note that these are all stack-top-relative
2178 numbers.
2179 11 for %st(0) (gcc regno = 8)
2180 12 for %st(1) (gcc regno = 9)
2181 13 for %st(2) (gcc regno = 10)
2182 14 for %st(3) (gcc regno = 11)
2183 15 for %st(4) (gcc regno = 12)
2184 16 for %st(5) (gcc regno = 13)
2185 17 for %st(6) (gcc regno = 14)
2186 18 for %st(7) (gcc regno = 15)
2187 */
2188 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2189 {
2190 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2191 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2192 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2193 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2194 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2195 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2197 };
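/* Reading example (editorial): svr4_dbx_register_map[1] == 2, i.e. gcc's
   register 1 (%edx) is emitted as DWARF register 2, matching the SVR4
   numbering documented above; likewise the flags register (gcc regno 17)
   maps to DWARF register 9.  */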
2198
2199 /* Define parameter passing and return registers. */
2200
2201 static int const x86_64_int_parameter_registers[6] =
2202 {
2203 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2204 };
2205
2206 static int const x86_64_ms_abi_int_parameter_registers[4] =
2207 {
2208 CX_REG, DX_REG, R8_REG, R9_REG
2209 };
2210
2211 static int const x86_64_int_return_registers[4] =
2212 {
2213 AX_REG, DX_REG, DI_REG, SI_REG
2214 };
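/* For reference (well-known ABI facts, stated here only as orientation):
   the SysV AMD64 table above corresponds to passing the first integer
   arguments in %rdi, %rsi, %rdx, %rcx, %r8 and %r9, while the Microsoft
   x64 ABI uses only %rcx, %rdx, %r8 and %r9; the return-register table
   starts with %rax and %rdx, the usual integer return registers.  */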
2215
2216 /* Define the structure for the machine field in struct function. */
2217
2218 struct GTY(()) stack_local_entry {
2219 unsigned short mode;
2220 unsigned short n;
2221 rtx rtl;
2222 struct stack_local_entry *next;
2223 };
2224
2225 /* Structure describing stack frame layout.
2226 Stack grows downward:
2227
2228 [arguments]
2229 <- ARG_POINTER
2230 saved pc
2231
2232 saved static chain if ix86_static_chain_on_stack
2233
2234 saved frame pointer if frame_pointer_needed
2235 <- HARD_FRAME_POINTER
2236 [saved regs]
2237 <- regs_save_offset
2238 [padding0]
2239
2240 [saved SSE regs]
2241 <- sse_regs_save_offset
2242 [padding1] |
2243 | <- FRAME_POINTER
2244 [va_arg registers] |
2245 |
2246 [frame] |
2247 |
2248 [padding2] | = to_allocate
2249 <- STACK_POINTER
2250 */
2251 struct ix86_frame
2252 {
2253 int nsseregs;
2254 int nregs;
2255 int va_arg_size;
2256 int red_zone_size;
2257 int outgoing_arguments_size;
2258
2259 /* The offsets relative to ARG_POINTER. */
2260 HOST_WIDE_INT frame_pointer_offset;
2261 HOST_WIDE_INT hard_frame_pointer_offset;
2262 HOST_WIDE_INT stack_pointer_offset;
2263 HOST_WIDE_INT hfp_save_offset;
2264 HOST_WIDE_INT reg_save_offset;
2265 HOST_WIDE_INT sse_reg_save_offset;
2266
2267 /* When save_regs_using_mov is set, emit prologue using
2268 move instead of push instructions. */
2269 bool save_regs_using_mov;
2270 };
2271
2272 /* Which cpu are we scheduling for. */
2273 enum attr_cpu ix86_schedule;
2274
2275 /* Which cpu are we optimizing for. */
2276 enum processor_type ix86_tune;
2277
2278 /* Which instruction set architecture to use. */
2279 enum processor_type ix86_arch;
2280
2281 /* True if processor has SSE prefetch instruction. */
2282 unsigned char x86_prefetch_sse;
2283
2284 /* -mstackrealign option */
2285 static const char ix86_force_align_arg_pointer_string[]
2286 = "force_align_arg_pointer";
2287
2288 static rtx (*ix86_gen_leave) (void);
2289 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2290 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2292 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2293 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2294 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2296 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2297 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
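/* These function pointers are filled in during option processing; assuming
   the usual setup in ix86_option_override_internal, each points at the
   DImode pattern generator (e.g. gen_adddi3) for 64-bit targets and at the
   SImode one (e.g. gen_addsi3) otherwise, so the rest of the backend can
   emit pointer-sized operations without checking TARGET_64BIT at every
   call site.  */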
2300
2301 /* Preferred alignment for stack boundary in bits. */
2302 unsigned int ix86_preferred_stack_boundary;
2303
2304 /* Alignment for incoming stack boundary in bits specified at
2305 command line. */
2306 static unsigned int ix86_user_incoming_stack_boundary;
2307
2308 /* Default alignment for incoming stack boundary in bits. */
2309 static unsigned int ix86_default_incoming_stack_boundary;
2310
2311 /* Alignment for incoming stack boundary in bits. */
2312 unsigned int ix86_incoming_stack_boundary;
2313
2314 /* Calling abi specific va_list type nodes. */
2315 static GTY(()) tree sysv_va_list_type_node;
2316 static GTY(()) tree ms_va_list_type_node;
2317
2318 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2319 char internal_label_prefix[16];
2320 int internal_label_prefix_len;
2321
2322 /* Fence to use after loop using movnt. */
2323 tree x86_mfence;
2324
2325 /* Register class used for passing a given 64-bit part of the argument.
2326 These represent classes as documented by the psABI, with the exception of
2327 the SSESF and SSEDF classes, which are basically the SSE class except that
2328 gcc will use an SF or DFmode move instead of DImode to avoid reformatting penalties.
2329
2330 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2331 whenever possible (i.e. when the upper half contains only padding). */
2332 enum x86_64_reg_class
2333 {
2334 X86_64_NO_CLASS,
2335 X86_64_INTEGER_CLASS,
2336 X86_64_INTEGERSI_CLASS,
2337 X86_64_SSE_CLASS,
2338 X86_64_SSESF_CLASS,
2339 X86_64_SSEDF_CLASS,
2340 X86_64_SSEUP_CLASS,
2341 X86_64_X87_CLASS,
2342 X86_64_X87UP_CLASS,
2343 X86_64_COMPLEX_X87_CLASS,
2344 X86_64_MEMORY_CLASS
2345 };
2346
2347 #define MAX_CLASSES 4
2348
2349 /* Table of constants used by fldpi, fldln2, etc.... */
2350 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2351 static bool ext_80387_constants_init = 0;
2352
2353 \f
2354 static struct machine_function * ix86_init_machine_status (void);
2355 static rtx ix86_function_value (const_tree, const_tree, bool);
2356 static bool ix86_function_value_regno_p (const unsigned int);
2357 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2358 const_tree);
2359 static rtx ix86_static_chain (const_tree, bool);
2360 static int ix86_function_regparm (const_tree, const_tree);
2361 static void ix86_compute_frame_layout (struct ix86_frame *);
2362 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2363 rtx, rtx, int);
2364 static void ix86_add_new_builtins (HOST_WIDE_INT);
2365 static tree ix86_canonical_va_list_type (tree);
2366 static void predict_jump (int);
2367 static unsigned int split_stack_prologue_scratch_regno (void);
2368 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2369
2370 enum ix86_function_specific_strings
2371 {
2372 IX86_FUNCTION_SPECIFIC_ARCH,
2373 IX86_FUNCTION_SPECIFIC_TUNE,
2374 IX86_FUNCTION_SPECIFIC_MAX
2375 };
2376
2377 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2378 const char *, enum fpmath_unit, bool);
2379 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2380 static void ix86_function_specific_save (struct cl_target_option *);
2381 static void ix86_function_specific_restore (struct cl_target_option *);
2382 static void ix86_function_specific_print (FILE *, int,
2383 struct cl_target_option *);
2384 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2385 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2386 struct gcc_options *);
2387 static bool ix86_can_inline_p (tree, tree);
2388 static void ix86_set_current_function (tree);
2389 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2390
2391 static enum calling_abi ix86_function_abi (const_tree);
2392
2393 \f
2394 #ifndef SUBTARGET32_DEFAULT_CPU
2395 #define SUBTARGET32_DEFAULT_CPU "i386"
2396 #endif
2397
2398 /* Whether -mtune= or -march= were specified */
2399 static int ix86_tune_defaulted;
2400 static int ix86_arch_specified;
2401
2402 /* Vectorization library interface and handlers. */
2403 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2404
2405 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2407
2408 /* Processor target table, indexed by processor number */
2409 struct ptt
2410 {
2411 const struct processor_costs *cost; /* Processor costs */
2412 const int align_loop; /* Default alignments. */
2413 const int align_loop_max_skip;
2414 const int align_jump;
2415 const int align_jump_max_skip;
2416 const int align_func;
2417 };
2418
2419 static const struct ptt processor_target_table[PROCESSOR_max] =
2420 {
2421 {&i386_cost, 4, 3, 4, 3, 4},
2422 {&i486_cost, 16, 15, 16, 15, 16},
2423 {&pentium_cost, 16, 7, 16, 7, 16},
2424 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2425 {&geode_cost, 0, 0, 0, 0, 0},
2426 {&k6_cost, 32, 7, 32, 7, 32},
2427 {&athlon_cost, 16, 7, 16, 7, 16},
2428 {&pentium4_cost, 0, 0, 0, 0, 0},
2429 {&k8_cost, 16, 7, 16, 7, 16},
2430 {&nocona_cost, 0, 0, 0, 0, 0},
2431 /* Core 2 */
2432 {&core_cost, 16, 10, 16, 10, 16},
2433 /* Core i7 */
2434 {&core_cost, 16, 10, 16, 10, 16},
2435 {&generic32_cost, 16, 7, 16, 7, 16},
2436 {&generic64_cost, 16, 10, 16, 10, 16},
2437 {&amdfam10_cost, 32, 24, 32, 7, 32},
2438 {&bdver1_cost, 32, 24, 32, 7, 32},
2439 {&bdver2_cost, 32, 24, 32, 7, 32},
2440 {&bdver3_cost, 32, 24, 32, 7, 32},
2441 {&btver1_cost, 32, 24, 32, 7, 32},
2442 {&btver2_cost, 32, 24, 32, 7, 32},
2443 {&atom_cost, 16, 15, 16, 7, 16}
2444 };
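/* Reading example (editorial): the table is indexed by enum processor_type,
   so for -mtune=k8 the entry {&k8_cost, 16, 7, 16, 7, 16} supplies the cost
   table plus the default alignments - align loops, jumps and functions to
   16 bytes, skipping at most 7 bytes of padding for loops and jumps.  */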
2445
2446 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2447 {
2448 "generic",
2449 "i386",
2450 "i486",
2451 "pentium",
2452 "pentium-mmx",
2453 "pentiumpro",
2454 "pentium2",
2455 "pentium3",
2456 "pentium4",
2457 "pentium-m",
2458 "prescott",
2459 "nocona",
2460 "core2",
2461 "corei7",
2462 "atom",
2463 "geode",
2464 "k6",
2465 "k6-2",
2466 "k6-3",
2467 "athlon",
2468 "athlon-4",
2469 "k8",
2470 "amdfam10",
2471 "bdver1",
2472 "bdver2",
2473 "bdver3",
2474 "btver1",
2475 "btver2"
2476 };
2477 \f
2478 static bool
2479 gate_insert_vzeroupper (void)
2480 {
2481 return TARGET_VZEROUPPER;
2482 }
2483
2484 static unsigned int
2485 rest_of_handle_insert_vzeroupper (void)
2486 {
2487 int i;
2488
2489 /* vzeroupper instructions are inserted immediately after reload to
2490 account for possible spills from 256-bit registers. The pass
2491 reuses the mode switching infrastructure by re-running the mode insertion
2492 pass, so disable the entities that have already been processed. */
2493 for (i = 0; i < MAX_386_ENTITIES; i++)
2494 ix86_optimize_mode_switching[i] = 0;
2495
2496 ix86_optimize_mode_switching[AVX_U128] = 1;
2497
2498 /* Call optimize_mode_switching. */
2499 pass_mode_switching.pass.execute ();
2500 return 0;
2501 }
2502
2503 struct rtl_opt_pass pass_insert_vzeroupper =
2504 {
2505 {
2506 RTL_PASS,
2507 "vzeroupper", /* name */
2508 OPTGROUP_NONE, /* optinfo_flags */
2509 gate_insert_vzeroupper, /* gate */
2510 rest_of_handle_insert_vzeroupper, /* execute */
2511 NULL, /* sub */
2512 NULL, /* next */
2513 0, /* static_pass_number */
2514 TV_NONE, /* tv_id */
2515 0, /* properties_required */
2516 0, /* properties_provided */
2517 0, /* properties_destroyed */
2518 0, /* todo_flags_start */
2519 TODO_df_finish | TODO_verify_rtl_sharing |
2520 0, /* todo_flags_finish */
2521 }
2522 };
2523
2524 /* Return true if a red-zone is in use. */
2525
2526 static inline bool
2527 ix86_using_red_zone (void)
2528 {
2529 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2530 }
2531 \f
2532 /* Return a string that documents the current -m options. The caller is
2533 responsible for freeing the string. */
2534
2535 static char *
2536 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2537 const char *tune, enum fpmath_unit fpmath,
2538 bool add_nl_p)
2539 {
2540 struct ix86_target_opts
2541 {
2542 const char *option; /* option string */
2543 HOST_WIDE_INT mask; /* isa mask options */
2544 };
2545
2546 /* This table is ordered so that options like -msse4.2 that imply
2547 preceding options are matched first. */
2548 static struct ix86_target_opts isa_opts[] =
2549 {
2550 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2551 { "-mfma", OPTION_MASK_ISA_FMA },
2552 { "-mxop", OPTION_MASK_ISA_XOP },
2553 { "-mlwp", OPTION_MASK_ISA_LWP },
2554 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2555 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2556 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2557 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2558 { "-msse3", OPTION_MASK_ISA_SSE3 },
2559 { "-msse2", OPTION_MASK_ISA_SSE2 },
2560 { "-msse", OPTION_MASK_ISA_SSE },
2561 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2562 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2563 { "-mmmx", OPTION_MASK_ISA_MMX },
2564 { "-mabm", OPTION_MASK_ISA_ABM },
2565 { "-mbmi", OPTION_MASK_ISA_BMI },
2566 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2567 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2568 { "-mhle", OPTION_MASK_ISA_HLE },
2569 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2570 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2571 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2572 { "-madx", OPTION_MASK_ISA_ADX },
2573 { "-mtbm", OPTION_MASK_ISA_TBM },
2574 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2575 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2576 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2577 { "-maes", OPTION_MASK_ISA_AES },
2578 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2579 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2580 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2581 { "-mf16c", OPTION_MASK_ISA_F16C },
2582 { "-mrtm", OPTION_MASK_ISA_RTM },
2583 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2584 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2585 };
2586
2587 /* Flag options. */
2588 static struct ix86_target_opts flag_opts[] =
2589 {
2590 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2591 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2592 { "-m80387", MASK_80387 },
2593 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2594 { "-malign-double", MASK_ALIGN_DOUBLE },
2595 { "-mcld", MASK_CLD },
2596 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2597 { "-mieee-fp", MASK_IEEE_FP },
2598 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2599 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2600 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2601 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2602 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2603 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2604 { "-mno-red-zone", MASK_NO_RED_ZONE },
2605 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2606 { "-mrecip", MASK_RECIP },
2607 { "-mrtd", MASK_RTD },
2608 { "-msseregparm", MASK_SSEREGPARM },
2609 { "-mstack-arg-probe", MASK_STACK_PROBE },
2610 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2611 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2612 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2613 { "-mvzeroupper", MASK_VZEROUPPER },
2614 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2615 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2616 { "-mprefer-avx128", MASK_PREFER_AVX128},
2617 };
2618
2619 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2620
2621 char isa_other[40];
2622 char target_other[40];
2623 unsigned num = 0;
2624 unsigned i, j;
2625 char *ret;
2626 char *ptr;
2627 size_t len;
2628 size_t line_len;
2629 size_t sep_len;
2630 const char *abi;
2631
2632 memset (opts, '\0', sizeof (opts));
2633
2634 /* Add -march= option. */
2635 if (arch)
2636 {
2637 opts[num][0] = "-march=";
2638 opts[num++][1] = arch;
2639 }
2640
2641 /* Add -mtune= option. */
2642 if (tune)
2643 {
2644 opts[num][0] = "-mtune=";
2645 opts[num++][1] = tune;
2646 }
2647
2648 /* Add -m32/-m64/-mx32. */
2649 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2650 {
2651 if ((isa & OPTION_MASK_ABI_64) != 0)
2652 abi = "-m64";
2653 else
2654 abi = "-mx32";
2655 isa &= ~ (OPTION_MASK_ISA_64BIT
2656 | OPTION_MASK_ABI_64
2657 | OPTION_MASK_ABI_X32);
2658 }
2659 else
2660 abi = "-m32";
2661 opts[num++][0] = abi;
2662
2663 /* Pick out the options in isa options. */
2664 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2665 {
2666 if ((isa & isa_opts[i].mask) != 0)
2667 {
2668 opts[num++][0] = isa_opts[i].option;
2669 isa &= ~ isa_opts[i].mask;
2670 }
2671 }
2672
2673 if (isa && add_nl_p)
2674 {
2675 opts[num++][0] = isa_other;
2676 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2677 isa);
2678 }
2679
2680 /* Add flag options. */
2681 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2682 {
2683 if ((flags & flag_opts[i].mask) != 0)
2684 {
2685 opts[num++][0] = flag_opts[i].option;
2686 flags &= ~ flag_opts[i].mask;
2687 }
2688 }
2689
2690 if (flags && add_nl_p)
2691 {
2692 opts[num++][0] = target_other;
2693 sprintf (target_other, "(other flags: %#x)", flags);
2694 }
2695
2696 /* Add -fpmath= option. */
2697 if (fpmath)
2698 {
2699 opts[num][0] = "-mfpmath=";
2700 switch ((int) fpmath)
2701 {
2702 case FPMATH_387:
2703 opts[num++][1] = "387";
2704 break;
2705
2706 case FPMATH_SSE:
2707 opts[num++][1] = "sse";
2708 break;
2709
2710 case FPMATH_387 | FPMATH_SSE:
2711 opts[num++][1] = "sse+387";
2712 break;
2713
2714 default:
2715 gcc_unreachable ();
2716 }
2717 }
2718
2719 /* Any options? */
2720 if (num == 0)
2721 return NULL;
2722
2723 gcc_assert (num < ARRAY_SIZE (opts));
2724
2725 /* Size the string. */
2726 len = 0;
2727 sep_len = (add_nl_p) ? 3 : 1;
2728 for (i = 0; i < num; i++)
2729 {
2730 len += sep_len;
2731 for (j = 0; j < 2; j++)
2732 if (opts[i][j])
2733 len += strlen (opts[i][j]);
2734 }
2735
2736 /* Build the string. */
2737 ret = ptr = (char *) xmalloc (len);
2738 line_len = 0;
2739
2740 for (i = 0; i < num; i++)
2741 {
2742 size_t len2[2];
2743
2744 for (j = 0; j < 2; j++)
2745 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2746
2747 if (i != 0)
2748 {
2749 *ptr++ = ' ';
2750 line_len++;
2751
2752 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2753 {
2754 *ptr++ = '\\';
2755 *ptr++ = '\n';
2756 line_len = 0;
2757 }
2758 }
2759
2760 for (j = 0; j < 2; j++)
2761 if (opts[i][j])
2762 {
2763 memcpy (ptr, opts[i][j], len2[j]);
2764 ptr += len2[j];
2765 line_len += len2[j];
2766 }
2767 }
2768
2769 *ptr = '\0';
2770 gcc_assert (ret + len >= ptr);
2771
2772 return ret;
2773 }
2774
2775 /* Return true if profiling code should be emitted before the
2776 prologue, false otherwise. On x86 this is the case when -mfentry
2777 is in effect (e.g. for "hotfix"-style patching of function entries). */
2778 static bool
2779 ix86_profile_before_prologue (void)
2780 {
2781 return flag_fentry != 0;
2782 }
2783
2784 /* Function that is callable from the debugger to print the current
2785 options. */
2786 void
2787 ix86_debug_options (void)
2788 {
2789 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2790 ix86_arch_string, ix86_tune_string,
2791 ix86_fpmath, true);
2792
2793 if (opts)
2794 {
2795 fprintf (stderr, "%s\n\n", opts);
2796 free (opts);
2797 }
2798 else
2799 fputs ("<no options>\n\n", stderr);
2800
2801 return;
2802 }
2803 \f
2804 /* Override various settings based on options. If MAIN_ARGS_P, the
2805 options are from the command line, otherwise they are from
2806 attributes. */
2807
2808 static void
2809 ix86_option_override_internal (bool main_args_p)
2810 {
2811 int i;
2812 unsigned int ix86_arch_mask, ix86_tune_mask;
2813 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2814 const char *prefix;
2815 const char *suffix;
2816 const char *sw;
2817
2818 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2819 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2820 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2821 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2822 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2823 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2824 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2825 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2826 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2827 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2828 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2829 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2830 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2831 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2832 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2833 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2834 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2835 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2836 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2837 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2838 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2839 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2840 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2841 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2842 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2843 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2844 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2845 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2846 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2847 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2848 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2849 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2850 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2851 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2852 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2853 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2854 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2855 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2856 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2857 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2858
2859 /* If this reaches 64, we need to widen the struct pta flags field below. */
2860
2861 static struct pta
2862 {
2863 const char *const name; /* processor name or nickname. */
2864 const enum processor_type processor;
2865 const enum attr_cpu schedule;
2866 const unsigned HOST_WIDE_INT flags;
2867 }
2868 const processor_alias_table[] =
2869 {
2870 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2871 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2872 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2873 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2874 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2875 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2876 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2877 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2878 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2879 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2880 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2881 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2882 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2883 PTA_MMX | PTA_SSE | PTA_FXSR},
2884 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2885 PTA_MMX | PTA_SSE | PTA_FXSR},
2886 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2888 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2889 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2890 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2893 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2894 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2895 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2896 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2897 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2898 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2899 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2900 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2901 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2902 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2903 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2904 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2905 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2906 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2907 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2908 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2909 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2910 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2911 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2912 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2913 {"core-avx2", PROCESSOR_COREI7, CPU_COREI7,
2914 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2915 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2916 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2917 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2918 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2919 | PTA_XSAVEOPT},
2920 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2921 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2922 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2923 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2924 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2925 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2926 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2927 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2928 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2929 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2930 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2931 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2932 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2934 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2936 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"x86-64", PROCESSOR_K8, CPU_K8,
2939 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2940 {"k8", PROCESSOR_K8, CPU_K8,
2941 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2942 | PTA_SSE2 | PTA_NO_SAHF},
2943 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2944 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2945 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2946 {"opteron", PROCESSOR_K8, CPU_K8,
2947 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2948 | PTA_SSE2 | PTA_NO_SAHF},
2949 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2950 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2951 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2952 {"athlon64", PROCESSOR_K8, CPU_K8,
2953 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2954 | PTA_SSE2 | PTA_NO_SAHF},
2955 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2956 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2957 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2958 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2959 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2960 | PTA_SSE2 | PTA_NO_SAHF},
2961 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2962 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2963 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2964 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2965 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2966 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2967 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2968 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2969 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2970 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2971 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2972 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2974 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2975 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2976 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2977 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2978 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2979 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2980 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2981 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2982 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2983 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2984 | PTA_XSAVEOPT},
2985 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2987 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2988 | PTA_FXSR | PTA_XSAVE},
2989 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2992 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2993 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2994 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2995
2996 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2997 PTA_HLE /* flags are only used for -march switch. */ },
2998 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2999 PTA_64BIT
3000 | PTA_HLE /* flags are only used for -march switch. */ },
3001 };
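
/* Each entry above pairs an -march=/-mtune= name with a scheduling model and
   a set of PTA_* ISA flags.  For example, -march=corei7 selects the
   PROCESSOR_COREI7 entry and, in the loop over processor_alias_table further
   down in this function, turns on the corresponding OPTION_MASK_ISA_* bits
   unless the user already set them explicitly (ix86_isa_flags_explicit).  */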
3002
3003 /* -mrecip options. */
3004 static struct
3005 {
3006 const char *string; /* option name */
3007 unsigned int mask; /* mask bits to set */
3008 }
3009 const recip_options[] =
3010 {
3011 { "all", RECIP_MASK_ALL },
3012 { "none", RECIP_MASK_NONE },
3013 { "div", RECIP_MASK_DIV },
3014 { "sqrt", RECIP_MASK_SQRT },
3015 { "vec-div", RECIP_MASK_VEC_DIV },
3016 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3017 };
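
/* The -mrecip= string is parsed further down in this function: it is a
   comma-separated list of the names above, each optionally prefixed with '!'
   to clear the corresponding RECIP_MASK_* bits, and the special name
   "default" stands for RECIP_MASK_ALL.  For example, -mrecip=all,!sqrt would
   enable every reciprocal approximation except the scalar square root.  */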
3018
3019 int const pta_size = ARRAY_SIZE (processor_alias_table);
3020
3021 /* Set up prefix/suffix so the error messages refer to either the command
3022 line argument, or the attribute(target). */
3023 if (main_args_p)
3024 {
3025 prefix = "-m";
3026 suffix = "";
3027 sw = "switch";
3028 }
3029 else
3030 {
3031 prefix = "option(\"";
3032 suffix = "\")";
3033 sw = "attribute";
3034 }
3035
3036 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3037 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3038 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3039 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3040 #ifdef TARGET_BI_ARCH
3041 else
3042 {
3043 #if TARGET_BI_ARCH == 1
3044 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3045 is on and OPTION_MASK_ABI_X32 is off. We turn off
3046 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3047 -mx32. */
3048 if (TARGET_X32)
3049 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3050 #else
3051 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3052 on and OPTION_MASK_ABI_64 is off. We turn off
3053 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3054 -m64. */
3055 if (TARGET_LP64)
3056 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3057 #endif
3058 }
3059 #endif
3060
3061 if (TARGET_X32)
3062 {
3063 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3064 OPTION_MASK_ABI_64 for TARGET_X32. */
3065 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3066 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3067 }
3068 else if (TARGET_LP64)
3069 {
3070 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3071 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3072 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3073 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3074 }
3075
3076 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3077 SUBTARGET_OVERRIDE_OPTIONS;
3078 #endif
3079
3080 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3081 SUBSUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3083
3084 /* -fPIC is the default for x86_64. */
3085 if (TARGET_MACHO && TARGET_64BIT)
3086 flag_pic = 2;
3087
3088 /* Need to check -mtune=generic first. */
3089 if (ix86_tune_string)
3090 {
3091 if (!strcmp (ix86_tune_string, "generic")
3092 || !strcmp (ix86_tune_string, "i686")
3093 /* As special support for cross compilers, we read -mtune=native
3094 as -mtune=generic. With native compilers we won't see
3095 -mtune=native, since the driver has already rewritten it. */
3096 || !strcmp (ix86_tune_string, "native"))
3097 {
3098 if (TARGET_64BIT)
3099 ix86_tune_string = "generic64";
3100 else
3101 ix86_tune_string = "generic32";
3102 }
3103 /* If this call is for setting the option attribute, allow the
3104 generic32/generic64 that was previously set. */
3105 else if (!main_args_p
3106 && (!strcmp (ix86_tune_string, "generic32")
3107 || !strcmp (ix86_tune_string, "generic64")))
3108 ;
3109 else if (!strncmp (ix86_tune_string, "generic", 7))
3110 error ("bad value (%s) for %stune=%s %s",
3111 ix86_tune_string, prefix, suffix, sw);
3112 else if (!strcmp (ix86_tune_string, "x86-64"))
3113 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3114 "%stune=k8%s or %stune=generic%s instead as appropriate",
3115 prefix, suffix, prefix, suffix, prefix, suffix);
3116 }
3117 else
3118 {
3119 if (ix86_arch_string)
3120 ix86_tune_string = ix86_arch_string;
3121 if (!ix86_tune_string)
3122 {
3123 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3124 ix86_tune_defaulted = 1;
3125 }
3126
3127 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3128 need to use a sensible tune option. */
3129 if (!strcmp (ix86_tune_string, "generic")
3130 || !strcmp (ix86_tune_string, "x86-64")
3131 || !strcmp (ix86_tune_string, "i686"))
3132 {
3133 if (TARGET_64BIT)
3134 ix86_tune_string = "generic64";
3135 else
3136 ix86_tune_string = "generic32";
3137 }
3138 }
3139
3140 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3141 {
3142 /* rep; movq isn't available in 32-bit code. */
3143 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3144 ix86_stringop_alg = no_stringop;
3145 }
3146
3147 if (!ix86_arch_string)
3148 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3149 else
3150 ix86_arch_specified = 1;
3151
3152 if (global_options_set.x_ix86_pmode)
3153 {
3154 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3155 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3156 error ("address mode %qs not supported in the %s bit mode",
3157 TARGET_64BIT ? "short" : "long",
3158 TARGET_64BIT ? "64" : "32");
3159 }
3160 else
3161 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3162
3163 if (!global_options_set.x_ix86_abi)
3164 ix86_abi = DEFAULT_ABI;
3165
3166 if (global_options_set.x_ix86_cmodel)
3167 {
3168 switch (ix86_cmodel)
3169 {
3170 case CM_SMALL:
3171 case CM_SMALL_PIC:
3172 if (flag_pic)
3173 ix86_cmodel = CM_SMALL_PIC;
3174 if (!TARGET_64BIT)
3175 error ("code model %qs not supported in the %s bit mode",
3176 "small", "32");
3177 break;
3178
3179 case CM_MEDIUM:
3180 case CM_MEDIUM_PIC:
3181 if (flag_pic)
3182 ix86_cmodel = CM_MEDIUM_PIC;
3183 if (!TARGET_64BIT)
3184 error ("code model %qs not supported in the %s bit mode",
3185 "medium", "32");
3186 else if (TARGET_X32)
3187 error ("code model %qs not supported in x32 mode",
3188 "medium");
3189 break;
3190
3191 case CM_LARGE:
3192 case CM_LARGE_PIC:
3193 if (flag_pic)
3194 ix86_cmodel = CM_LARGE_PIC;
3195 if (!TARGET_64BIT)
3196 error ("code model %qs not supported in the %s bit mode",
3197 "large", "32");
3198 else if (TARGET_X32)
3199 error ("code model %qs not supported in x32 mode",
3200 "large");
3201 break;
3202
3203 case CM_32:
3204 if (flag_pic)
3205 error ("code model %s does not support PIC mode", "32");
3206 if (TARGET_64BIT)
3207 error ("code model %qs not supported in the %s bit mode",
3208 "32", "64");
3209 break;
3210
3211 case CM_KERNEL:
3212 if (flag_pic)
3213 {
3214 error ("code model %s does not support PIC mode", "kernel");
3215 ix86_cmodel = CM_32;
3216 }
3217 if (!TARGET_64BIT)
3218 error ("code model %qs not supported in the %s bit mode",
3219 "kernel", "32");
3220 break;
3221
3222 default:
3223 gcc_unreachable ();
3224 }
3225 }
3226 else
3227 {
3228 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3229 use of rip-relative addressing. This eliminates fixups that
3230 would otherwise be needed if this object is to be placed in a
3231 DLL, and is essentially just as efficient as direct addressing. */
3232 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3233 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3234 else if (TARGET_64BIT)
3235 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3236 else
3237 ix86_cmodel = CM_32;
3238 }
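/* In other words: without an explicit -mcmodel=, 64-bit code defaults to the
   small code model (CM_SMALL, or CM_SMALL_PIC when -fPIC is in effect, and
   forced to CM_SMALL_PIC with PIC for the 64-bit MS ABI), while 32-bit code
   always uses CM_32.  */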
3239 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3240 {
3241 error ("-masm=intel not supported in this configuration");
3242 ix86_asm_dialect = ASM_ATT;
3243 }
3244 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3245 sorry ("%i-bit mode not compiled in",
3246 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3247
3248 for (i = 0; i < pta_size; i++)
3249 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3250 {
3251 ix86_schedule = processor_alias_table[i].schedule;
3252 ix86_arch = processor_alias_table[i].processor;
3253 /* Default cpu tuning to the architecture. */
3254 ix86_tune = ix86_arch;
3255
3256 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3257 error ("CPU you selected does not support x86-64 "
3258 "instruction set");
3259
3260 if (processor_alias_table[i].flags & PTA_MMX
3261 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3262 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3263 if (processor_alias_table[i].flags & PTA_3DNOW
3264 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3265 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3266 if (processor_alias_table[i].flags & PTA_3DNOW_A
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3268 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3269 if (processor_alias_table[i].flags & PTA_SSE
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3271 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3272 if (processor_alias_table[i].flags & PTA_SSE2
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3274 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3275 if (processor_alias_table[i].flags & PTA_SSE3
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3278 if (processor_alias_table[i].flags & PTA_SSSE3
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3281 if (processor_alias_table[i].flags & PTA_SSE4_1
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3284 if (processor_alias_table[i].flags & PTA_SSE4_2
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3287 if (processor_alias_table[i].flags & PTA_AVX
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3289 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3290 if (processor_alias_table[i].flags & PTA_AVX2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3293 if (processor_alias_table[i].flags & PTA_FMA
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3295 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3296 if (processor_alias_table[i].flags & PTA_SSE4A
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3298 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3299 if (processor_alias_table[i].flags & PTA_FMA4
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3302 if (processor_alias_table[i].flags & PTA_XOP
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3304 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3305 if (processor_alias_table[i].flags & PTA_LWP
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3307 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3308 if (processor_alias_table[i].flags & PTA_ABM
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3310 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3311 if (processor_alias_table[i].flags & PTA_BMI
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3313 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3314 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3316 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3317 if (processor_alias_table[i].flags & PTA_TBM
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3319 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3320 if (processor_alias_table[i].flags & PTA_BMI2
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3322 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3323 if (processor_alias_table[i].flags & PTA_CX16
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3325 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3326 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3328 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3329 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3331 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3332 if (processor_alias_table[i].flags & PTA_MOVBE
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3334 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3335 if (processor_alias_table[i].flags & PTA_AES
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3337 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3338 if (processor_alias_table[i].flags & PTA_PCLMUL
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3340 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3341 if (processor_alias_table[i].flags & PTA_FSGSBASE
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3343 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3344 if (processor_alias_table[i].flags & PTA_RDRND
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3346 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3347 if (processor_alias_table[i].flags & PTA_F16C
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3349 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3350 if (processor_alias_table[i].flags & PTA_RTM
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3353 if (processor_alias_table[i].flags & PTA_HLE
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3355 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3356 if (processor_alias_table[i].flags & PTA_PRFCHW
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3358 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3359 if (processor_alias_table[i].flags & PTA_RDSEED
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3361 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3362 if (processor_alias_table[i].flags & PTA_ADX
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3364 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3365 if (processor_alias_table[i].flags & PTA_FXSR
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3367 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3368 if (processor_alias_table[i].flags & PTA_XSAVE
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3370 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3371 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3373 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3374 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3375 x86_prefetch_sse = true;
3376
3377 break;
3378 }
3379
3380 if (!strcmp (ix86_arch_string, "generic"))
3381 error ("generic CPU can be used only for %stune=%s %s",
3382 prefix, suffix, sw);
3383 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3384 error ("bad value (%s) for %sarch=%s %s",
3385 ix86_arch_string, prefix, suffix, sw);
3386
3387 ix86_arch_mask = 1u << ix86_arch;
3388 for (i = 0; i < X86_ARCH_LAST; ++i)
3389 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3390
3391 for (i = 0; i < pta_size; i++)
3392 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3393 {
3394 ix86_schedule = processor_alias_table[i].schedule;
3395 ix86_tune = processor_alias_table[i].processor;
3396 if (TARGET_64BIT)
3397 {
3398 if (!(processor_alias_table[i].flags & PTA_64BIT))
3399 {
3400 if (ix86_tune_defaulted)
3401 {
3402 ix86_tune_string = "x86-64";
3403 for (i = 0; i < pta_size; i++)
3404 if (! strcmp (ix86_tune_string,
3405 processor_alias_table[i].name))
3406 break;
3407 ix86_schedule = processor_alias_table[i].schedule;
3408 ix86_tune = processor_alias_table[i].processor;
3409 }
3410 else
3411 error ("CPU you selected does not support x86-64 "
3412 "instruction set");
3413 }
3414 }
3415 else
3416 {
3417 /* Adjust tuning when compiling for 32-bit ABI. */
3418 switch (ix86_tune)
3419 {
3420 case PROCESSOR_GENERIC64:
3421 ix86_tune = PROCESSOR_GENERIC32;
3422 ix86_schedule = CPU_PENTIUMPRO;
3423 break;
3424
3425 default:
3426 break;
3427 }
3428 }
3429 /* Intel CPUs have always interpreted SSE prefetch instructions as
3430 NOPs; so, we can enable SSE prefetch instructions even when
3431 -mtune (rather than -march) points us to a processor that has them.
3432 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3433 higher processors. */
3434 if (TARGET_CMOV
3435 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3436 x86_prefetch_sse = true;
3437 break;
3438 }
3439
3440 if (ix86_tune_specified && i == pta_size)
3441 error ("bad value (%s) for %stune=%s %s",
3442 ix86_tune_string, prefix, suffix, sw);
3443
3444 ix86_tune_mask = 1u << ix86_tune;
3445 for (i = 0; i < X86_TUNE_LAST; ++i)
3446 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3447
3448 #ifndef USE_IX86_FRAME_POINTER
3449 #define USE_IX86_FRAME_POINTER 0
3450 #endif
3451
3452 #ifndef USE_X86_64_FRAME_POINTER
3453 #define USE_X86_64_FRAME_POINTER 0
3454 #endif
3455
3456 /* Set the default values for switches whose default depends on TARGET_64BIT
3457 in case they weren't overwritten by command line options. */
3458 if (TARGET_64BIT)
3459 {
3460 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3461 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3462 if (flag_asynchronous_unwind_tables == 2)
3463 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3464 if (flag_pcc_struct_return == 2)
3465 flag_pcc_struct_return = 0;
3466 }
3467 else
3468 {
3469 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3470 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3471 if (flag_asynchronous_unwind_tables == 2)
3472 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3473 if (flag_pcc_struct_return == 2)
3474 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3475 }
3476
3477 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3478 if (optimize_size)
3479 ix86_cost = &ix86_size_cost;
3480 else
3481 ix86_cost = ix86_tune_cost;
3482
3483 /* Arrange to set up i386_stack_locals for all functions. */
3484 init_machine_status = ix86_init_machine_status;
3485
3486 /* Validate -mregparm= value. */
3487 if (global_options_set.x_ix86_regparm)
3488 {
3489 if (TARGET_64BIT)
3490 warning (0, "-mregparm is ignored in 64-bit mode");
3491 if (ix86_regparm > REGPARM_MAX)
3492 {
3493 error ("-mregparm=%d is not between 0 and %d",
3494 ix86_regparm, REGPARM_MAX);
3495 ix86_regparm = 0;
3496 }
3497 }
3498 if (TARGET_64BIT)
3499 ix86_regparm = REGPARM_MAX;
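/* For 32-bit code, -mregparm=N asks for up to N integer arguments to be
   passed in registers (N must not exceed REGPARM_MAX; by convention %eax,
   %edx and %ecx are used).  In 64-bit mode the option is ignored, as warned
   above, and the register-argument count is simply the ABI maximum.  */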
3500
3501 /* Default align_* from the processor table. */
3502 if (align_loops == 0)
3503 {
3504 align_loops = processor_target_table[ix86_tune].align_loop;
3505 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3506 }
3507 if (align_jumps == 0)
3508 {
3509 align_jumps = processor_target_table[ix86_tune].align_jump;
3510 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3511 }
3512 if (align_functions == 0)
3513 {
3514 align_functions = processor_target_table[ix86_tune].align_func;
3515 }
3516
3517 /* Provide default for -mbranch-cost= value. */
3518 if (!global_options_set.x_ix86_branch_cost)
3519 ix86_branch_cost = ix86_cost->branch_cost;
3520
3521 if (TARGET_64BIT)
3522 {
3523 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3524
3525 /* Enable by default the SSE and MMX builtins. Do allow the user to
3526 explicitly disable any of these. In particular, disabling SSE and
3527 MMX for kernel code is extremely useful. */
3528 if (!ix86_arch_specified)
3529 ix86_isa_flags
3530 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3531 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3532
3533 if (TARGET_RTD)
3534 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3535 }
3536 else
3537 {
3538 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3539
3540 if (!ix86_arch_specified)
3541 ix86_isa_flags
3542 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3543
3544 /* The i386 ABI does not specify a red zone. It still makes sense to use one
3545 when the programmer takes care to keep the stack from being destroyed. */
3546 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3547 target_flags |= MASK_NO_RED_ZONE;
3548 }
3549
3550 /* Keep nonleaf frame pointers. */
3551 if (flag_omit_frame_pointer)
3552 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3553 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3554 flag_omit_frame_pointer = 1;
3555
3556 /* If we're doing fast math, we don't care about comparison order
3557 wrt NaNs. This lets us use a shorter comparison sequence. */
3558 if (flag_finite_math_only)
3559 target_flags &= ~MASK_IEEE_FP;
3560
3561 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3562 since the insns won't need emulation. */
3563 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3564 target_flags &= ~MASK_NO_FANCY_MATH_387;
3565
3566 /* Likewise, if the target doesn't have a 387, or we've specified
3567 software floating point, don't use 387 inline intrinsics. */
3568 if (!TARGET_80387)
3569 target_flags |= MASK_NO_FANCY_MATH_387;
3570
3571 /* Turn on MMX builtins for -msse. */
3572 if (TARGET_SSE)
3573 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3574
3575 /* Enable SSE prefetch. */
3576 if (TARGET_SSE || TARGET_PRFCHW)
3577 x86_prefetch_sse = true;
3578
3579 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3580 if (TARGET_SSE4_2 || TARGET_ABM)
3581 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3582
3583 /* Turn on lzcnt instruction for -mabm. */
3584 if (TARGET_ABM)
3585 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3586
3587 /* Validate -mpreferred-stack-boundary= value or default it to
3588 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3589 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3590 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3591 {
3592 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3593 int max = (TARGET_SEH ? 4 : 12);
3594
3595 if (ix86_preferred_stack_boundary_arg < min
3596 || ix86_preferred_stack_boundary_arg > max)
3597 {
3598 if (min == max)
3599 error ("-mpreferred-stack-boundary is not supported "
3600 "for this target");
3601 else
3602 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3603 ix86_preferred_stack_boundary_arg, min, max);
3604 }
3605 else
3606 ix86_preferred_stack_boundary
3607 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3608 }
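/* The argument is a power-of-two exponent measured in bytes.  For example,
   -mpreferred-stack-boundary=4 yields (1 << 4) = 16 bytes, i.e. a 128-bit
   preferred stack boundary with BITS_PER_UNIT == 8.  */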
3609
3610 /* Set the default value for -mstackrealign. */
3611 if (ix86_force_align_arg_pointer == -1)
3612 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3613
3614 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3615
3616 /* Validate -mincoming-stack-boundary= value or default it to
3617 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3618 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3619 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3620 {
3621 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3622 || ix86_incoming_stack_boundary_arg > 12)
3623 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3624 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3625 else
3626 {
3627 ix86_user_incoming_stack_boundary
3628 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3629 ix86_incoming_stack_boundary
3630 = ix86_user_incoming_stack_boundary;
3631 }
3632 }
3633
3634 /* Accept -msseregparm only if at least SSE support is enabled. */
3635 if (TARGET_SSEREGPARM
3636 && ! TARGET_SSE)
3637 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3638
3639 if (global_options_set.x_ix86_fpmath)
3640 {
3641 if (ix86_fpmath & FPMATH_SSE)
3642 {
3643 if (!TARGET_SSE)
3644 {
3645 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3646 ix86_fpmath = FPMATH_387;
3647 }
3648 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3649 {
3650 warning (0, "387 instruction set disabled, using SSE arithmetics");
3651 ix86_fpmath = FPMATH_SSE;
3652 }
3653 }
3654 }
3655 else
3656 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3657
3658 /* If the i387 is disabled, then do not return values in it. */
3659 if (!TARGET_80387)
3660 target_flags &= ~MASK_FLOAT_RETURNS;
3661
3662 /* Use external vectorized library in vectorizing intrinsics. */
3663 if (global_options_set.x_ix86_veclibabi_type)
3664 switch (ix86_veclibabi_type)
3665 {
3666 case ix86_veclibabi_type_svml:
3667 ix86_veclib_handler = ix86_veclibabi_svml;
3668 break;
3669
3670 case ix86_veclibabi_type_acml:
3671 ix86_veclib_handler = ix86_veclibabi_acml;
3672 break;
3673
3674 default:
3675 gcc_unreachable ();
3676 }
3677
3678 if ((!USE_IX86_FRAME_POINTER
3679 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3680 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3681 && !optimize_size)
3682 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3683
3684 /* ??? Unwind info is not correct around the CFG unless either a frame
3685 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3686 unwind info generation to be aware of the CFG and propagating states
3687 around edges. */
3688 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3689 || flag_exceptions || flag_non_call_exceptions)
3690 && flag_omit_frame_pointer
3691 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3692 {
3693 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3694 warning (0, "unwind tables currently require either a frame pointer "
3695 "or %saccumulate-outgoing-args%s for correctness",
3696 prefix, suffix);
3697 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3698 }
3699
3700 /* If stack probes are required, the space used for large function
3701 arguments on the stack must also be probed, so enable
3702 -maccumulate-outgoing-args so this happens in the prologue. */
3703 if (TARGET_STACK_PROBE
3704 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3705 {
3706 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3707 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3708 "for correctness", prefix, suffix);
3709 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3710 }
3711
3712 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3713 {
3714 char *p;
3715 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3716 p = strchr (internal_label_prefix, 'X');
3717 internal_label_prefix_len = p - internal_label_prefix;
3718 *p = '\0';
3719 }
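/* For example, on ELF targets ASM_GENERATE_INTERNAL_LABEL typically produces
   a label such as ".LX0"; chopping the string at the 'X' leaves ".L" as the
   internal label prefix, with internal_label_prefix_len == 2.  */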
3720
3721 /* When a scheduling description is not available, disable the scheduler
3722 passes so they won't slow down compilation or make x87 code slower. */
3723 if (!TARGET_SCHEDULE)
3724 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3725
3726 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3727 ix86_tune_cost->simultaneous_prefetches,
3728 global_options.x_param_values,
3729 global_options_set.x_param_values);
3730 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3731 ix86_tune_cost->prefetch_block,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3735 ix86_tune_cost->l1_cache_size,
3736 global_options.x_param_values,
3737 global_options_set.x_param_values);
3738 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3739 ix86_tune_cost->l2_cache_size,
3740 global_options.x_param_values,
3741 global_options_set.x_param_values);
3742
3743 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3744 if (flag_prefetch_loop_arrays < 0
3745 && HAVE_prefetch
3746 && (optimize >= 3 || flag_profile_use)
3747 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3748 flag_prefetch_loop_arrays = 1;
3749
3750 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3751 can be optimized to ap = __builtin_next_arg (0). */
3752 if (!TARGET_64BIT && !flag_split_stack)
3753 targetm.expand_builtin_va_start = NULL;
3754
3755 if (TARGET_64BIT)
3756 {
3757 ix86_gen_leave = gen_leave_rex64;
3758 if (Pmode == DImode)
3759 {
3760 ix86_gen_monitor = gen_sse3_monitor64_di;
3761 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3762 ix86_gen_tls_local_dynamic_base_64
3763 = gen_tls_local_dynamic_base_64_di;
3764 }
3765 else
3766 {
3767 ix86_gen_monitor = gen_sse3_monitor64_si;
3768 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3769 ix86_gen_tls_local_dynamic_base_64
3770 = gen_tls_local_dynamic_base_64_si;
3771 }
3772 }
3773 else
3774 {
3775 ix86_gen_leave = gen_leave;
3776 ix86_gen_monitor = gen_sse3_monitor;
3777 }
3778
3779 if (Pmode == DImode)
3780 {
3781 ix86_gen_add3 = gen_adddi3;
3782 ix86_gen_sub3 = gen_subdi3;
3783 ix86_gen_sub3_carry = gen_subdi3_carry;
3784 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3785 ix86_gen_andsp = gen_anddi3;
3786 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3787 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3788 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3789 }
3790 else
3791 {
3792 ix86_gen_add3 = gen_addsi3;
3793 ix86_gen_sub3 = gen_subsi3;
3794 ix86_gen_sub3_carry = gen_subsi3_carry;
3795 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3796 ix86_gen_andsp = gen_andsi3;
3797 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3798 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3799 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3800 }
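/* Pmode, rather than TARGET_64BIT, selects the generators here because the
   x32 ABI is a 64-bit target (TARGET_64BIT) whose pointers, and hence Pmode,
   are still SImode; it therefore needs the SImode stack and arithmetic
   helpers even though the 64-bit ISA is enabled.  */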
3801
3802 #ifdef USE_IX86_CLD
3803 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3804 if (!TARGET_64BIT)
3805 target_flags |= MASK_CLD & ~target_flags_explicit;
3806 #endif
3807
3808 if (!TARGET_64BIT && flag_pic)
3809 {
3810 if (flag_fentry > 0)
3811 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3812 "with -fpic");
3813 flag_fentry = 0;
3814 }
3815 else if (TARGET_SEH)
3816 {
3817 if (flag_fentry == 0)
3818 sorry ("-mno-fentry isn%'t compatible with SEH");
3819 flag_fentry = 1;
3820 }
3821 else if (flag_fentry < 0)
3822 {
3823 #if defined(PROFILE_BEFORE_PROLOGUE)
3824 flag_fentry = 1;
3825 #else
3826 flag_fentry = 0;
3827 #endif
3828 }
3829
3830 if (TARGET_AVX)
3831 {
3832 /* When not optimizing for size, enable the vzeroupper optimization for
3833 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3834 AVX unaligned loads/stores. */
3835 if (!optimize_size)
3836 {
3837 if (flag_expensive_optimizations
3838 && !(target_flags_explicit & MASK_VZEROUPPER))
3839 target_flags |= MASK_VZEROUPPER;
3840 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3841 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3842 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3843 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3844 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3845 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3846 /* Enable 128-bit AVX instruction generation
3847 for the auto-vectorizer. */
3848 if (TARGET_AVX128_OPTIMAL
3849 && !(target_flags_explicit & MASK_PREFER_AVX128))
3850 target_flags |= MASK_PREFER_AVX128;
3851 }
3852 }
3853 else
3854 {
3855 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3856 target_flags &= ~MASK_VZEROUPPER;
3857 }
3858
3859 if (ix86_recip_name)
3860 {
3861 char *p = ASTRDUP (ix86_recip_name);
3862 char *q;
3863 unsigned int mask, i;
3864 bool invert;
3865
3866 while ((q = strtok (p, ",")) != NULL)
3867 {
3868 p = NULL;
3869 if (*q == '!')
3870 {
3871 invert = true;
3872 q++;
3873 }
3874 else
3875 invert = false;
3876
3877 if (!strcmp (q, "default"))
3878 mask = RECIP_MASK_ALL;
3879 else
3880 {
3881 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3882 if (!strcmp (q, recip_options[i].string))
3883 {
3884 mask = recip_options[i].mask;
3885 break;
3886 }
3887
3888 if (i == ARRAY_SIZE (recip_options))
3889 {
3890 error ("unknown option for -mrecip=%s", q);
3891 invert = false;
3892 mask = RECIP_MASK_NONE;
3893 }
3894 }
3895
3896 recip_mask_explicit |= mask;
3897 if (invert)
3898 recip_mask &= ~mask;
3899 else
3900 recip_mask |= mask;
3901 }
3902 }
3903
3904 if (TARGET_RECIP)
3905 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3906 else if (target_flags_explicit & MASK_RECIP)
3907 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3908
3909 /* Default long double to 64-bit for Bionic. */
3910 if (TARGET_HAS_BIONIC
3911 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3912 target_flags |= MASK_LONG_DOUBLE_64;
3913
3914 /* Save the initial options in case the user does function specific
3915 options. */
3916 if (main_args_p)
3917 target_option_default_node = target_option_current_node
3918 = build_target_option_node ();
3919 }
3920
3921 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3922
3923 static void
3924 ix86_option_override (void)
3925 {
3926 static struct register_pass_info insert_vzeroupper_info
3927 = { &pass_insert_vzeroupper.pass, "reload",
3928 1, PASS_POS_INSERT_AFTER
3929 };
3930
3931 ix86_option_override_internal (true);
3932
3933
3934 /* This needs to be done at start up. It's convenient to do it here. */
3935 register_pass (&insert_vzeroupper_info);
3936 }
3937
3938 /* Update register usage after having seen the compiler flags. */
3939
3940 static void
3941 ix86_conditional_register_usage (void)
3942 {
3943 int i, c_mask;
3944 unsigned int j;
3945
3946 /* The PIC register, if it exists, is fixed. */
3947 j = PIC_OFFSET_TABLE_REGNUM;
3948 if (j != INVALID_REGNUM)
3949 fixed_regs[j] = call_used_regs[j] = 1;
3950
3951 /* For 32-bit targets, squash the REX registers. */
3952 if (! TARGET_64BIT)
3953 {
3954 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3955 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3958 }
3959
3960 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3961 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3962 : TARGET_64BIT ? (1 << 2)
3963 : (1 << 1));
3964
3965 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3966
3967 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3968 {
3969 /* Set/reset conditionally defined registers from
3970 CALL_USED_REGISTERS initializer. */
3971 if (call_used_regs[i] > 1)
3972 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3973
3974 /* Calculate registers of CLOBBERED_REGS register set
3975 as call used registers from GENERAL_REGS register set. */
3976 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3977 && call_used_regs[i])
3978 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3979 }
3980
3981 /* If MMX is disabled, squash the registers. */
3982 if (! TARGET_MMX)
3983 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3984 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3985 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3986
3987 /* If SSE is disabled, squash the registers. */
3988 if (! TARGET_SSE)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3992
3993 /* If the FPU is disabled, squash the registers. */
3994 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))): take an argument and
4098 set the current options from it. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4161 IX86_ATTR_ISA ("hle", OPT_mhle),
4162 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4163 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4164 IX86_ATTR_ISA ("adx", OPT_madx),
4165 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4166 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4167 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4168
4169 /* enum options */
4170 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4171
4172 /* string options */
4173 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4174 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4175
4176 /* flag options */
4177 IX86_ATTR_YES ("cld",
4178 OPT_mcld,
4179 MASK_CLD),
4180
4181 IX86_ATTR_NO ("fancy-math-387",
4182 OPT_mfancy_math_387,
4183 MASK_NO_FANCY_MATH_387),
4184
4185 IX86_ATTR_YES ("ieee-fp",
4186 OPT_mieee_fp,
4187 MASK_IEEE_FP),
4188
4189 IX86_ATTR_YES ("inline-all-stringops",
4190 OPT_minline_all_stringops,
4191 MASK_INLINE_ALL_STRINGOPS),
4192
4193 IX86_ATTR_YES ("inline-stringops-dynamically",
4194 OPT_minline_stringops_dynamically,
4195 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4196
4197 IX86_ATTR_NO ("align-stringops",
4198 OPT_mno_align_stringops,
4199 MASK_NO_ALIGN_STRINGOPS),
4200
4201 IX86_ATTR_YES ("recip",
4202 OPT_mrecip,
4203 MASK_RECIP),
4204
4205 };
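
/* An illustrative attribute string that exercises several of the entry kinds
   above (assuming the named ISAs are supported by the build):
     __attribute__((target("sse4.2,no-avx,arch=core2,fpmath=sse")))
   "sse4.2" and "no-avx" are ISA options (the "no-" prefix clears the flag),
   "arch=" is a string option and "fpmath=" an enum option; the parsing loop
   below splits the string at commas and dispatches on the table entries.  */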
4206
4207 /* If this is a list, recurse to get the options. */
4208 if (TREE_CODE (args) == TREE_LIST)
4209 {
4210 bool ret = true;
4211
4212 for (; args; args = TREE_CHAIN (args))
4213 if (TREE_VALUE (args)
4214 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4215 p_strings, enum_opts_set))
4216 ret = false;
4217
4218 return ret;
4219 }
4220
4221 else if (TREE_CODE (args) != STRING_CST)
4222 gcc_unreachable ();
4223
4224 /* Handle multiple arguments separated by commas. */
4225 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4226
4227 while (next_optstr && *next_optstr != '\0')
4228 {
4229 char *p = next_optstr;
4230 char *orig_p = p;
4231 char *comma = strchr (next_optstr, ',');
4232 const char *opt_string;
4233 size_t len, opt_len;
4234 int opt;
4235 bool opt_set_p;
4236 char ch;
4237 unsigned i;
4238 enum ix86_opt_type type = ix86_opt_unknown;
4239 int mask = 0;
4240
4241 if (comma)
4242 {
4243 *comma = '\0';
4244 len = comma - next_optstr;
4245 next_optstr = comma + 1;
4246 }
4247 else
4248 {
4249 len = strlen (p);
4250 next_optstr = NULL;
4251 }
4252
4253 /* Recognize no-xxx. */
4254 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4255 {
4256 opt_set_p = false;
4257 p += 3;
4258 len -= 3;
4259 }
4260 else
4261 opt_set_p = true;
4262
4263 /* Find the option. */
4264 ch = *p;
4265 opt = N_OPTS;
4266 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4267 {
4268 type = attrs[i].type;
4269 opt_len = attrs[i].len;
4270 if (ch == attrs[i].string[0]
4271 && ((type != ix86_opt_str && type != ix86_opt_enum)
4272 ? len == opt_len
4273 : len > opt_len)
4274 && memcmp (p, attrs[i].string, opt_len) == 0)
4275 {
4276 opt = attrs[i].opt;
4277 mask = attrs[i].mask;
4278 opt_string = attrs[i].string;
4279 break;
4280 }
4281 }
4282
4283 /* Process the option. */
4284 if (opt == N_OPTS)
4285 {
4286 error ("attribute(target(\"%s\")) is unknown", orig_p);
4287 ret = false;
4288 }
4289
4290 else if (type == ix86_opt_isa)
4291 {
4292 struct cl_decoded_option decoded;
4293
4294 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4295 ix86_handle_option (&global_options, &global_options_set,
4296 &decoded, input_location);
4297 }
4298
4299 else if (type == ix86_opt_yes || type == ix86_opt_no)
4300 {
4301 if (type == ix86_opt_no)
4302 opt_set_p = !opt_set_p;
4303
4304 if (opt_set_p)
4305 target_flags |= mask;
4306 else
4307 target_flags &= ~mask;
4308 }
4309
4310 else if (type == ix86_opt_str)
4311 {
4312 if (p_strings[opt])
4313 {
4314 error ("option(\"%s\") was already specified", opt_string);
4315 ret = false;
4316 }
4317 else
4318 p_strings[opt] = xstrdup (p + opt_len);
4319 }
4320
4321 else if (type == ix86_opt_enum)
4322 {
4323 bool arg_ok;
4324 int value;
4325
4326 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4327 if (arg_ok)
4328 set_option (&global_options, enum_opts_set, opt, value,
4329 p + opt_len, DK_UNSPECIFIED, input_location,
4330 global_dc);
4331 else
4332 {
4333 error ("attribute(target(\"%s\")) is unknown", orig_p);
4334 ret = false;
4335 }
4336 }
4337
4338 else
4339 gcc_unreachable ();
4340 }
4341
4342 return ret;
4343 }
4344
4345 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4346
4347 tree
4348 ix86_valid_target_attribute_tree (tree args)
4349 {
4350 const char *orig_arch_string = ix86_arch_string;
4351 const char *orig_tune_string = ix86_tune_string;
4352 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4353 int orig_tune_defaulted = ix86_tune_defaulted;
4354 int orig_arch_specified = ix86_arch_specified;
4355 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4356 tree t = NULL_TREE;
4357 int i;
4358 struct cl_target_option *def
4359 = TREE_TARGET_OPTION (target_option_default_node);
4360 struct gcc_options enum_opts_set;
4361
4362 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4363
4364 /* Process each of the options on the chain. */
4365 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4366 &enum_opts_set))
4367 return NULL_TREE;
4368
4369 /* If the changed options are different from the default, rerun
4370 ix86_option_override_internal, and then save the options away.
4371 The string options are attribute options, and will be undone
4372 when we copy the save structure. */
4373 if (ix86_isa_flags != def->x_ix86_isa_flags
4374 || target_flags != def->x_target_flags
4375 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4376 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4377 || enum_opts_set.x_ix86_fpmath)
4378 {
4379 /* If we are using the default tune= or arch=, undo the string assigned,
4380 and use the default. */
4381 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4382 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4383 else if (!orig_arch_specified)
4384 ix86_arch_string = NULL;
4385
4386 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4387 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4388 else if (orig_tune_defaulted)
4389 ix86_tune_string = NULL;
4390
4391 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4392 if (enum_opts_set.x_ix86_fpmath)
4393 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4394 else if (!TARGET_64BIT && TARGET_SSE)
4395 {
4396 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4397 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4398 }
4399
4400 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4401 ix86_option_override_internal (false);
4402
4403 /* Add any builtin functions with the new isa if any. */
4404 ix86_add_new_builtins (ix86_isa_flags);
4405
4406 /* Save the current options unless we are validating options for
4407 #pragma. */
4408 t = build_target_option_node ();
4409
4410 ix86_arch_string = orig_arch_string;
4411 ix86_tune_string = orig_tune_string;
4412 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4413
4414 /* Free up memory allocated to hold the strings */
4415 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4416 free (option_strings[i]);
4417 }
4418
4419 return t;
4420 }
4421
4422 /* Hook to validate attribute((target("string"))). */
4423
4424 static bool
4425 ix86_valid_target_attribute_p (tree fndecl,
4426 tree ARG_UNUSED (name),
4427 tree args,
4428 int ARG_UNUSED (flags))
4429 {
4430 struct cl_target_option cur_target;
4431 bool ret = true;
4432 tree old_optimize = build_optimization_node ();
4433 tree new_target, new_optimize;
4434 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4435
4436 /* If the function changed the optimization levels as well as setting target
4437 options, start with the optimizations specified. */
4438 if (func_optimize && func_optimize != old_optimize)
4439 cl_optimization_restore (&global_options,
4440 TREE_OPTIMIZATION (func_optimize));
4441
4442 /* The target attributes may also change some optimization flags, so update
4443 the optimization options if necessary. */
4444 cl_target_option_save (&cur_target, &global_options);
4445 new_target = ix86_valid_target_attribute_tree (args);
4446 new_optimize = build_optimization_node ();
4447
4448 if (!new_target)
4449 ret = false;
4450
4451 else if (fndecl)
4452 {
4453 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4454
4455 if (old_optimize != new_optimize)
4456 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4457 }
4458
4459 cl_target_option_restore (&global_options, &cur_target);
4460
4461 if (old_optimize != new_optimize)
4462 cl_optimization_restore (&global_options,
4463 TREE_OPTIMIZATION (old_optimize));
4464
4465 return ret;
4466 }
4467
4468 \f
4469 /* Hook to determine if one function can safely inline another. */
4470
4471 static bool
4472 ix86_can_inline_p (tree caller, tree callee)
4473 {
4474 bool ret = false;
4475 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4476 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4477
4478 /* If callee has no option attributes, then it is ok to inline. */
4479 if (!callee_tree)
4480 ret = true;
4481
4482 /* If caller has no option attributes, but callee does then it is not ok to
4483 inline. */
4484 else if (!caller_tree)
4485 ret = false;
4486
4487 else
4488 {
4489 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4490 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4491
4492 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4493 function can inline an SSE2 function, but an SSE2 function can't inline
4494 an SSE4 function. */
4495 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4496 != callee_opts->x_ix86_isa_flags)
4497 ret = false;
4498
4499 /* See if we have the same non-isa options. */
4500 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4501 ret = false;
4502
4503 /* See if arch, tune, etc. are the same. */
4504 else if (caller_opts->arch != callee_opts->arch)
4505 ret = false;
4506
4507 else if (caller_opts->tune != callee_opts->tune)
4508 ret = false;
4509
4510 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4511 ret = false;
4512
4513 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4514 ret = false;
4515
4516 else
4517 ret = true;
4518 }
4519
4520 return ret;
4521 }
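
/* Illustrative example of the subset rule above (hypothetical user code, not
   part of the compiler):

     __attribute__((target("sse2")))   static int f (int x) { return x + 1; }
     __attribute__((target("sse4.2"))) int g (int x) { return f (x); }

   Here g may inline f, because f's ISA flags are a subset of g's; inlining in
   the opposite direction would be rejected by the check above.  */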
4522
4523 \f
4524 /* Remember the last target of ix86_set_current_function. */
4525 static GTY(()) tree ix86_previous_fndecl;
4526
4527 /* Establish appropriate back-end context for processing the function
4528 FNDECL. The argument might be NULL to indicate processing at top
4529 level, outside of any function scope. */
4530 static void
4531 ix86_set_current_function (tree fndecl)
4532 {
4533 /* Only change the context if the function changes. This hook is called
4534 several times in the course of compiling a function, and we don't want to
4535 slow things down too much or call target_reinit when it isn't safe. */
4536 if (fndecl && fndecl != ix86_previous_fndecl)
4537 {
4538 tree old_tree = (ix86_previous_fndecl
4539 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4540 : NULL_TREE);
4541
4542 tree new_tree = (fndecl
4543 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4544 : NULL_TREE);
4545
4546 ix86_previous_fndecl = fndecl;
4547 if (old_tree == new_tree)
4548 ;
4549
4550 else if (new_tree)
4551 {
4552 cl_target_option_restore (&global_options,
4553 TREE_TARGET_OPTION (new_tree));
4554 target_reinit ();
4555 }
4556
4557 else if (old_tree)
4558 {
4559 struct cl_target_option *def
4560 = TREE_TARGET_OPTION (target_option_current_node);
4561
4562 cl_target_option_restore (&global_options, def);
4563 target_reinit ();
4564 }
4565 }
4566 }
4567
4568 \f
4569 /* Return true if this goes in large data/bss. */
4570
4571 static bool
4572 ix86_in_large_data_p (tree exp)
4573 {
4574 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4575 return false;
4576
4577 /* Functions are never large data. */
4578 if (TREE_CODE (exp) == FUNCTION_DECL)
4579 return false;
4580
4581 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4582 {
4583 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4584 if (strcmp (section, ".ldata") == 0
4585 || strcmp (section, ".lbss") == 0)
4586 return true;
4587 return false;
4588 }
4589 else
4590 {
4591 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4592
4593 /* If this is an incomplete type with size 0, then we can't put it
4594 in data because it might be too big when completed. */
4595 if (!size || size > ix86_section_threshold)
4596 return true;
4597 }
4598
4599 return false;
4600 }
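
/* In practice this means that, under -mcmodel=medium (or medium PIC), any
   object larger than ix86_section_threshold (the -mlarge-data-threshold=
   value) is treated as large data and ends up in the .ldata/.lbss family of
   sections chosen by the hooks below.  */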
4601
4602 /* Switch to the appropriate section for output of DECL.
4603 DECL is either a `VAR_DECL' node or a constant of some sort.
4604 RELOC indicates whether forming the initial value of DECL requires
4605 link-time relocations. */
4606
4607 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4608 ATTRIBUTE_UNUSED;
4609
4610 static section *
4611 x86_64_elf_select_section (tree decl, int reloc,
4612 unsigned HOST_WIDE_INT align)
4613 {
4614 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4615 && ix86_in_large_data_p (decl))
4616 {
4617 const char *sname = NULL;
4618 unsigned int flags = SECTION_WRITE;
4619 switch (categorize_decl_for_section (decl, reloc))
4620 {
4621 case SECCAT_DATA:
4622 sname = ".ldata";
4623 break;
4624 case SECCAT_DATA_REL:
4625 sname = ".ldata.rel";
4626 break;
4627 case SECCAT_DATA_REL_LOCAL:
4628 sname = ".ldata.rel.local";
4629 break;
4630 case SECCAT_DATA_REL_RO:
4631 sname = ".ldata.rel.ro";
4632 break;
4633 case SECCAT_DATA_REL_RO_LOCAL:
4634 sname = ".ldata.rel.ro.local";
4635 break;
4636 case SECCAT_BSS:
4637 sname = ".lbss";
4638 flags |= SECTION_BSS;
4639 break;
4640 case SECCAT_RODATA:
4641 case SECCAT_RODATA_MERGE_STR:
4642 case SECCAT_RODATA_MERGE_STR_INIT:
4643 case SECCAT_RODATA_MERGE_CONST:
4644 sname = ".lrodata";
4645 flags = 0;
4646 break;
4647 case SECCAT_SRODATA:
4648 case SECCAT_SDATA:
4649 case SECCAT_SBSS:
4650 gcc_unreachable ();
4651 case SECCAT_TEXT:
4652 case SECCAT_TDATA:
4653 case SECCAT_TBSS:
4654 /* We don't split these for the medium model. Place them into
4655 default sections and hope for the best. */
4656 break;
4657 }
4658 if (sname)
4659 {
4660 /* We might get called with string constants, but get_named_section
4661 doesn't like them as they are not DECLs. Also, we need to set
4662 flags in that case. */
4663 if (!DECL_P (decl))
4664 return get_section (sname, flags, NULL);
4665 return get_named_section (decl, sname, reloc);
4666 }
4667 }
4668 return default_elf_select_section (decl, reloc, align);
4669 }
4670
4671 /* Build up a unique section name, expressed as a
4672 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4673 RELOC indicates whether the initial value of EXP requires
4674 link-time relocations. */
4675
4676 static void ATTRIBUTE_UNUSED
4677 x86_64_elf_unique_section (tree decl, int reloc)
4678 {
4679 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4680 && ix86_in_large_data_p (decl))
4681 {
4682 const char *prefix = NULL;
4683 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4684 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4685
4686 switch (categorize_decl_for_section (decl, reloc))
4687 {
4688 case SECCAT_DATA:
4689 case SECCAT_DATA_REL:
4690 case SECCAT_DATA_REL_LOCAL:
4691 case SECCAT_DATA_REL_RO:
4692 case SECCAT_DATA_REL_RO_LOCAL:
4693 prefix = one_only ? ".ld" : ".ldata";
4694 break;
4695 case SECCAT_BSS:
4696 prefix = one_only ? ".lb" : ".lbss";
4697 break;
4698 case SECCAT_RODATA:
4699 case SECCAT_RODATA_MERGE_STR:
4700 case SECCAT_RODATA_MERGE_STR_INIT:
4701 case SECCAT_RODATA_MERGE_CONST:
4702 prefix = one_only ? ".lr" : ".lrodata";
4703 break;
4704 case SECCAT_SRODATA:
4705 case SECCAT_SDATA:
4706 case SECCAT_SBSS:
4707 gcc_unreachable ();
4708 case SECCAT_TEXT:
4709 case SECCAT_TDATA:
4710 case SECCAT_TBSS:
4711 /* We don't split these for the medium model. Place them into
4712 default sections and hope for the best. */
4713 break;
4714 }
4715 if (prefix)
4716 {
4717 const char *name, *linkonce;
4718 char *string;
4719
4720 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4721 name = targetm.strip_name_encoding (name);
4722
4723 /* If we're using one_only, then there needs to be a .gnu.linkonce
4724 prefix to the section name. */
4725 linkonce = one_only ? ".gnu.linkonce" : "";
4726
4727 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4728
4729 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4730 return;
4731 }
4732 }
4733 default_unique_section (decl, reloc);
4734 }
4735
4736 #ifdef COMMON_ASM_OP
4737 /* This says how to output assembler code to declare an
4738 uninitialized external linkage data object.
4739
4740 For medium model x86-64 we need to use the .largecomm directive for
4741 large objects. */
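/* For illustration, the directives this emits look roughly like:

     .comm      small_obj,16,8
     .largecomm big_obj,100000,32

   (name, size in bytes, alignment in bytes), assuming a medium-model
   build where big_obj exceeds the section threshold.  */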
4742 void
4743 x86_elf_aligned_common (FILE *file,
4744 const char *name, unsigned HOST_WIDE_INT size,
4745 int align)
4746 {
4747 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4748 && size > (unsigned int)ix86_section_threshold)
4749 fputs (".largecomm\t", file);
4750 else
4751 fputs (COMMON_ASM_OP, file);
4752 assemble_name (file, name);
4753 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4754 size, align / BITS_PER_UNIT);
4755 }
4756 #endif
4757
4758 /* Utility function for targets to use in implementing
4759 ASM_OUTPUT_ALIGNED_BSS. */
4760
4761 void
4762 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4763 const char *name, unsigned HOST_WIDE_INT size,
4764 int align)
4765 {
4766 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4767 && size > (unsigned int)ix86_section_threshold)
4768 switch_to_section (get_named_section (decl, ".lbss", 0));
4769 else
4770 switch_to_section (bss_section);
4771 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4772 #ifdef ASM_DECLARE_OBJECT_NAME
4773 last_assemble_variable_decl = decl;
4774 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4775 #else
4776 /* Standard thing is just output label for the object. */
4777 ASM_OUTPUT_LABEL (file, name);
4778 #endif /* ASM_DECLARE_OBJECT_NAME */
4779 ASM_OUTPUT_SKIP (file, size ? size : 1);
4780 }
4781 \f
4782 /* Decide whether we must probe the stack before any space allocation
4783 on this target. It's essentially TARGET_STACK_PROBE except when
4784 -fstack-check causes the stack to be already probed differently. */
4785
4786 bool
4787 ix86_target_stack_probe (void)
4788 {
4789 /* Do not probe the stack twice if static stack checking is enabled. */
4790 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4791 return false;
4792
4793 return TARGET_STACK_PROBE;
4794 }
4795 \f
4796 /* Decide whether we can make a sibling call to a function. DECL is the
4797 declaration of the function being targeted by the call and EXP is the
4798 CALL_EXPR representing the call. */
4799
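/* For illustration, one case the checks below reject: with -m32 -fpic,
   something like

     extern int bar (int);
     int foo (int x) { return bar (x); }

   cannot be turned into a sibcall, since the call goes through the PLT
   and the PLT requires %ebx to be live.  */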
4800 static bool
4801 ix86_function_ok_for_sibcall (tree decl, tree exp)
4802 {
4803 tree type, decl_or_type;
4804 rtx a, b;
4805
4806 /* If we are generating position-independent code, we cannot sibcall
4807 optimize any indirect call, or a direct call to a global function,
4808 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4809 if (!TARGET_MACHO
4810 && !TARGET_64BIT
4811 && flag_pic
4812 && (!decl || !targetm.binds_local_p (decl)))
4813 return false;
4814
4815 /* If we need to align the outgoing stack, then sibcalling would
4816 unalign the stack, which may break the called function. */
4817 if (ix86_minimum_incoming_stack_boundary (true)
4818 < PREFERRED_STACK_BOUNDARY)
4819 return false;
4820
4821 if (decl)
4822 {
4823 decl_or_type = decl;
4824 type = TREE_TYPE (decl);
4825 }
4826 else
4827 {
4828 /* We're looking at the CALL_EXPR, we need the type of the function. */
4829 type = CALL_EXPR_FN (exp); /* pointer expression */
4830 type = TREE_TYPE (type); /* pointer type */
4831 type = TREE_TYPE (type); /* function type */
4832 decl_or_type = type;
4833 }
4834
4835 /* Check that the return value locations are the same. Like
4836 if we are returning floats on the 80387 register stack, we cannot
4837 make a sibcall from a function that doesn't return a float to a
4838 function that does or, conversely, from a function that does return
4839 a float to a function that doesn't; the necessary stack adjustment
4840 would not be executed. This is also the place we notice
4841 differences in the return value ABI. Note that it is ok for one
4842 of the functions to have void return type as long as the return
4843 value of the other is passed in a register. */
4844 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4845 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4846 cfun->decl, false);
4847 if (STACK_REG_P (a) || STACK_REG_P (b))
4848 {
4849 if (!rtx_equal_p (a, b))
4850 return false;
4851 }
4852 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4853 ;
4854 else if (!rtx_equal_p (a, b))
4855 return false;
4856
4857 if (TARGET_64BIT)
4858 {
4859 /* The SYSV ABI has more call-clobbered registers;
4860 disallow sibcalls from MS to SYSV. */
4861 if (cfun->machine->call_abi == MS_ABI
4862 && ix86_function_type_abi (type) == SYSV_ABI)
4863 return false;
4864 }
4865 else
4866 {
4867 /* If this call is indirect, we'll need to be able to use a
4868 call-clobbered register for the address of the target function.
4869 Make sure that all such registers are not used for passing
4870 parameters. Note that DLLIMPORT functions are indirect. */
4871 if (!decl
4872 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4873 {
4874 if (ix86_function_regparm (type, NULL) >= 3)
4875 {
4876 /* ??? Need to count the actual number of registers to be used,
4877 not the possible number of registers. Fix later. */
4878 return false;
4879 }
4880 }
4881 }
4882
4883 /* Otherwise okay. That also includes certain types of indirect calls. */
4884 return true;
4885 }
4886
4887 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4888 and "sseregparm" calling convention attributes;
4889 arguments as in struct attribute_spec.handler. */
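/* For illustration, these attributes appear on function types, e.g.
   something like:

     int __attribute__ ((regparm (3))) f (int, int, int);
     int __attribute__ ((fastcall)) g (int, int);

   The handler below diagnoses incompatible combinations such as
   fastcall + regparm.  */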
4890
4891 static tree
4892 ix86_handle_cconv_attribute (tree *node, tree name,
4893 tree args,
4894 int flags ATTRIBUTE_UNUSED,
4895 bool *no_add_attrs)
4896 {
4897 if (TREE_CODE (*node) != FUNCTION_TYPE
4898 && TREE_CODE (*node) != METHOD_TYPE
4899 && TREE_CODE (*node) != FIELD_DECL
4900 && TREE_CODE (*node) != TYPE_DECL)
4901 {
4902 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4903 name);
4904 *no_add_attrs = true;
4905 return NULL_TREE;
4906 }
4907
4908 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4909 if (is_attribute_p ("regparm", name))
4910 {
4911 tree cst;
4912
4913 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4914 {
4915 error ("fastcall and regparm attributes are not compatible");
4916 }
4917
4918 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4919 {
4920 error ("regparam and thiscall attributes are not compatible");
4921 }
4922
4923 cst = TREE_VALUE (args);
4924 if (TREE_CODE (cst) != INTEGER_CST)
4925 {
4926 warning (OPT_Wattributes,
4927 "%qE attribute requires an integer constant argument",
4928 name);
4929 *no_add_attrs = true;
4930 }
4931 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4932 {
4933 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4934 name, REGPARM_MAX);
4935 *no_add_attrs = true;
4936 }
4937
4938 return NULL_TREE;
4939 }
4940
4941 if (TARGET_64BIT)
4942 {
4943 /* Do not warn when emulating the MS ABI. */
4944 if ((TREE_CODE (*node) != FUNCTION_TYPE
4945 && TREE_CODE (*node) != METHOD_TYPE)
4946 || ix86_function_type_abi (*node) != MS_ABI)
4947 warning (OPT_Wattributes, "%qE attribute ignored",
4948 name);
4949 *no_add_attrs = true;
4950 return NULL_TREE;
4951 }
4952
4953 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4954 if (is_attribute_p ("fastcall", name))
4955 {
4956 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4957 {
4958 error ("fastcall and cdecl attributes are not compatible");
4959 }
4960 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4961 {
4962 error ("fastcall and stdcall attributes are not compatible");
4963 }
4964 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4965 {
4966 error ("fastcall and regparm attributes are not compatible");
4967 }
4968 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4969 {
4970 error ("fastcall and thiscall attributes are not compatible");
4971 }
4972 }
4973
4974 /* Can combine stdcall with fastcall (redundant), regparm and
4975 sseregparm. */
4976 else if (is_attribute_p ("stdcall", name))
4977 {
4978 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4979 {
4980 error ("stdcall and cdecl attributes are not compatible");
4981 }
4982 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4983 {
4984 error ("stdcall and fastcall attributes are not compatible");
4985 }
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4987 {
4988 error ("stdcall and thiscall attributes are not compatible");
4989 }
4990 }
4991
4992 /* Can combine cdecl with regparm and sseregparm. */
4993 else if (is_attribute_p ("cdecl", name))
4994 {
4995 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4996 {
4997 error ("stdcall and cdecl attributes are not compatible");
4998 }
4999 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5000 {
5001 error ("fastcall and cdecl attributes are not compatible");
5002 }
5003 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5004 {
5005 error ("cdecl and thiscall attributes are not compatible");
5006 }
5007 }
5008 else if (is_attribute_p ("thiscall", name))
5009 {
5010 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5011 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5012 name);
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5014 {
5015 error ("stdcall and thiscall attributes are not compatible");
5016 }
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5018 {
5019 error ("fastcall and thiscall attributes are not compatible");
5020 }
5021 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5022 {
5023 error ("cdecl and thiscall attributes are not compatible");
5024 }
5025 }
5026
5027 /* Can combine sseregparm with all attributes. */
5028
5029 return NULL_TREE;
5030 }
5031
5032 /* The transactional memory builtins are implicitly regparm or fastcall
5033 depending on the ABI. Override the generic do-nothing attribute that
5034 these builtins were declared with, and replace it with one of the two
5035 attributes that we expect elsewhere. */
5036
5037 static tree
5038 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5039 tree args ATTRIBUTE_UNUSED,
5040 int flags ATTRIBUTE_UNUSED,
5041 bool *no_add_attrs)
5042 {
5043 tree alt;
5044
5045 /* In no case do we want to add the placeholder attribute. */
5046 *no_add_attrs = true;
5047
5048 /* The 64-bit ABI is unchanged for transactional memory. */
5049 if (TARGET_64BIT)
5050 return NULL_TREE;
5051
5052 /* ??? Is there a better way to validate 32-bit windows? We have
5053 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5054 if (CHECK_STACK_LIMIT > 0)
5055 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5056 else
5057 {
5058 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5059 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5060 }
5061 decl_attributes (node, alt, flags);
5062
5063 return NULL_TREE;
5064 }
5065
5066 /* This function determines from TYPE the calling-convention. */
5067
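/* For illustration, the result is a bit mask, so e.g. a 32-bit type
   declared as

     void __attribute__ ((stdcall, regparm (2))) h (int, int);

   would yield IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM, while a
   plain prototype without -mrtd yields IX86_CALLCVT_CDECL.  */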
5068 unsigned int
5069 ix86_get_callcvt (const_tree type)
5070 {
5071 unsigned int ret = 0;
5072 bool is_stdarg;
5073 tree attrs;
5074
5075 if (TARGET_64BIT)
5076 return IX86_CALLCVT_CDECL;
5077
5078 attrs = TYPE_ATTRIBUTES (type);
5079 if (attrs != NULL_TREE)
5080 {
5081 if (lookup_attribute ("cdecl", attrs))
5082 ret |= IX86_CALLCVT_CDECL;
5083 else if (lookup_attribute ("stdcall", attrs))
5084 ret |= IX86_CALLCVT_STDCALL;
5085 else if (lookup_attribute ("fastcall", attrs))
5086 ret |= IX86_CALLCVT_FASTCALL;
5087 else if (lookup_attribute ("thiscall", attrs))
5088 ret |= IX86_CALLCVT_THISCALL;
5089
5090 /* Regparm isn't allowed for thiscall and fastcall. */
5091 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5092 {
5093 if (lookup_attribute ("regparm", attrs))
5094 ret |= IX86_CALLCVT_REGPARM;
5095 if (lookup_attribute ("sseregparm", attrs))
5096 ret |= IX86_CALLCVT_SSEREGPARM;
5097 }
5098
5099 if (IX86_BASE_CALLCVT(ret) != 0)
5100 return ret;
5101 }
5102
5103 is_stdarg = stdarg_p (type);
5104 if (TARGET_RTD && !is_stdarg)
5105 return IX86_CALLCVT_STDCALL | ret;
5106
5107 if (ret != 0
5108 || is_stdarg
5109 || TREE_CODE (type) != METHOD_TYPE
5110 || ix86_function_type_abi (type) != MS_ABI)
5111 return IX86_CALLCVT_CDECL | ret;
5112
5113 return IX86_CALLCVT_THISCALL;
5114 }
5115
5116 /* Return 0 if the attributes for two types are incompatible, 1 if they
5117 are compatible, and 2 if they are nearly compatible (which causes a
5118 warning to be generated). */
5119
5120 static int
5121 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5122 {
5123 unsigned int ccvt1, ccvt2;
5124
5125 if (TREE_CODE (type1) != FUNCTION_TYPE
5126 && TREE_CODE (type1) != METHOD_TYPE)
5127 return 1;
5128
5129 ccvt1 = ix86_get_callcvt (type1);
5130 ccvt2 = ix86_get_callcvt (type2);
5131 if (ccvt1 != ccvt2)
5132 return 0;
5133 if (ix86_function_regparm (type1, NULL)
5134 != ix86_function_regparm (type2, NULL))
5135 return 0;
5136
5137 return 1;
5138 }
5139 \f
5140 /* Return the regparm value for a function with the indicated TYPE and DECL.
5141 DECL may be NULL when calling function indirectly
5142 or considering a libcall. */
5143
5144 static int
5145 ix86_function_regparm (const_tree type, const_tree decl)
5146 {
5147 tree attr;
5148 int regparm;
5149 unsigned int ccvt;
5150
5151 if (TARGET_64BIT)
5152 return (ix86_function_type_abi (type) == SYSV_ABI
5153 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5154 ccvt = ix86_get_callcvt (type);
5155 regparm = ix86_regparm;
5156
5157 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5158 {
5159 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5160 if (attr)
5161 {
5162 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5163 return regparm;
5164 }
5165 }
5166 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5167 return 2;
5168 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5169 return 1;
5170
5171 /* Use register calling convention for local functions when possible. */
5172 if (decl
5173 && TREE_CODE (decl) == FUNCTION_DECL
5174 && optimize
5175 && !(profile_flag && !flag_fentry))
5176 {
5177 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5178 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5179 if (i && i->local && i->can_change_signature)
5180 {
5181 int local_regparm, globals = 0, regno;
5182
5183 /* Make sure no regparm register is taken by a
5184 fixed register variable. */
5185 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5186 if (fixed_regs[local_regparm])
5187 break;
5188
5189 /* We don't want to use regparm(3) for nested functions as
5190 these use a static chain pointer in the third argument. */
5191 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5192 local_regparm = 2;
5193
5194 /* In 32-bit mode save a register for the split stack. */
5195 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5196 local_regparm = 2;
5197
5198 /* Each fixed register usage increases register pressure,
5199 so fewer registers should be used for argument passing.
5200 This functionality can be overridden by an explicit
5201 regparm value. */
5202 for (regno = AX_REG; regno <= DI_REG; regno++)
5203 if (fixed_regs[regno])
5204 globals++;
5205
5206 local_regparm
5207 = globals < local_regparm ? local_regparm - globals : 0;
5208
5209 if (local_regparm > regparm)
5210 regparm = local_regparm;
5211 }
5212 }
5213
5214 return regparm;
5215 }
5216
5217 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5218 DFmode (2) arguments in SSE registers for a function with the
5219 indicated TYPE and DECL. DECL may be NULL when calling function
5220 indirectly or considering a libcall. Otherwise return 0. */
5221
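/* For illustration, something like

     void __attribute__ ((sseregparm)) scale (float x, double y);

   asks for x and y to be passed in SSE registers on IA-32, which is why
   the code below insists that SSE (and SSE2 for DFmode) be enabled.  */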
5222 static int
5223 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5224 {
5225 gcc_assert (!TARGET_64BIT);
5226
5227 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5228 by the sseregparm attribute. */
5229 if (TARGET_SSEREGPARM
5230 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5231 {
5232 if (!TARGET_SSE)
5233 {
5234 if (warn)
5235 {
5236 if (decl)
5237 error ("calling %qD with attribute sseregparm without "
5238 "SSE/SSE2 enabled", decl);
5239 else
5240 error ("calling %qT with attribute sseregparm without "
5241 "SSE/SSE2 enabled", type);
5242 }
5243 return 0;
5244 }
5245
5246 return 2;
5247 }
5248
5249 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5250 (and DFmode for SSE2) arguments in SSE registers. */
5251 if (decl && TARGET_SSE_MATH && optimize
5252 && !(profile_flag && !flag_fentry))
5253 {
5254 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5255 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5256 if (i && i->local && i->can_change_signature)
5257 return TARGET_SSE2 ? 2 : 1;
5258 }
5259
5260 return 0;
5261 }
5262
5263 /* Return true if EAX is live at the start of the function. Used by
5264 ix86_expand_prologue to determine if we need special help before
5265 calling allocate_stack_worker. */
5266
5267 static bool
5268 ix86_eax_live_at_start_p (void)
5269 {
5270 /* Cheat. Don't bother working forward from ix86_function_regparm
5271 to the function type to whether an actual argument is located in
5272 eax. Instead just look at cfg info, which is still close enough
5273 to correct at this point. This gives false positives for broken
5274 functions that might use uninitialized data that happens to be
5275 allocated in eax, but who cares? */
5276 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5277 }
5278
5279 static bool
5280 ix86_keep_aggregate_return_pointer (tree fntype)
5281 {
5282 tree attr;
5283
5284 if (!TARGET_64BIT)
5285 {
5286 attr = lookup_attribute ("callee_pop_aggregate_return",
5287 TYPE_ATTRIBUTES (fntype));
5288 if (attr)
5289 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5290
5291 /* For 32-bit MS-ABI the default is to keep aggregate
5292 return pointer. */
5293 if (ix86_function_type_abi (fntype) == MS_ABI)
5294 return true;
5295 }
5296 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5297 }
5298
5299 /* Value is the number of bytes of arguments automatically
5300 popped when returning from a subroutine call.
5301 FUNDECL is the declaration node of the function (as a tree),
5302 FUNTYPE is the data type of the function (as a tree),
5303 or for a library call it is an identifier node for the subroutine name.
5304 SIZE is the number of bytes of arguments passed on the stack.
5305
5306 On the 80386, the RTD insn may be used to pop them if the number
5307 of args is fixed, but if the number is variable then the caller
5308 must pop them all. RTD can't be used for library calls now
5309 because the library is compiled with the Unix compiler.
5310 Use of RTD is a selectable option, since it is incompatible with
5311 standard Unix calling sequences. If the option is not selected,
5312 the caller must always pop the args.
5313
5314 The attribute stdcall is equivalent to RTD on a per module basis. */
5315
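/* For illustration, with a declaration like

     void __attribute__ ((stdcall)) cb (int a, int b);

   the callee pops the 8 bytes of arguments itself (via "ret $8"), so
   this hook returns SIZE; for a variadic or plain cdecl function it
   returns 0 and the caller pops the arguments.  */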
5316 static int
5317 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5318 {
5319 unsigned int ccvt;
5320
5321 /* None of the 64-bit ABIs pop arguments. */
5322 if (TARGET_64BIT)
5323 return 0;
5324
5325 ccvt = ix86_get_callcvt (funtype);
5326
5327 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5328 | IX86_CALLCVT_THISCALL)) != 0
5329 && ! stdarg_p (funtype))
5330 return size;
5331
5332 /* Lose any fake structure return argument if it is passed on the stack. */
5333 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5334 && !ix86_keep_aggregate_return_pointer (funtype))
5335 {
5336 int nregs = ix86_function_regparm (funtype, fundecl);
5337 if (nregs == 0)
5338 return GET_MODE_SIZE (Pmode);
5339 }
5340
5341 return 0;
5342 }
5343
5344 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5345
5346 static bool
5347 ix86_legitimate_combined_insn (rtx insn)
5348 {
5349 /* Check operand constraints in case hard registers were propagated
5350 into insn pattern. This check prevents combine pass from
5351 generating insn patterns with invalid hard register operands.
5352 These invalid insns can eventually confuse reload to error out
5353 with a spill failure. See also PRs 46829 and 46843. */
5354 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5355 {
5356 int i;
5357
5358 extract_insn (insn);
5359 preprocess_constraints ();
5360
5361 for (i = 0; i < recog_data.n_operands; i++)
5362 {
5363 rtx op = recog_data.operand[i];
5364 enum machine_mode mode = GET_MODE (op);
5365 struct operand_alternative *op_alt;
5366 int offset = 0;
5367 bool win;
5368 int j;
5369
5370 /* A unary operator may be accepted by the predicate, but it
5371 is irrelevant for matching constraints. */
5372 if (UNARY_P (op))
5373 op = XEXP (op, 0);
5374
5375 if (GET_CODE (op) == SUBREG)
5376 {
5377 if (REG_P (SUBREG_REG (op))
5378 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5379 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5380 GET_MODE (SUBREG_REG (op)),
5381 SUBREG_BYTE (op),
5382 GET_MODE (op));
5383 op = SUBREG_REG (op);
5384 }
5385
5386 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5387 continue;
5388
5389 op_alt = recog_op_alt[i];
5390
5391 /* Operand has no constraints, anything is OK. */
5392 win = !recog_data.n_alternatives;
5393
5394 for (j = 0; j < recog_data.n_alternatives; j++)
5395 {
5396 if (op_alt[j].anything_ok
5397 || (op_alt[j].matches != -1
5398 && operands_match_p
5399 (recog_data.operand[i],
5400 recog_data.operand[op_alt[j].matches]))
5401 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5402 {
5403 win = true;
5404 break;
5405 }
5406 }
5407
5408 if (!win)
5409 return false;
5410 }
5411 }
5412
5413 return true;
5414 }
5415 \f
5416 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5417
5418 static unsigned HOST_WIDE_INT
5419 ix86_asan_shadow_offset (void)
5420 {
5421 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
5422 }
5423 \f
5424 /* Argument support functions. */
5425
5426 /* Return true when register may be used to pass function parameters. */
5427 bool
5428 ix86_function_arg_regno_p (int regno)
5429 {
5430 int i;
5431 const int *parm_regs;
5432
5433 if (!TARGET_64BIT)
5434 {
5435 if (TARGET_MACHO)
5436 return (regno < REGPARM_MAX
5437 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5438 else
5439 return (regno < REGPARM_MAX
5440 || (TARGET_MMX && MMX_REGNO_P (regno)
5441 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5442 || (TARGET_SSE && SSE_REGNO_P (regno)
5443 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5444 }
5445
5446 if (TARGET_MACHO)
5447 {
5448 if (SSE_REGNO_P (regno) && TARGET_SSE)
5449 return true;
5450 }
5451 else
5452 {
5453 if (TARGET_SSE && SSE_REGNO_P (regno)
5454 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5455 return true;
5456 }
5457
5458 /* TODO: The function should depend on current function ABI but
5459 builtins.c would need updating then. Therefore we use the
5460 default ABI. */
5461
5462 /* RAX is used as hidden argument to va_arg functions. */
5463 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5464 return true;
5465
5466 if (ix86_abi == MS_ABI)
5467 parm_regs = x86_64_ms_abi_int_parameter_registers;
5468 else
5469 parm_regs = x86_64_int_parameter_registers;
5470 for (i = 0; i < (ix86_abi == MS_ABI
5471 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5472 if (regno == parm_regs[i])
5473 return true;
5474 return false;
5475 }
5476
5477 /* Return true if we do not know how to pass TYPE solely in registers. */
5478
5479 static bool
5480 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5481 {
5482 if (must_pass_in_stack_var_size_or_pad (mode, type))
5483 return true;
5484
5485 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5486 The layout_type routine is crafty and tries to trick us into passing
5487 currently unsupported vector types on the stack by using TImode. */
5488 return (!TARGET_64BIT && mode == TImode
5489 && type && TREE_CODE (type) != VECTOR_TYPE);
5490 }
5491
5492 /* Return the size, in bytes, of the area reserved for arguments passed
5493 in registers for the function represented by FNDECL, depending on the
5494 ABI used. */
5495 int
5496 ix86_reg_parm_stack_space (const_tree fndecl)
5497 {
5498 enum calling_abi call_abi = SYSV_ABI;
5499 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5500 call_abi = ix86_function_abi (fndecl);
5501 else
5502 call_abi = ix86_function_type_abi (fndecl);
5503 if (TARGET_64BIT && call_abi == MS_ABI)
5504 return 32;
5505 return 0;
5506 }
5507
5508 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5509 call ABI used. */
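/* For illustration, the 64-bit ABI can be overridden per function, e.g.
   something like

     void __attribute__ ((ms_abi)) win_callback (void *);

   compiled with the SysV default would return MS_ABI here, and the
   sysv_abi attribute works the other way around.  */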
5510 enum calling_abi
5511 ix86_function_type_abi (const_tree fntype)
5512 {
5513 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5514 {
5515 enum calling_abi abi = ix86_abi;
5516 if (abi == SYSV_ABI)
5517 {
5518 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5519 abi = MS_ABI;
5520 }
5521 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5522 abi = SYSV_ABI;
5523 return abi;
5524 }
5525 return ix86_abi;
5526 }
5527
5528 static bool
5529 ix86_function_ms_hook_prologue (const_tree fn)
5530 {
5531 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5532 {
5533 if (decl_function_context (fn) != NULL_TREE)
5534 error_at (DECL_SOURCE_LOCATION (fn),
5535 "ms_hook_prologue is not compatible with nested function");
5536 else
5537 return true;
5538 }
5539 return false;
5540 }
5541
5542 static enum calling_abi
5543 ix86_function_abi (const_tree fndecl)
5544 {
5545 if (! fndecl)
5546 return ix86_abi;
5547 return ix86_function_type_abi (TREE_TYPE (fndecl));
5548 }
5549
5550 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5551 call ABI used. */
5552 enum calling_abi
5553 ix86_cfun_abi (void)
5554 {
5555 if (! cfun)
5556 return ix86_abi;
5557 return cfun->machine->call_abi;
5558 }
5559
5560 /* Write the extra assembler code needed to declare a function properly. */
5561
5562 void
5563 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5564 tree decl)
5565 {
5566 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5567
5568 if (is_ms_hook)
5569 {
5570 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5571 unsigned int filler_cc = 0xcccccccc;
5572
5573 for (i = 0; i < filler_count; i += 4)
5574 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5575 }
5576
5577 #ifdef SUBTARGET_ASM_UNWIND_INIT
5578 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5579 #endif
5580
5581 ASM_OUTPUT_LABEL (asm_out_file, fname);
5582
5583 /* Output magic byte marker, if hot-patch attribute is set. */
5584 if (is_ms_hook)
5585 {
5586 if (TARGET_64BIT)
5587 {
5588 /* leaq [%rsp + 0], %rsp */
5589 asm_fprintf (asm_out_file, ASM_BYTE
5590 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5591 }
5592 else
5593 {
5594 /* movl.s %edi, %edi
5595 push %ebp
5596 movl.s %esp, %ebp */
5597 asm_fprintf (asm_out_file, ASM_BYTE
5598 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5599 }
5600 }
5601 }
5602
5603 /* regclass.c */
5604 extern void init_regs (void);
5605
5606 /* Implementation of the call ABI switching target hook. The call
5607 register sets specific to FNDECL are set up. See also
5608 ix86_conditional_register_usage for more details. */
5609 void
5610 ix86_call_abi_override (const_tree fndecl)
5611 {
5612 if (fndecl == NULL_TREE)
5613 cfun->machine->call_abi = ix86_abi;
5614 else
5615 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5616 }
5617
5618 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5619 Avoid expensive re-initialization of init_regs each time we switch function
5620 context, since this is needed only during RTL expansion. */
5621 static void
5622 ix86_maybe_switch_abi (void)
5623 {
5624 if (TARGET_64BIT &&
5625 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5626 reinit_regs ();
5627 }
5628
5629 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5630 for a call to a function whose data type is FNTYPE.
5631 For a library call, FNTYPE is 0. */
5632
5633 void
5634 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5635 tree fntype, /* tree ptr for function decl */
5636 rtx libname, /* SYMBOL_REF of library name or 0 */
5637 tree fndecl,
5638 int caller)
5639 {
5640 struct cgraph_local_info *i;
5641
5642 memset (cum, 0, sizeof (*cum));
5643
5644 if (fndecl)
5645 {
5646 i = cgraph_local_info (fndecl);
5647 cum->call_abi = ix86_function_abi (fndecl);
5648 }
5649 else
5650 {
5651 i = NULL;
5652 cum->call_abi = ix86_function_type_abi (fntype);
5653 }
5654
5655 cum->caller = caller;
5656
5657 /* Set up the number of registers to use for passing arguments. */
5658
5659 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5660 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5661 "or subtarget optimization implying it");
5662 cum->nregs = ix86_regparm;
5663 if (TARGET_64BIT)
5664 {
5665 cum->nregs = (cum->call_abi == SYSV_ABI
5666 ? X86_64_REGPARM_MAX
5667 : X86_64_MS_REGPARM_MAX);
5668 }
5669 if (TARGET_SSE)
5670 {
5671 cum->sse_nregs = SSE_REGPARM_MAX;
5672 if (TARGET_64BIT)
5673 {
5674 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5675 ? X86_64_SSE_REGPARM_MAX
5676 : X86_64_MS_SSE_REGPARM_MAX);
5677 }
5678 }
5679 if (TARGET_MMX)
5680 cum->mmx_nregs = MMX_REGPARM_MAX;
5681 cum->warn_avx = true;
5682 cum->warn_sse = true;
5683 cum->warn_mmx = true;
5684
5685 /* Because the type might mismatch between the caller and the callee, we
5686 need to use the actual type of the function for local calls.
5687 FIXME: cgraph_analyze can be told to actually record if a function uses
5688 va_start, so for local functions maybe_vaarg can be made aggressive,
5689 helping K&R code.
5690 FIXME: once the type system is fixed, we won't need this code anymore. */
5691 if (i && i->local && i->can_change_signature)
5692 fntype = TREE_TYPE (fndecl);
5693 cum->maybe_vaarg = (fntype
5694 ? (!prototype_p (fntype) || stdarg_p (fntype))
5695 : !libname);
5696
5697 if (!TARGET_64BIT)
5698 {
5699 /* If there are variable arguments, then we won't pass anything
5700 in registers in 32-bit mode. */
5701 if (stdarg_p (fntype))
5702 {
5703 cum->nregs = 0;
5704 cum->sse_nregs = 0;
5705 cum->mmx_nregs = 0;
5706 cum->warn_avx = 0;
5707 cum->warn_sse = 0;
5708 cum->warn_mmx = 0;
5709 return;
5710 }
5711
5712 /* Use ecx and edx registers if function has fastcall attribute,
5713 else look for regparm information. */
5714 if (fntype)
5715 {
5716 unsigned int ccvt = ix86_get_callcvt (fntype);
5717 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5718 {
5719 cum->nregs = 1;
5720 cum->fastcall = 1; /* Same first register as in fastcall. */
5721 }
5722 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5723 {
5724 cum->nregs = 2;
5725 cum->fastcall = 1;
5726 }
5727 else
5728 cum->nregs = ix86_function_regparm (fntype, fndecl);
5729 }
5730
5731 /* Set up the number of SSE registers used for passing SFmode
5732 and DFmode arguments. Warn for mismatching ABI. */
5733 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5734 }
5735 }
5736
5737 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5738 But in the case of vector types, it is some vector mode.
5739
5740 When we have only some of our vector isa extensions enabled, then there
5741 are some modes for which vector_mode_supported_p is false. For these
5742 modes, the generic vector support in gcc will choose some non-vector mode
5743 in order to implement the type. By computing the natural mode, we'll
5744 select the proper ABI location for the operand and not depend on whatever
5745 the middle-end decides to do with these vector types.
5746
5747 The middle-end can't deal with vector types > 16 bytes. In this
5748 case, we return the original mode and warn about the ABI change if
5749 CUM isn't NULL. */
5750
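/* For illustration, something like

     typedef float v4sf __attribute__ ((vector_size (16)));
     void consume (v4sf v);

   compiled with SSE disabled: the middle end may pick a non-vector mode
   for v4sf, but the natural mode computed here is still V4SFmode (after
   warning that the ABI changes), so the argument keeps its ABI slot.  */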
5751 static enum machine_mode
5752 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5753 {
5754 enum machine_mode mode = TYPE_MODE (type);
5755
5756 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5757 {
5758 HOST_WIDE_INT size = int_size_in_bytes (type);
5759 if ((size == 8 || size == 16 || size == 32)
5760 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5761 && TYPE_VECTOR_SUBPARTS (type) > 1)
5762 {
5763 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5764
5765 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5766 mode = MIN_MODE_VECTOR_FLOAT;
5767 else
5768 mode = MIN_MODE_VECTOR_INT;
5769
5770 /* Get the mode which has this inner mode and number of units. */
5771 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5772 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5773 && GET_MODE_INNER (mode) == innermode)
5774 {
5775 if (size == 32 && !TARGET_AVX)
5776 {
5777 static bool warnedavx;
5778
5779 if (cum
5780 && !warnedavx
5781 && cum->warn_avx)
5782 {
5783 warnedavx = true;
5784 warning (0, "AVX vector argument without AVX "
5785 "enabled changes the ABI");
5786 }
5787 return TYPE_MODE (type);
5788 }
5789 else if ((size == 8 || size == 16) && !TARGET_SSE)
5790 {
5791 static bool warnedsse;
5792
5793 if (cum
5794 && !warnedsse
5795 && cum->warn_sse)
5796 {
5797 warnedsse = true;
5798 warning (0, "SSE vector argument without SSE "
5799 "enabled changes the ABI");
5800 }
5801 return mode;
5802 }
5803 else
5804 return mode;
5805 }
5806
5807 gcc_unreachable ();
5808 }
5809 }
5810
5811 return mode;
5812 }
5813
5814 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5815 this may not agree with the mode that the type system has chosen for the
5816 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5817 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5818
5819 static rtx
5820 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5821 unsigned int regno)
5822 {
5823 rtx tmp;
5824
5825 if (orig_mode != BLKmode)
5826 tmp = gen_rtx_REG (orig_mode, regno);
5827 else
5828 {
5829 tmp = gen_rtx_REG (mode, regno);
5830 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5831 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5832 }
5833
5834 return tmp;
5835 }
5836
5837 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5838 The goal of this code is to classify each 8 bytes of an incoming argument
5839 by register class and assign registers accordingly. */
5840
5841 /* Return the union class of CLASS1 and CLASS2.
5842 See the x86-64 PS ABI for details. */
5843
5844 static enum x86_64_reg_class
5845 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5846 {
5847 /* Rule #1: If both classes are equal, this is the resulting class. */
5848 if (class1 == class2)
5849 return class1;
5850
5851 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5852 the other class. */
5853 if (class1 == X86_64_NO_CLASS)
5854 return class2;
5855 if (class2 == X86_64_NO_CLASS)
5856 return class1;
5857
5858 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5859 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5860 return X86_64_MEMORY_CLASS;
5861
5862 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5863 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5864 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5865 return X86_64_INTEGERSI_CLASS;
5866 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5867 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5868 return X86_64_INTEGER_CLASS;
5869
5870 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5871 MEMORY is used. */
5872 if (class1 == X86_64_X87_CLASS
5873 || class1 == X86_64_X87UP_CLASS
5874 || class1 == X86_64_COMPLEX_X87_CLASS
5875 || class2 == X86_64_X87_CLASS
5876 || class2 == X86_64_X87UP_CLASS
5877 || class2 == X86_64_COMPLEX_X87_CLASS)
5878 return X86_64_MEMORY_CLASS;
5879
5880 /* Rule #6: Otherwise class SSE is used. */
5881 return X86_64_SSE_CLASS;
5882 }
5883
5884 /* Classify the argument of type TYPE and mode MODE.
5885 CLASSES will be filled by the register class used to pass each word
5886 of the operand. The number of words is returned. In case the parameter
5887 should be passed in memory, 0 is returned. As a special case for zero
5888 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5889
5890 BIT_OFFSET is used internally for handling records and specifies the
5891 offset in bits modulo 256 to avoid overflow cases.
5892
5893 See the x86-64 PS ABI for details.
5894 */
5895
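/* For illustration, something like

     struct s { double d; int i; };

   occupies 16 bytes: the first eightbyte (the double) classifies as an
   SSE class and the second (the int plus padding) as an integer class,
   so the struct is passed in one SSE register and one integer register.
   A struct larger than 32 bytes classifies as MEMORY (0 is returned) and
   is passed on the stack.  */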
5896 static int
5897 classify_argument (enum machine_mode mode, const_tree type,
5898 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5899 {
5900 HOST_WIDE_INT bytes =
5901 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5902 int words
5903 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5904
5905 /* Variable sized entities are always passed/returned in memory. */
5906 if (bytes < 0)
5907 return 0;
5908
5909 if (mode != VOIDmode
5910 && targetm.calls.must_pass_in_stack (mode, type))
5911 return 0;
5912
5913 if (type && AGGREGATE_TYPE_P (type))
5914 {
5915 int i;
5916 tree field;
5917 enum x86_64_reg_class subclasses[MAX_CLASSES];
5918
5919 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5920 if (bytes > 32)
5921 return 0;
5922
5923 for (i = 0; i < words; i++)
5924 classes[i] = X86_64_NO_CLASS;
5925
5926 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5927 signal the memory class, so handle it as a special case. */
5928 if (!words)
5929 {
5930 classes[0] = X86_64_NO_CLASS;
5931 return 1;
5932 }
5933
5934 /* Classify each field of record and merge classes. */
5935 switch (TREE_CODE (type))
5936 {
5937 case RECORD_TYPE:
5938 /* And now merge the fields of structure. */
5939 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5940 {
5941 if (TREE_CODE (field) == FIELD_DECL)
5942 {
5943 int num;
5944
5945 if (TREE_TYPE (field) == error_mark_node)
5946 continue;
5947
5948 /* Bitfields are always classified as integer. Handle them
5949 early, since later code would consider them to be
5950 misaligned integers. */
5951 if (DECL_BIT_FIELD (field))
5952 {
5953 for (i = (int_bit_position (field)
5954 + (bit_offset % 64)) / 8 / 8;
5955 i < ((int_bit_position (field) + (bit_offset % 64))
5956 + tree_low_cst (DECL_SIZE (field), 0)
5957 + 63) / 8 / 8; i++)
5958 classes[i] =
5959 merge_classes (X86_64_INTEGER_CLASS,
5960 classes[i]);
5961 }
5962 else
5963 {
5964 int pos;
5965
5966 type = TREE_TYPE (field);
5967
5968 /* Flexible array member is ignored. */
5969 if (TYPE_MODE (type) == BLKmode
5970 && TREE_CODE (type) == ARRAY_TYPE
5971 && TYPE_SIZE (type) == NULL_TREE
5972 && TYPE_DOMAIN (type) != NULL_TREE
5973 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5974 == NULL_TREE))
5975 {
5976 static bool warned;
5977
5978 if (!warned && warn_psabi)
5979 {
5980 warned = true;
5981 inform (input_location,
5982 "the ABI of passing struct with"
5983 " a flexible array member has"
5984 " changed in GCC 4.4");
5985 }
5986 continue;
5987 }
5988 num = classify_argument (TYPE_MODE (type), type,
5989 subclasses,
5990 (int_bit_position (field)
5991 + bit_offset) % 256);
5992 if (!num)
5993 return 0;
5994 pos = (int_bit_position (field)
5995 + (bit_offset % 64)) / 8 / 8;
5996 for (i = 0; i < num && (i + pos) < words; i++)
5997 classes[i + pos] =
5998 merge_classes (subclasses[i], classes[i + pos]);
5999 }
6000 }
6001 }
6002 break;
6003
6004 case ARRAY_TYPE:
6005 /* Arrays are handled as small records. */
6006 {
6007 int num;
6008 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6009 TREE_TYPE (type), subclasses, bit_offset);
6010 if (!num)
6011 return 0;
6012
6013 /* The partial classes are now full classes. */
6014 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6015 subclasses[0] = X86_64_SSE_CLASS;
6016 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6017 && !((bit_offset % 64) == 0 && bytes == 4))
6018 subclasses[0] = X86_64_INTEGER_CLASS;
6019
6020 for (i = 0; i < words; i++)
6021 classes[i] = subclasses[i % num];
6022
6023 break;
6024 }
6025 case UNION_TYPE:
6026 case QUAL_UNION_TYPE:
6027 /* Unions are similar to RECORD_TYPE but offset is always 0.
6028 */
6029 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6030 {
6031 if (TREE_CODE (field) == FIELD_DECL)
6032 {
6033 int num;
6034
6035 if (TREE_TYPE (field) == error_mark_node)
6036 continue;
6037
6038 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6039 TREE_TYPE (field), subclasses,
6040 bit_offset);
6041 if (!num)
6042 return 0;
6043 for (i = 0; i < num; i++)
6044 classes[i] = merge_classes (subclasses[i], classes[i]);
6045 }
6046 }
6047 break;
6048
6049 default:
6050 gcc_unreachable ();
6051 }
6052
6053 if (words > 2)
6054 {
6055 /* When the size is > 16 bytes, if the first eightbyte isn't
6056 X86_64_SSE_CLASS or any of the others isn't
6057 X86_64_SSEUP_CLASS, everything should be passed in
6058 memory. */
6059 if (classes[0] != X86_64_SSE_CLASS)
6060 return 0;
6061
6062 for (i = 1; i < words; i++)
6063 if (classes[i] != X86_64_SSEUP_CLASS)
6064 return 0;
6065 }
6066
6067 /* Final merger cleanup. */
6068 for (i = 0; i < words; i++)
6069 {
6070 /* If one class is MEMORY, everything should be passed in
6071 memory. */
6072 if (classes[i] == X86_64_MEMORY_CLASS)
6073 return 0;
6074
6075 /* The X86_64_SSEUP_CLASS should be always preceded by
6076 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6077 if (classes[i] == X86_64_SSEUP_CLASS
6078 && classes[i - 1] != X86_64_SSE_CLASS
6079 && classes[i - 1] != X86_64_SSEUP_CLASS)
6080 {
6081 /* The first one should never be X86_64_SSEUP_CLASS. */
6082 gcc_assert (i != 0);
6083 classes[i] = X86_64_SSE_CLASS;
6084 }
6085
6086 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6087 everything should be passed in memory. */
6088 if (classes[i] == X86_64_X87UP_CLASS
6089 && (classes[i - 1] != X86_64_X87_CLASS))
6090 {
6091 static bool warned;
6092
6093 /* The first one should never be X86_64_X87UP_CLASS. */
6094 gcc_assert (i != 0);
6095 if (!warned && warn_psabi)
6096 {
6097 warned = true;
6098 inform (input_location,
6099 "the ABI of passing union with long double"
6100 " has changed in GCC 4.4");
6101 }
6102 return 0;
6103 }
6104 }
6105 return words;
6106 }
6107
6108 /* Compute alignment needed. We align all types to natural boundaries with
6109 exception of XFmode that is aligned to 64bits. */
6110 if (mode != VOIDmode && mode != BLKmode)
6111 {
6112 int mode_alignment = GET_MODE_BITSIZE (mode);
6113
6114 if (mode == XFmode)
6115 mode_alignment = 128;
6116 else if (mode == XCmode)
6117 mode_alignment = 256;
6118 if (COMPLEX_MODE_P (mode))
6119 mode_alignment /= 2;
6120 /* Misaligned fields are always returned in memory. */
6121 if (bit_offset % mode_alignment)
6122 return 0;
6123 }
6124
6125 /* for V1xx modes, just use the base mode */
6126 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6127 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6128 mode = GET_MODE_INNER (mode);
6129
6130 /* Classification of atomic types. */
6131 switch (mode)
6132 {
6133 case SDmode:
6134 case DDmode:
6135 classes[0] = X86_64_SSE_CLASS;
6136 return 1;
6137 case TDmode:
6138 classes[0] = X86_64_SSE_CLASS;
6139 classes[1] = X86_64_SSEUP_CLASS;
6140 return 2;
6141 case DImode:
6142 case SImode:
6143 case HImode:
6144 case QImode:
6145 case CSImode:
6146 case CHImode:
6147 case CQImode:
6148 {
6149 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6150
6151 if (size <= 32)
6152 {
6153 classes[0] = X86_64_INTEGERSI_CLASS;
6154 return 1;
6155 }
6156 else if (size <= 64)
6157 {
6158 classes[0] = X86_64_INTEGER_CLASS;
6159 return 1;
6160 }
6161 else if (size <= 64+32)
6162 {
6163 classes[0] = X86_64_INTEGER_CLASS;
6164 classes[1] = X86_64_INTEGERSI_CLASS;
6165 return 2;
6166 }
6167 else if (size <= 64+64)
6168 {
6169 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6170 return 2;
6171 }
6172 else
6173 gcc_unreachable ();
6174 }
6175 case CDImode:
6176 case TImode:
6177 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6178 return 2;
6179 case COImode:
6180 case OImode:
6181 /* OImode shouldn't be used directly. */
6182 gcc_unreachable ();
6183 case CTImode:
6184 return 0;
6185 case SFmode:
6186 if (!(bit_offset % 64))
6187 classes[0] = X86_64_SSESF_CLASS;
6188 else
6189 classes[0] = X86_64_SSE_CLASS;
6190 return 1;
6191 case DFmode:
6192 classes[0] = X86_64_SSEDF_CLASS;
6193 return 1;
6194 case XFmode:
6195 classes[0] = X86_64_X87_CLASS;
6196 classes[1] = X86_64_X87UP_CLASS;
6197 return 2;
6198 case TFmode:
6199 classes[0] = X86_64_SSE_CLASS;
6200 classes[1] = X86_64_SSEUP_CLASS;
6201 return 2;
6202 case SCmode:
6203 classes[0] = X86_64_SSE_CLASS;
6204 if (!(bit_offset % 64))
6205 return 1;
6206 else
6207 {
6208 static bool warned;
6209
6210 if (!warned && warn_psabi)
6211 {
6212 warned = true;
6213 inform (input_location,
6214 "the ABI of passing structure with complex float"
6215 " member has changed in GCC 4.4");
6216 }
6217 classes[1] = X86_64_SSESF_CLASS;
6218 return 2;
6219 }
6220 case DCmode:
6221 classes[0] = X86_64_SSEDF_CLASS;
6222 classes[1] = X86_64_SSEDF_CLASS;
6223 return 2;
6224 case XCmode:
6225 classes[0] = X86_64_COMPLEX_X87_CLASS;
6226 return 1;
6227 case TCmode:
6228 /* This mode is larger than 16 bytes. */
6229 return 0;
6230 case V8SFmode:
6231 case V8SImode:
6232 case V32QImode:
6233 case V16HImode:
6234 case V4DFmode:
6235 case V4DImode:
6236 classes[0] = X86_64_SSE_CLASS;
6237 classes[1] = X86_64_SSEUP_CLASS;
6238 classes[2] = X86_64_SSEUP_CLASS;
6239 classes[3] = X86_64_SSEUP_CLASS;
6240 return 4;
6241 case V4SFmode:
6242 case V4SImode:
6243 case V16QImode:
6244 case V8HImode:
6245 case V2DFmode:
6246 case V2DImode:
6247 classes[0] = X86_64_SSE_CLASS;
6248 classes[1] = X86_64_SSEUP_CLASS;
6249 return 2;
6250 case V1TImode:
6251 case V1DImode:
6252 case V2SFmode:
6253 case V2SImode:
6254 case V4HImode:
6255 case V8QImode:
6256 classes[0] = X86_64_SSE_CLASS;
6257 return 1;
6258 case BLKmode:
6259 case VOIDmode:
6260 return 0;
6261 default:
6262 gcc_assert (VECTOR_MODE_P (mode));
6263
6264 if (bytes > 16)
6265 return 0;
6266
6267 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6268
6269 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6270 classes[0] = X86_64_INTEGERSI_CLASS;
6271 else
6272 classes[0] = X86_64_INTEGER_CLASS;
6273 classes[1] = X86_64_INTEGER_CLASS;
6274 return 1 + (bytes > 8);
6275 }
6276 }
6277
6278 /* Examine the argument and set the number of registers required in each
6279 class. Return 0 iff the parameter should be passed in memory. */
6280 static int
6281 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6282 int *int_nregs, int *sse_nregs)
6283 {
6284 enum x86_64_reg_class regclass[MAX_CLASSES];
6285 int n = classify_argument (mode, type, regclass, 0);
6286
6287 *int_nregs = 0;
6288 *sse_nregs = 0;
6289 if (!n)
6290 return 0;
6291 for (n--; n >= 0; n--)
6292 switch (regclass[n])
6293 {
6294 case X86_64_INTEGER_CLASS:
6295 case X86_64_INTEGERSI_CLASS:
6296 (*int_nregs)++;
6297 break;
6298 case X86_64_SSE_CLASS:
6299 case X86_64_SSESF_CLASS:
6300 case X86_64_SSEDF_CLASS:
6301 (*sse_nregs)++;
6302 break;
6303 case X86_64_NO_CLASS:
6304 case X86_64_SSEUP_CLASS:
6305 break;
6306 case X86_64_X87_CLASS:
6307 case X86_64_X87UP_CLASS:
6308 if (!in_return)
6309 return 0;
6310 break;
6311 case X86_64_COMPLEX_X87_CLASS:
6312 return in_return ? 2 : 0;
6313 case X86_64_MEMORY_CLASS:
6314 gcc_unreachable ();
6315 }
6316 return 1;
6317 }
6318
6319 /* Construct container for the argument used by GCC interface. See
6320 FUNCTION_ARG for the detailed description. */
6321
6322 static rtx
6323 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6324 const_tree type, int in_return, int nintregs, int nsseregs,
6325 const int *intreg, int sse_regno)
6326 {
6327 /* The following variables hold the static issued_error state. */
6328 static bool issued_sse_arg_error;
6329 static bool issued_sse_ret_error;
6330 static bool issued_x87_ret_error;
6331
6332 enum machine_mode tmpmode;
6333 int bytes =
6334 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6335 enum x86_64_reg_class regclass[MAX_CLASSES];
6336 int n;
6337 int i;
6338 int nexps = 0;
6339 int needed_sseregs, needed_intregs;
6340 rtx exp[MAX_CLASSES];
6341 rtx ret;
6342
6343 n = classify_argument (mode, type, regclass, 0);
6344 if (!n)
6345 return NULL;
6346 if (!examine_argument (mode, type, in_return, &needed_intregs,
6347 &needed_sseregs))
6348 return NULL;
6349 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6350 return NULL;
6351
6352 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6353 some less clueful developer tries to use floating-point anyway. */
6354 if (needed_sseregs && !TARGET_SSE)
6355 {
6356 if (in_return)
6357 {
6358 if (!issued_sse_ret_error)
6359 {
6360 error ("SSE register return with SSE disabled");
6361 issued_sse_ret_error = true;
6362 }
6363 }
6364 else if (!issued_sse_arg_error)
6365 {
6366 error ("SSE register argument with SSE disabled");
6367 issued_sse_arg_error = true;
6368 }
6369 return NULL;
6370 }
6371
6372 /* Likewise, error if the ABI requires us to return values in the
6373 x87 registers and the user specified -mno-80387. */
6374 if (!TARGET_80387 && in_return)
6375 for (i = 0; i < n; i++)
6376 if (regclass[i] == X86_64_X87_CLASS
6377 || regclass[i] == X86_64_X87UP_CLASS
6378 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6379 {
6380 if (!issued_x87_ret_error)
6381 {
6382 error ("x87 register return with x87 disabled");
6383 issued_x87_ret_error = true;
6384 }
6385 return NULL;
6386 }
6387
6388 /* First construct simple cases. Avoid SCmode, since we want to use
6389 single register to pass this type. */
6390 if (n == 1 && mode != SCmode)
6391 switch (regclass[0])
6392 {
6393 case X86_64_INTEGER_CLASS:
6394 case X86_64_INTEGERSI_CLASS:
6395 return gen_rtx_REG (mode, intreg[0]);
6396 case X86_64_SSE_CLASS:
6397 case X86_64_SSESF_CLASS:
6398 case X86_64_SSEDF_CLASS:
6399 if (mode != BLKmode)
6400 return gen_reg_or_parallel (mode, orig_mode,
6401 SSE_REGNO (sse_regno));
6402 break;
6403 case X86_64_X87_CLASS:
6404 case X86_64_COMPLEX_X87_CLASS:
6405 return gen_rtx_REG (mode, FIRST_STACK_REG);
6406 case X86_64_NO_CLASS:
6407 /* Zero sized array, struct or class. */
6408 return NULL;
6409 default:
6410 gcc_unreachable ();
6411 }
6412 if (n == 2
6413 && regclass[0] == X86_64_SSE_CLASS
6414 && regclass[1] == X86_64_SSEUP_CLASS
6415 && mode != BLKmode)
6416 return gen_reg_or_parallel (mode, orig_mode,
6417 SSE_REGNO (sse_regno));
6418 if (n == 4
6419 && regclass[0] == X86_64_SSE_CLASS
6420 && regclass[1] == X86_64_SSEUP_CLASS
6421 && regclass[2] == X86_64_SSEUP_CLASS
6422 && regclass[3] == X86_64_SSEUP_CLASS
6423 && mode != BLKmode)
6424 return gen_reg_or_parallel (mode, orig_mode,
6425 SSE_REGNO (sse_regno));
6426 if (n == 2
6427 && regclass[0] == X86_64_X87_CLASS
6428 && regclass[1] == X86_64_X87UP_CLASS)
6429 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6430
6431 if (n == 2
6432 && regclass[0] == X86_64_INTEGER_CLASS
6433 && regclass[1] == X86_64_INTEGER_CLASS
6434 && (mode == CDImode || mode == TImode || mode == TFmode)
6435 && intreg[0] + 1 == intreg[1])
6436 return gen_rtx_REG (mode, intreg[0]);
6437
6438 /* Otherwise figure out the entries of the PARALLEL. */
6439 for (i = 0; i < n; i++)
6440 {
6441 int pos;
6442
6443 switch (regclass[i])
6444 {
6445 case X86_64_NO_CLASS:
6446 break;
6447 case X86_64_INTEGER_CLASS:
6448 case X86_64_INTEGERSI_CLASS:
6449 /* Merge TImodes on aligned occasions here too. */
6450 if (i * 8 + 8 > bytes)
6451 tmpmode
6452 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6453 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6454 tmpmode = SImode;
6455 else
6456 tmpmode = DImode;
6457 /* We've requested 24 bytes for which we
6458 don't have a mode. Use DImode. */
6459 if (tmpmode == BLKmode)
6460 tmpmode = DImode;
6461 exp [nexps++]
6462 = gen_rtx_EXPR_LIST (VOIDmode,
6463 gen_rtx_REG (tmpmode, *intreg),
6464 GEN_INT (i*8));
6465 intreg++;
6466 break;
6467 case X86_64_SSESF_CLASS:
6468 exp [nexps++]
6469 = gen_rtx_EXPR_LIST (VOIDmode,
6470 gen_rtx_REG (SFmode,
6471 SSE_REGNO (sse_regno)),
6472 GEN_INT (i*8));
6473 sse_regno++;
6474 break;
6475 case X86_64_SSEDF_CLASS:
6476 exp [nexps++]
6477 = gen_rtx_EXPR_LIST (VOIDmode,
6478 gen_rtx_REG (DFmode,
6479 SSE_REGNO (sse_regno)),
6480 GEN_INT (i*8));
6481 sse_regno++;
6482 break;
6483 case X86_64_SSE_CLASS:
6484 pos = i;
6485 switch (n)
6486 {
6487 case 1:
6488 tmpmode = DImode;
6489 break;
6490 case 2:
6491 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6492 {
6493 tmpmode = TImode;
6494 i++;
6495 }
6496 else
6497 tmpmode = DImode;
6498 break;
6499 case 4:
6500 gcc_assert (i == 0
6501 && regclass[1] == X86_64_SSEUP_CLASS
6502 && regclass[2] == X86_64_SSEUP_CLASS
6503 && regclass[3] == X86_64_SSEUP_CLASS);
6504 tmpmode = OImode;
6505 i += 3;
6506 break;
6507 default:
6508 gcc_unreachable ();
6509 }
6510 exp [nexps++]
6511 = gen_rtx_EXPR_LIST (VOIDmode,
6512 gen_rtx_REG (tmpmode,
6513 SSE_REGNO (sse_regno)),
6514 GEN_INT (pos*8));
6515 sse_regno++;
6516 break;
6517 default:
6518 gcc_unreachable ();
6519 }
6520 }
6521
6522 /* Empty aligned struct, union or class. */
6523 if (nexps == 0)
6524 return NULL;
6525
6526 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6527 for (i = 0; i < nexps; i++)
6528 XVECEXP (ret, 0, i) = exp [i];
6529 return ret;
6530 }
6531
6532 /* Update the data in CUM to advance over an argument of mode MODE
6533 and data type TYPE. (TYPE is null for libcalls where that information
6534 may not be available.) */
6535
6536 static void
6537 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6538 const_tree type, HOST_WIDE_INT bytes,
6539 HOST_WIDE_INT words)
6540 {
6541 switch (mode)
6542 {
6543 default:
6544 break;
6545
6546 case BLKmode:
6547 if (bytes < 0)
6548 break;
6549 /* FALLTHRU */
6550
6551 case DImode:
6552 case SImode:
6553 case HImode:
6554 case QImode:
6555 cum->words += words;
6556 cum->nregs -= words;
6557 cum->regno += words;
6558
6559 if (cum->nregs <= 0)
6560 {
6561 cum->nregs = 0;
6562 cum->regno = 0;
6563 }
6564 break;
6565
6566 case OImode:
6567 /* OImode shouldn't be used directly. */
6568 gcc_unreachable ();
6569
6570 case DFmode:
6571 if (cum->float_in_sse < 2)
6572 break;
6573 case SFmode:
6574 if (cum->float_in_sse < 1)
6575 break;
6576 /* FALLTHRU */
6577
6578 case V8SFmode:
6579 case V8SImode:
6580 case V32QImode:
6581 case V16HImode:
6582 case V4DFmode:
6583 case V4DImode:
6584 case TImode:
6585 case V16QImode:
6586 case V8HImode:
6587 case V4SImode:
6588 case V2DImode:
6589 case V4SFmode:
6590 case V2DFmode:
6591 if (!type || !AGGREGATE_TYPE_P (type))
6592 {
6593 cum->sse_words += words;
6594 cum->sse_nregs -= 1;
6595 cum->sse_regno += 1;
6596 if (cum->sse_nregs <= 0)
6597 {
6598 cum->sse_nregs = 0;
6599 cum->sse_regno = 0;
6600 }
6601 }
6602 break;
6603
6604 case V8QImode:
6605 case V4HImode:
6606 case V2SImode:
6607 case V2SFmode:
6608 case V1TImode:
6609 case V1DImode:
6610 if (!type || !AGGREGATE_TYPE_P (type))
6611 {
6612 cum->mmx_words += words;
6613 cum->mmx_nregs -= 1;
6614 cum->mmx_regno += 1;
6615 if (cum->mmx_nregs <= 0)
6616 {
6617 cum->mmx_nregs = 0;
6618 cum->mmx_regno = 0;
6619 }
6620 }
6621 break;
6622 }
6623 }
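
/* Illustrative note for the 32-bit bookkeeping above (not compiler code,
   register order hedged): under -mregparm=3 a call such as

       int f (int a, int b, int c, int d);

   hands the first three integer-class arguments out of the argument
   registers (typically EAX, EDX and ECX), so cum->nregs above walks
   3 -> 2 -> 1 -> 0 and D is pushed on the stack.  */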
6624
6625 static void
6626 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6627 const_tree type, HOST_WIDE_INT words, bool named)
6628 {
6629 int int_nregs, sse_nregs;
6630
6631 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6632 if (!named && VALID_AVX256_REG_MODE (mode))
6633 return;
6634
6635 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6636 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6637 {
6638 cum->nregs -= int_nregs;
6639 cum->sse_nregs -= sse_nregs;
6640 cum->regno += int_nregs;
6641 cum->sse_regno += sse_nregs;
6642 }
6643 else
6644 {
6645 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6646 cum->words = (cum->words + align - 1) & ~(align - 1);
6647 cum->words += words;
6648 }
6649 }
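
/* Illustrative sketch of the SysV x86-64 bookkeeping above (psABI
   behaviour, not compiler code).  For a named argument of type

       struct s { double d; long l; };

   examine_argument classifies the first eightbyte as SSE and the second
   as INTEGER, so one SSE register and one integer register are consumed
   (cum->sse_nregs and cum->nregs each drop by one).  If either register
   class is exhausted, the whole argument goes to the stack and only
   cum->words advances, after rounding up to the argument's alignment.  */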
6650
6651 static void
6652 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6653 HOST_WIDE_INT words)
6654 {
6655 /* Anything not of size 1, 2, 4 or 8 should have been passed indirectly. */
6656 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6657
6658 cum->words += words;
6659 if (cum->nregs > 0)
6660 {
6661 cum->nregs -= 1;
6662 cum->regno += 1;
6663 }
6664 }
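
/* For reference (hedged summary of the MS x64 convention): every
   parameter occupies exactly one 8-byte slot, which is why the assert
   above only accepts sizes of 1, 2, 4 or 8 bytes; anything larger has
   already been replaced by a pointer via ix86_pass_by_reference.  The
   first four slots correspond to RCX, RDX, R8 and R9 (or XMM0-XMM3 for
   floating-point values).  */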
6665
6666 /* Update the data in CUM to advance over an argument of mode MODE and
6667 data type TYPE. (TYPE is null for libcalls where that information
6668 may not be available.) */
6669
6670 static void
6671 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6672 const_tree type, bool named)
6673 {
6674 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6675 HOST_WIDE_INT bytes, words;
6676
6677 if (mode == BLKmode)
6678 bytes = int_size_in_bytes (type);
6679 else
6680 bytes = GET_MODE_SIZE (mode);
6681 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6682
6683 if (type)
6684 mode = type_natural_mode (type, NULL);
6685
6686 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6687 function_arg_advance_ms_64 (cum, bytes, words);
6688 else if (TARGET_64BIT)
6689 function_arg_advance_64 (cum, mode, type, words, named);
6690 else
6691 function_arg_advance_32 (cum, mode, type, bytes, words);
6692 }
6693
6694 /* Define where to put the arguments to a function.
6695 Value is zero to push the argument on the stack,
6696 or a hard register in which to store the argument.
6697
6698 MODE is the argument's machine mode.
6699 TYPE is the data type of the argument (as a tree).
6700 This is null for libcalls where that information may
6701 not be available.
6702 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6703 the preceding args and about the function being called.
6704 NAMED is nonzero if this argument is a named parameter
6705 (otherwise it is an extra parameter matching an ellipsis). */
6706
6707 static rtx
6708 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6709 enum machine_mode orig_mode, const_tree type,
6710 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6711 {
6712 static bool warnedsse, warnedmmx;
6713
6714 /* Avoid the AL settings for the Unix64 ABI. */
6715 if (mode == VOIDmode)
6716 return constm1_rtx;
6717
6718 switch (mode)
6719 {
6720 default:
6721 break;
6722
6723 case BLKmode:
6724 if (bytes < 0)
6725 break;
6726 /* FALLTHRU */
6727 case DImode:
6728 case SImode:
6729 case HImode:
6730 case QImode:
6731 if (words <= cum->nregs)
6732 {
6733 int regno = cum->regno;
6734
6735 /* Fastcall allocates the first two DWORD (SImode) or
6736 smaller arguments to ECX and EDX if the argument isn't
6737 an aggregate type. */
6738 if (cum->fastcall)
6739 {
6740 if (mode == BLKmode
6741 || mode == DImode
6742 || (type && AGGREGATE_TYPE_P (type)))
6743 break;
6744
6745 /* ECX, not EAX, is the first allocated register. */
6746 if (regno == AX_REG)
6747 regno = CX_REG;
6748 }
6749 return gen_rtx_REG (mode, regno);
6750 }
6751 break;
6752
6753 case DFmode:
6754 if (cum->float_in_sse < 2)
6755 break;
6756 case SFmode:
6757 if (cum->float_in_sse < 1)
6758 break;
6759 /* FALLTHRU */
6760 case TImode:
6761 /* In 32bit, we pass TImode in xmm registers. */
6762 case V16QImode:
6763 case V8HImode:
6764 case V4SImode:
6765 case V2DImode:
6766 case V4SFmode:
6767 case V2DFmode:
6768 if (!type || !AGGREGATE_TYPE_P (type))
6769 {
6770 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6771 {
6772 warnedsse = true;
6773 warning (0, "SSE vector argument without SSE enabled "
6774 "changes the ABI");
6775 }
6776 if (cum->sse_nregs)
6777 return gen_reg_or_parallel (mode, orig_mode,
6778 cum->sse_regno + FIRST_SSE_REG);
6779 }
6780 break;
6781
6782 case OImode:
6783 /* OImode shouldn't be used directly. */
6784 gcc_unreachable ();
6785
6786 case V8SFmode:
6787 case V8SImode:
6788 case V32QImode:
6789 case V16HImode:
6790 case V4DFmode:
6791 case V4DImode:
6792 if (!type || !AGGREGATE_TYPE_P (type))
6793 {
6794 if (cum->sse_nregs)
6795 return gen_reg_or_parallel (mode, orig_mode,
6796 cum->sse_regno + FIRST_SSE_REG);
6797 }
6798 break;
6799
6800 case V8QImode:
6801 case V4HImode:
6802 case V2SImode:
6803 case V2SFmode:
6804 case V1TImode:
6805 case V1DImode:
6806 if (!type || !AGGREGATE_TYPE_P (type))
6807 {
6808 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6809 {
6810 warnedmmx = true;
6811 warning (0, "MMX vector argument without MMX enabled "
6812 "changes the ABI");
6813 }
6814 if (cum->mmx_nregs)
6815 return gen_reg_or_parallel (mode, orig_mode,
6816 cum->mmx_regno + FIRST_MMX_REG);
6817 }
6818 break;
6819 }
6820
6821 return NULL_RTX;
6822 }
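
/* Worked example for the 32-bit case above (illustration only): given

       __attribute__((fastcall)) int f (int a, int b, long long c);

   A lands in ECX, B in EDX, and C goes to the stack because DImode
   arguments are excluded by the fastcall check above.  SSE/MMX vector
   arguments, when the corresponding register file is available, are
   instead handed out from cum->sse_regno / cum->mmx_regno.  */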
6823
6824 static rtx
6825 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6826 enum machine_mode orig_mode, const_tree type, bool named)
6827 {
6828 /* Handle a hidden AL argument containing number of registers
6829 for varargs x86-64 functions. */
6830 if (mode == VOIDmode)
6831 return GEN_INT (cum->maybe_vaarg
6832 ? (cum->sse_nregs < 0
6833 ? X86_64_SSE_REGPARM_MAX
6834 : cum->sse_regno)
6835 : -1);
6836
6837 switch (mode)
6838 {
6839 default:
6840 break;
6841
6842 case V8SFmode:
6843 case V8SImode:
6844 case V32QImode:
6845 case V16HImode:
6846 case V4DFmode:
6847 case V4DImode:
6848 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6849 if (!named)
6850 return NULL;
6851 break;
6852 }
6853
6854 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6855 cum->sse_nregs,
6856 &x86_64_int_parameter_registers [cum->regno],
6857 cum->sse_regno);
6858 }
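
/* Background for the VOIDmode case above (psABI behaviour, stated as a
   hedged note): when calling a varargs function under the SysV x86-64
   ABI, AL carries an upper bound on the number of SSE registers used.
   For example, printf ("%f\n", 3.14) is emitted with AL = 1 and the
   double in XMM0, so the callee's prologue knows whether it must spill
   the XMM argument registers to the register save area.  */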
6859
6860 static rtx
6861 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6862 enum machine_mode orig_mode, bool named,
6863 HOST_WIDE_INT bytes)
6864 {
6865 unsigned int regno;
6866
6867 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6868 We use a value of -2 to specify that the current function call is MS ABI. */
6869 if (mode == VOIDmode)
6870 return GEN_INT (-2);
6871
6872 /* If we've run out of registers, it goes on the stack. */
6873 if (cum->nregs == 0)
6874 return NULL_RTX;
6875
6876 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6877
6878 /* Only floating point modes are passed in anything but integer regs. */
6879 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6880 {
6881 if (named)
6882 regno = cum->regno + FIRST_SSE_REG;
6883 else
6884 {
6885 rtx t1, t2;
6886
6887 /* Unnamed floating parameters are passed in both the
6888 SSE and integer registers. */
6889 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6890 t2 = gen_rtx_REG (mode, regno);
6891 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6892 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6893 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6894 }
6895 }
6896 /* Handle aggregated types passed in register. */
6897 if (orig_mode == BLKmode)
6898 {
6899 if (bytes > 0 && bytes <= 8)
6900 mode = (bytes > 4 ? DImode : SImode);
6901 if (mode == BLKmode)
6902 mode = DImode;
6903 }
6904
6905 return gen_reg_or_parallel (mode, orig_mode, regno);
6906 }
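
/* Note on the PARALLEL built above (illustration, not a normative ABI
   statement): for an unnamed floating argument in an MS-ABI varargs
   call, e.g. the double X in

       void log_it (const char *fmt, ...);  log_it ("%f", x);

   the value is described in both the SSE slot (XMM1 here) and the
   matching integer register (RDX), so a va_arg implementation that only
   looks at the integer register area still finds it.  */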
6907
6908 /* Return where to put the arguments to a function.
6909 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6910
6911 MODE is the argument's machine mode. TYPE is the data type of the
6912 argument. It is null for libcalls where that information may not be
6913 available. CUM gives information about the preceding args and about
6914 the function being called. NAMED is nonzero if this argument is a
6915 named parameter (otherwise it is an extra parameter matching an
6916 ellipsis). */
6917
6918 static rtx
6919 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6920 const_tree type, bool named)
6921 {
6922 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6923 enum machine_mode mode = omode;
6924 HOST_WIDE_INT bytes, words;
6925 rtx arg;
6926
6927 if (mode == BLKmode)
6928 bytes = int_size_in_bytes (type);
6929 else
6930 bytes = GET_MODE_SIZE (mode);
6931 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6932
6933 /* To simplify the code below, represent vector types with a vector mode
6934 even if MMX/SSE are not active. */
6935 if (type && TREE_CODE (type) == VECTOR_TYPE)
6936 mode = type_natural_mode (type, cum);
6937
6938 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6939 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6940 else if (TARGET_64BIT)
6941 arg = function_arg_64 (cum, mode, omode, type, named);
6942 else
6943 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6944
6945 return arg;
6946 }
6947
6948 /* A C expression that indicates when an argument must be passed by
6949 reference. If nonzero for an argument, a copy of that argument is
6950 made in memory and a pointer to the argument is passed instead of
6951 the argument itself. The pointer is passed in whatever way is
6952 appropriate for passing a pointer to that type. */
6953
6954 static bool
6955 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6956 enum machine_mode mode ATTRIBUTE_UNUSED,
6957 const_tree type, bool named ATTRIBUTE_UNUSED)
6958 {
6959 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6960
6961 /* See Windows x64 Software Convention. */
6962 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6963 {
6964 int msize = (int) GET_MODE_SIZE (mode);
6965 if (type)
6966 {
6967 /* Arrays are passed by reference. */
6968 if (TREE_CODE (type) == ARRAY_TYPE)
6969 return true;
6970
6971 if (AGGREGATE_TYPE_P (type))
6972 {
6973 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6974 are passed by reference. */
6975 msize = int_size_in_bytes (type);
6976 }
6977 }
6978
6979 /* __m128 is passed by reference. */
6980 switch (msize) {
6981 case 1: case 2: case 4: case 8:
6982 break;
6983 default:
6984 return true;
6985 }
6986 }
6987 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6988 return 1;
6989
6990 return 0;
6991 }
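
/* Quick reference for the checks above (hedged summary, not normative):
   under the MS x64 convention a 12-byte struct or an __m128 is passed
   by reference because its size is not 1, 2, 4 or 8 bytes, an 8-byte
   struct is passed by value, and array types (for languages that pass
   arrays by value) always go by reference.  Under the SysV x86-64 ABI
   only variable-sized types, where int_size_in_bytes returns -1, are
   forced through this hook.  */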
6992
6993 /* Return true when TYPE should be 128bit aligned for 32bit argument
6994 passing ABI. XXX: This function is obsolete and is only used for
6995 checking psABI compatibility with previous versions of GCC. */
6996
6997 static bool
6998 ix86_compat_aligned_value_p (const_tree type)
6999 {
7000 enum machine_mode mode = TYPE_MODE (type);
7001 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7002 || mode == TDmode
7003 || mode == TFmode
7004 || mode == TCmode)
7005 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7006 return true;
7007 if (TYPE_ALIGN (type) < 128)
7008 return false;
7009
7010 if (AGGREGATE_TYPE_P (type))
7011 {
7012 /* Walk the aggregates recursively. */
7013 switch (TREE_CODE (type))
7014 {
7015 case RECORD_TYPE:
7016 case UNION_TYPE:
7017 case QUAL_UNION_TYPE:
7018 {
7019 tree field;
7020
7021 /* Walk all the structure fields. */
7022 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7023 {
7024 if (TREE_CODE (field) == FIELD_DECL
7025 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7026 return true;
7027 }
7028 break;
7029 }
7030
7031 case ARRAY_TYPE:
7032 /* Just for use if some languages pass arrays by value. */
7033 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7034 return true;
7035 break;
7036
7037 default:
7038 gcc_unreachable ();
7039 }
7040 }
7041 return false;
7042 }
7043
7044 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7045 XXX: This function is obsolete and is only used for checking psABI
7046 compatibility with previous versions of GCC. */
7047
7048 static unsigned int
7049 ix86_compat_function_arg_boundary (enum machine_mode mode,
7050 const_tree type, unsigned int align)
7051 {
7052 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7053 natural boundaries. */
7054 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7055 {
7056 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7057 make an exception for SSE modes since these require 128bit
7058 alignment.
7059
7060 The handling here differs from field_alignment. ICC aligns MMX
7061 arguments to 4 byte boundaries, while structure fields are aligned
7062 to 8 byte boundaries. */
7063 if (!type)
7064 {
7065 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7066 align = PARM_BOUNDARY;
7067 }
7068 else
7069 {
7070 if (!ix86_compat_aligned_value_p (type))
7071 align = PARM_BOUNDARY;
7072 }
7073 }
7074 if (align > BIGGEST_ALIGNMENT)
7075 align = BIGGEST_ALIGNMENT;
7076 return align;
7077 }
7078
7079 /* Return true when TYPE should be 128bit aligned for 32bit argument
7080 passing ABI. */
7081
7082 static bool
7083 ix86_contains_aligned_value_p (const_tree type)
7084 {
7085 enum machine_mode mode = TYPE_MODE (type);
7086
7087 if (mode == XFmode || mode == XCmode)
7088 return false;
7089
7090 if (TYPE_ALIGN (type) < 128)
7091 return false;
7092
7093 if (AGGREGATE_TYPE_P (type))
7094 {
7095 /* Walk the aggregates recursively. */
7096 switch (TREE_CODE (type))
7097 {
7098 case RECORD_TYPE:
7099 case UNION_TYPE:
7100 case QUAL_UNION_TYPE:
7101 {
7102 tree field;
7103
7104 /* Walk all the structure fields. */
7105 for (field = TYPE_FIELDS (type);
7106 field;
7107 field = DECL_CHAIN (field))
7108 {
7109 if (TREE_CODE (field) == FIELD_DECL
7110 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7111 return true;
7112 }
7113 break;
7114 }
7115
7116 case ARRAY_TYPE:
7117 /* Just for use if some languages pass arrays by value. */
7118 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7119 return true;
7120 break;
7121
7122 default:
7123 gcc_unreachable ();
7124 }
7125 }
7126 else
7127 return TYPE_ALIGN (type) >= 128;
7128
7129 return false;
7130 }
7131
7132 /* Gives the alignment boundary, in bits, of an argument with the
7133 specified mode and type. */
7134
7135 static unsigned int
7136 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7137 {
7138 unsigned int align;
7139 if (type)
7140 {
7141 /* Since the main variant type is what is used for the call, convert
7142 TYPE to its main variant. */
7143 type = TYPE_MAIN_VARIANT (type);
7144 align = TYPE_ALIGN (type);
7145 }
7146 else
7147 align = GET_MODE_ALIGNMENT (mode);
7148 if (align < PARM_BOUNDARY)
7149 align = PARM_BOUNDARY;
7150 else
7151 {
7152 static bool warned;
7153 unsigned int saved_align = align;
7154
7155 if (!TARGET_64BIT)
7156 {
7157 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7158 if (!type)
7159 {
7160 if (mode == XFmode || mode == XCmode)
7161 align = PARM_BOUNDARY;
7162 }
7163 else if (!ix86_contains_aligned_value_p (type))
7164 align = PARM_BOUNDARY;
7165
7166 if (align < 128)
7167 align = PARM_BOUNDARY;
7168 }
7169
7170 if (warn_psabi
7171 && !warned
7172 && align != ix86_compat_function_arg_boundary (mode, type,
7173 saved_align))
7174 {
7175 warned = true;
7176 inform (input_location,
7177 "The ABI for passing parameters with %d-byte"
7178 " alignment has changed in GCC 4.6",
7179 align / BITS_PER_UNIT);
7180 }
7181 }
7182
7183 return align;
7184 }
7185
7186 /* Return true if N is a possible register number of function value. */
7187
7188 static bool
7189 ix86_function_value_regno_p (const unsigned int regno)
7190 {
7191 switch (regno)
7192 {
7193 case AX_REG:
7194 return true;
7195
7196 case FIRST_FLOAT_REG:
7197 /* TODO: The function should depend on the current function's ABI, but
7198 builtins.c would need updating then. Therefore we use the
7199 default ABI. */
7200 if (TARGET_64BIT && ix86_abi == MS_ABI)
7201 return false;
7202 return TARGET_FLOAT_RETURNS_IN_80387;
7203
7204 case FIRST_SSE_REG:
7205 return TARGET_SSE;
7206
7207 case FIRST_MMX_REG:
7208 if (TARGET_MACHO || TARGET_64BIT)
7209 return false;
7210 return TARGET_MMX;
7211 }
7212
7213 return false;
7214 }
7215
7216 /* Define how to find the value returned by a function.
7217 VALTYPE is the data type of the value (as a tree).
7218 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7219 otherwise, FUNC is 0. */
7220
7221 static rtx
7222 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7223 const_tree fntype, const_tree fn)
7224 {
7225 unsigned int regno;
7226
7227 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7228 we normally prevent this case when mmx is not available. However
7229 some ABIs may require the result to be returned like DImode. */
7230 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7231 regno = FIRST_MMX_REG;
7232
7233 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7234 we prevent this case when sse is not available. However some ABIs
7235 may require the result to be returned like integer TImode. */
7236 else if (mode == TImode
7237 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7238 regno = FIRST_SSE_REG;
7239
7240 /* 32-byte vector modes in %ymm0. */
7241 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7242 regno = FIRST_SSE_REG;
7243
7244 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7245 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7246 regno = FIRST_FLOAT_REG;
7247 else
7248 /* Most things go in %eax. */
7249 regno = AX_REG;
7250
7251 /* Override FP return register with %xmm0 for local functions when
7252 SSE math is enabled or for functions with sseregparm attribute. */
7253 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7254 {
7255 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7256 if ((sse_level >= 1 && mode == SFmode)
7257 || (sse_level == 2 && mode == DFmode))
7258 regno = FIRST_SSE_REG;
7259 }
7260
7261 /* OImode shouldn't be used directly. */
7262 gcc_assert (mode != OImode);
7263
7264 return gen_rtx_REG (orig_mode, regno);
7265 }
7266
7267 static rtx
7268 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7269 const_tree valtype)
7270 {
7271 rtx ret;
7272
7273 /* Handle libcalls, which don't provide a type node. */
7274 if (valtype == NULL)
7275 {
7276 unsigned int regno;
7277
7278 switch (mode)
7279 {
7280 case SFmode:
7281 case SCmode:
7282 case DFmode:
7283 case DCmode:
7284 case TFmode:
7285 case SDmode:
7286 case DDmode:
7287 case TDmode:
7288 regno = FIRST_SSE_REG;
7289 break;
7290 case XFmode:
7291 case XCmode:
7292 regno = FIRST_FLOAT_REG;
7293 break;
7294 case TCmode:
7295 return NULL;
7296 default:
7297 regno = AX_REG;
7298 }
7299
7300 return gen_rtx_REG (mode, regno);
7301 }
7302 else if (POINTER_TYPE_P (valtype))
7303 {
7304 /* Pointers are always returned in word_mode. */
7305 mode = word_mode;
7306 }
7307
7308 ret = construct_container (mode, orig_mode, valtype, 1,
7309 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7310 x86_64_int_return_registers, 0);
7311
7312 /* For zero-sized structures, construct_container returns NULL, but we
7313 need to keep the rest of the compiler happy by returning a meaningful value. */
7314 if (!ret)
7315 ret = gen_rtx_REG (orig_mode, AX_REG);
7316
7317 return ret;
7318 }
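
/* Worked examples for the 64-bit return path above (psABI behaviour,
   hedged):

       long f (void);                        returned in RAX
       double g (void);                      returned in XMM0
       struct p { long a, b; } h (void);     returned in RAX and RDX
       struct q { double x; long y; } k ();  returned in XMM0 and RAX

   construct_container builds the PARALLEL describing the last two
   cases; zero-sized aggregates fall back to a dummy RAX register as
   noted above.  */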
7319
7320 static rtx
7321 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7322 {
7323 unsigned int regno = AX_REG;
7324
7325 if (TARGET_SSE)
7326 {
7327 switch (GET_MODE_SIZE (mode))
7328 {
7329 case 16:
7330 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7331 && !COMPLEX_MODE_P (mode))
7332 regno = FIRST_SSE_REG;
7333 break;
7334 case 8:
7335 case 4:
7336 if (mode == SFmode || mode == DFmode)
7337 regno = FIRST_SSE_REG;
7338 break;
7339 default:
7340 break;
7341 }
7342 }
7343 return gen_rtx_REG (orig_mode, regno);
7344 }
7345
7346 static rtx
7347 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7348 enum machine_mode orig_mode, enum machine_mode mode)
7349 {
7350 const_tree fn, fntype;
7351
7352 fn = NULL_TREE;
7353 if (fntype_or_decl && DECL_P (fntype_or_decl))
7354 fn = fntype_or_decl;
7355 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7356
7357 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7358 return function_value_ms_64 (orig_mode, mode);
7359 else if (TARGET_64BIT)
7360 return function_value_64 (orig_mode, mode, valtype);
7361 else
7362 return function_value_32 (orig_mode, mode, fntype, fn);
7363 }
7364
7365 static rtx
7366 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7367 bool outgoing ATTRIBUTE_UNUSED)
7368 {
7369 enum machine_mode mode, orig_mode;
7370
7371 orig_mode = TYPE_MODE (valtype);
7372 mode = type_natural_mode (valtype, NULL);
7373 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7374 }
7375
7376 /* Pointer function arguments and return values are promoted to
7377 word_mode. */
7378
7379 static enum machine_mode
7380 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7381 int *punsignedp, const_tree fntype,
7382 int for_return)
7383 {
7384 if (type != NULL_TREE && POINTER_TYPE_P (type))
7385 {
7386 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7387 return word_mode;
7388 }
7389 return default_promote_function_mode (type, mode, punsignedp, fntype,
7390 for_return);
7391 }
7392
7393 /* Return true if a structure, union or array with MODE containing FIELD
7394 should be accessed using BLKmode. */
7395
7396 static bool
7397 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7398 {
7399 /* Union with XFmode must be in BLKmode. */
7400 return (mode == XFmode
7401 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7402 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7403 }
7404
7405 rtx
7406 ix86_libcall_value (enum machine_mode mode)
7407 {
7408 return ix86_function_value_1 (NULL, NULL, mode, mode);
7409 }
7410
7411 /* Return true iff type is returned in memory. */
7412
7413 static bool ATTRIBUTE_UNUSED
7414 return_in_memory_32 (const_tree type, enum machine_mode mode)
7415 {
7416 HOST_WIDE_INT size;
7417
7418 if (mode == BLKmode)
7419 return true;
7420
7421 size = int_size_in_bytes (type);
7422
7423 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7424 return false;
7425
7426 if (VECTOR_MODE_P (mode) || mode == TImode)
7427 {
7428 /* User-created vectors small enough to fit in EAX. */
7429 if (size < 8)
7430 return false;
7431
7432 /* MMX/3dNow values are returned in MM0,
7433 except when it doesn't exist or the ABI prescribes otherwise. */
7434 if (size == 8)
7435 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7436
7437 /* SSE values are returned in XMM0, except when it doesn't exist. */
7438 if (size == 16)
7439 return !TARGET_SSE;
7440
7441 /* AVX values are returned in YMM0, except when it doesn't exist. */
7442 if (size == 32)
7443 return !TARGET_AVX;
7444 }
7445
7446 if (mode == XFmode)
7447 return false;
7448
7449 if (size > 12)
7450 return true;
7451
7452 /* OImode shouldn't be used directly. */
7453 gcc_assert (mode != OImode);
7454
7455 return false;
7456 }
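
/* Illustration of the 32-bit rules above (hedged, SysV/i386 defaults):
   a 16-byte struct is returned in memory (size > 12), long double
   (XFmode) is returned in st(0), an 8-byte MMX vector is returned in
   MM0 only when MMX is enabled and TARGET_VECT8_RETURNS is not set, and
   a 16-byte SSE vector is returned in XMM0 only when SSE is enabled.  */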
7457
7458 static bool ATTRIBUTE_UNUSED
7459 return_in_memory_64 (const_tree type, enum machine_mode mode)
7460 {
7461 int needed_intregs, needed_sseregs;
7462 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7463 }
7464
7465 static bool ATTRIBUTE_UNUSED
7466 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7467 {
7468 HOST_WIDE_INT size = int_size_in_bytes (type);
7469
7470 /* __m128 is returned in xmm0. */
7471 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7472 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7473 return false;
7474
7475 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7476 return size != 1 && size != 2 && size != 4 && size != 8;
7477 }
7478
7479 static bool
7480 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7481 {
7482 #ifdef SUBTARGET_RETURN_IN_MEMORY
7483 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7484 #else
7485 const enum machine_mode mode = type_natural_mode (type, NULL);
7486
7487 if (TARGET_64BIT)
7488 {
7489 if (ix86_function_type_abi (fntype) == MS_ABI)
7490 return return_in_memory_ms_64 (type, mode);
7491 else
7492 return return_in_memory_64 (type, mode);
7493 }
7494 else
7495 return return_in_memory_32 (type, mode);
7496 #endif
7497 }
7498
7499 /* When returning SSE vector types, we have a choice of either
7500 (1) being abi incompatible with a -march switch, or
7501 (2) generating an error.
7502 Given no good solution, I think the safest thing is one warning.
7503 The user won't be able to use -Werror, but....
7504
7505 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7506 called in response to actually generating a caller or callee that
7507 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7508 via aggregate_value_p for general type probing from tree-ssa. */
7509
7510 static rtx
7511 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7512 {
7513 static bool warnedsse, warnedmmx;
7514
7515 if (!TARGET_64BIT && type)
7516 {
7517 /* Look at the return type of the function, not the function type. */
7518 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7519
7520 if (!TARGET_SSE && !warnedsse)
7521 {
7522 if (mode == TImode
7523 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7524 {
7525 warnedsse = true;
7526 warning (0, "SSE vector return without SSE enabled "
7527 "changes the ABI");
7528 }
7529 }
7530
7531 if (!TARGET_MMX && !warnedmmx)
7532 {
7533 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7534 {
7535 warnedmmx = true;
7536 warning (0, "MMX vector return without MMX enabled "
7537 "changes the ABI");
7538 }
7539 }
7540 }
7541
7542 return NULL;
7543 }
7544
7545 \f
7546 /* Create the va_list data type. */
7547
7548 /* Returns the calling convention specific va_list data type.
7549 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7550
7551 static tree
7552 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7553 {
7554 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7555
7556 /* For i386 we use a plain pointer to the argument area. */
7557 if (!TARGET_64BIT || abi == MS_ABI)
7558 return build_pointer_type (char_type_node);
7559
7560 record = lang_hooks.types.make_type (RECORD_TYPE);
7561 type_decl = build_decl (BUILTINS_LOCATION,
7562 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7563
7564 f_gpr = build_decl (BUILTINS_LOCATION,
7565 FIELD_DECL, get_identifier ("gp_offset"),
7566 unsigned_type_node);
7567 f_fpr = build_decl (BUILTINS_LOCATION,
7568 FIELD_DECL, get_identifier ("fp_offset"),
7569 unsigned_type_node);
7570 f_ovf = build_decl (BUILTINS_LOCATION,
7571 FIELD_DECL, get_identifier ("overflow_arg_area"),
7572 ptr_type_node);
7573 f_sav = build_decl (BUILTINS_LOCATION,
7574 FIELD_DECL, get_identifier ("reg_save_area"),
7575 ptr_type_node);
7576
7577 va_list_gpr_counter_field = f_gpr;
7578 va_list_fpr_counter_field = f_fpr;
7579
7580 DECL_FIELD_CONTEXT (f_gpr) = record;
7581 DECL_FIELD_CONTEXT (f_fpr) = record;
7582 DECL_FIELD_CONTEXT (f_ovf) = record;
7583 DECL_FIELD_CONTEXT (f_sav) = record;
7584
7585 TYPE_STUB_DECL (record) = type_decl;
7586 TYPE_NAME (record) = type_decl;
7587 TYPE_FIELDS (record) = f_gpr;
7588 DECL_CHAIN (f_gpr) = f_fpr;
7589 DECL_CHAIN (f_fpr) = f_ovf;
7590 DECL_CHAIN (f_ovf) = f_sav;
7591
7592 layout_type (record);
7593
7594 /* The correct type is an array type of one element. */
7595 return build_array_type (record, build_index_type (size_zero_node));
7596 }
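
/* The record built above corresponds to the well-known SysV x86-64
   va_list layout; in C terms (shown for illustration):

       typedef struct __va_list_tag {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __builtin_va_list[1];

   The array-of-one-element trick makes va_list decay to a pointer when
   passed to functions such as vprintf.  */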
7597
7598 /* Set up the builtin va_list data type and, for 64-bit, the additional
7599 calling-convention-specific va_list data types. */
7600
7601 static tree
7602 ix86_build_builtin_va_list (void)
7603 {
7604 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7605
7606 /* Initialize abi specific va_list builtin types. */
7607 if (TARGET_64BIT)
7608 {
7609 tree t;
7610 if (ix86_abi == MS_ABI)
7611 {
7612 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7613 if (TREE_CODE (t) != RECORD_TYPE)
7614 t = build_variant_type_copy (t);
7615 sysv_va_list_type_node = t;
7616 }
7617 else
7618 {
7619 t = ret;
7620 if (TREE_CODE (t) != RECORD_TYPE)
7621 t = build_variant_type_copy (t);
7622 sysv_va_list_type_node = t;
7623 }
7624 if (ix86_abi != MS_ABI)
7625 {
7626 t = ix86_build_builtin_va_list_abi (MS_ABI);
7627 if (TREE_CODE (t) != RECORD_TYPE)
7628 t = build_variant_type_copy (t);
7629 ms_va_list_type_node = t;
7630 }
7631 else
7632 {
7633 t = ret;
7634 if (TREE_CODE (t) != RECORD_TYPE)
7635 t = build_variant_type_copy (t);
7636 ms_va_list_type_node = t;
7637 }
7638 }
7639
7640 return ret;
7641 }
7642
7643 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7644
7645 static void
7646 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7647 {
7648 rtx save_area, mem;
7649 alias_set_type set;
7650 int i, max;
7651
7652 /* GPR size of varargs save area. */
7653 if (cfun->va_list_gpr_size)
7654 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7655 else
7656 ix86_varargs_gpr_size = 0;
7657
7658 /* FPR size of varargs save area. We don't need it if we don't pass
7659 anything in SSE registers. */
7660 if (TARGET_SSE && cfun->va_list_fpr_size)
7661 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7662 else
7663 ix86_varargs_fpr_size = 0;
7664
7665 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7666 return;
7667
7668 save_area = frame_pointer_rtx;
7669 set = get_varargs_alias_set ();
7670
7671 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7672 if (max > X86_64_REGPARM_MAX)
7673 max = X86_64_REGPARM_MAX;
7674
7675 for (i = cum->regno; i < max; i++)
7676 {
7677 mem = gen_rtx_MEM (word_mode,
7678 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7679 MEM_NOTRAP_P (mem) = 1;
7680 set_mem_alias_set (mem, set);
7681 emit_move_insn (mem,
7682 gen_rtx_REG (word_mode,
7683 x86_64_int_parameter_registers[i]));
7684 }
7685
7686 if (ix86_varargs_fpr_size)
7687 {
7688 enum machine_mode smode;
7689 rtx label, test;
7690
7691 /* Now emit code to save SSE registers. The AX parameter contains the number
7692 of SSE parameter registers used to call this function, though all we
7693 actually check here is the zero/non-zero status. */
7694
7695 label = gen_label_rtx ();
7696 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7697 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7698 label));
7699
7700 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7701 we used movdqa (i.e. TImode) instead? Perhaps even better would
7702 be if we could determine the real mode of the data, via a hook
7703 into pass_stdarg. Ignore all that for now. */
7704 smode = V4SFmode;
7705 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7706 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7707
7708 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7709 if (max > X86_64_SSE_REGPARM_MAX)
7710 max = X86_64_SSE_REGPARM_MAX;
7711
7712 for (i = cum->sse_regno; i < max; ++i)
7713 {
7714 mem = plus_constant (Pmode, save_area,
7715 i * 16 + ix86_varargs_gpr_size);
7716 mem = gen_rtx_MEM (smode, mem);
7717 MEM_NOTRAP_P (mem) = 1;
7718 set_mem_alias_set (mem, set);
7719 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7720
7721 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7722 }
7723
7724 emit_label (label);
7725 }
7726 }
7727
7728 static void
7729 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7730 {
7731 alias_set_type set = get_varargs_alias_set ();
7732 int i;
7733
7734 /* Reset to zero, as a SysV va_arg might have been used
7735 before. */
7736 ix86_varargs_gpr_size = 0;
7737 ix86_varargs_fpr_size = 0;
7738
7739 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7740 {
7741 rtx reg, mem;
7742
7743 mem = gen_rtx_MEM (Pmode,
7744 plus_constant (Pmode, virtual_incoming_args_rtx,
7745 i * UNITS_PER_WORD));
7746 MEM_NOTRAP_P (mem) = 1;
7747 set_mem_alias_set (mem, set);
7748
7749 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7750 emit_move_insn (mem, reg);
7751 }
7752 }
7753
7754 static void
7755 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7756 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7757 int no_rtl)
7758 {
7759 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7760 CUMULATIVE_ARGS next_cum;
7761 tree fntype;
7762
7763 /* This argument doesn't appear to be used anymore. Which is good,
7764 because the old code here didn't suppress rtl generation. */
7765 gcc_assert (!no_rtl);
7766
7767 if (!TARGET_64BIT)
7768 return;
7769
7770 fntype = TREE_TYPE (current_function_decl);
7771
7772 /* For varargs, we do not want to skip the dummy va_dcl argument.
7773 For stdargs, we do want to skip the last named argument. */
7774 next_cum = *cum;
7775 if (stdarg_p (fntype))
7776 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7777 true);
7778
7779 if (cum->call_abi == MS_ABI)
7780 setup_incoming_varargs_ms_64 (&next_cum);
7781 else
7782 setup_incoming_varargs_64 (&next_cum);
7783 }
7784
7785 /* Checks if TYPE is of kind va_list char *. */
7786
7787 static bool
7788 is_va_list_char_pointer (tree type)
7789 {
7790 tree canonic;
7791
7792 /* For 32-bit it is always true. */
7793 if (!TARGET_64BIT)
7794 return true;
7795 canonic = ix86_canonical_va_list_type (type);
7796 return (canonic == ms_va_list_type_node
7797 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7798 }
7799
7800 /* Implement va_start. */
7801
7802 static void
7803 ix86_va_start (tree valist, rtx nextarg)
7804 {
7805 HOST_WIDE_INT words, n_gpr, n_fpr;
7806 tree f_gpr, f_fpr, f_ovf, f_sav;
7807 tree gpr, fpr, ovf, sav, t;
7808 tree type;
7809 rtx ovf_rtx;
7810
7811 if (flag_split_stack
7812 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7813 {
7814 unsigned int scratch_regno;
7815
7816 /* When we are splitting the stack, we can't refer to the stack
7817 arguments using internal_arg_pointer, because they may be on
7818 the old stack. The split stack prologue will arrange to
7819 leave a pointer to the old stack arguments in a scratch
7820 register, which we here copy to a pseudo-register. The split
7821 stack prologue can't set the pseudo-register directly because
7822 it (the prologue) runs before any registers have been saved. */
7823
7824 scratch_regno = split_stack_prologue_scratch_regno ();
7825 if (scratch_regno != INVALID_REGNUM)
7826 {
7827 rtx reg, seq;
7828
7829 reg = gen_reg_rtx (Pmode);
7830 cfun->machine->split_stack_varargs_pointer = reg;
7831
7832 start_sequence ();
7833 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7834 seq = get_insns ();
7835 end_sequence ();
7836
7837 push_topmost_sequence ();
7838 emit_insn_after (seq, entry_of_function ());
7839 pop_topmost_sequence ();
7840 }
7841 }
7842
7843 /* Only 64bit target needs something special. */
7844 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7845 {
7846 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7847 std_expand_builtin_va_start (valist, nextarg);
7848 else
7849 {
7850 rtx va_r, next;
7851
7852 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7853 next = expand_binop (ptr_mode, add_optab,
7854 cfun->machine->split_stack_varargs_pointer,
7855 crtl->args.arg_offset_rtx,
7856 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7857 convert_move (va_r, next, 0);
7858 }
7859 return;
7860 }
7861
7862 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7863 f_fpr = DECL_CHAIN (f_gpr);
7864 f_ovf = DECL_CHAIN (f_fpr);
7865 f_sav = DECL_CHAIN (f_ovf);
7866
7867 valist = build_simple_mem_ref (valist);
7868 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7869 /* The following should be folded into the MEM_REF offset. */
7870 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7871 f_gpr, NULL_TREE);
7872 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7873 f_fpr, NULL_TREE);
7874 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7875 f_ovf, NULL_TREE);
7876 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7877 f_sav, NULL_TREE);
7878
7879 /* Count number of gp and fp argument registers used. */
7880 words = crtl->args.info.words;
7881 n_gpr = crtl->args.info.regno;
7882 n_fpr = crtl->args.info.sse_regno;
7883
7884 if (cfun->va_list_gpr_size)
7885 {
7886 type = TREE_TYPE (gpr);
7887 t = build2 (MODIFY_EXPR, type,
7888 gpr, build_int_cst (type, n_gpr * 8));
7889 TREE_SIDE_EFFECTS (t) = 1;
7890 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7891 }
7892
7893 if (TARGET_SSE && cfun->va_list_fpr_size)
7894 {
7895 type = TREE_TYPE (fpr);
7896 t = build2 (MODIFY_EXPR, type, fpr,
7897 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7898 TREE_SIDE_EFFECTS (t) = 1;
7899 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7900 }
7901
7902 /* Find the overflow area. */
7903 type = TREE_TYPE (ovf);
7904 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7905 ovf_rtx = crtl->args.internal_arg_pointer;
7906 else
7907 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7908 t = make_tree (type, ovf_rtx);
7909 if (words != 0)
7910 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7911 t = build2 (MODIFY_EXPR, type, ovf, t);
7912 TREE_SIDE_EFFECTS (t) = 1;
7913 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7914
7915 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7916 {
7917 /* Find the register save area.
7918 The function prologue saves it right above the stack frame. */
7919 type = TREE_TYPE (sav);
7920 t = make_tree (type, frame_pointer_rtx);
7921 if (!ix86_varargs_gpr_size)
7922 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7923 t = build2 (MODIFY_EXPR, type, sav, t);
7924 TREE_SIDE_EFFECTS (t) = 1;
7925 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7926 }
7927 }
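
/* Numeric sketch of what the expansion above stores (derived from the
   code, values in bytes).  For "int sum (int n, ...)", one named
   integer argument has been consumed, so va_start sets

       gp_offset         = 1 * 8  = 8
       fp_offset         = 0 * 16 + 8 * X86_64_REGPARM_MAX = 48
       overflow_arg_area = address of the incoming stack arguments
       reg_save_area     = base of the GPR/SSE save area

   and va_arg later compares gp_offset/fp_offset against 48 and 176 to
   decide between the save area and the overflow area.  */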
7928
7929 /* Implement va_arg. */
7930
7931 static tree
7932 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7933 gimple_seq *post_p)
7934 {
7935 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7936 tree f_gpr, f_fpr, f_ovf, f_sav;
7937 tree gpr, fpr, ovf, sav, t;
7938 int size, rsize;
7939 tree lab_false, lab_over = NULL_TREE;
7940 tree addr, t2;
7941 rtx container;
7942 int indirect_p = 0;
7943 tree ptrtype;
7944 enum machine_mode nat_mode;
7945 unsigned int arg_boundary;
7946
7947 /* Only 64bit target needs something special. */
7948 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7949 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7950
7951 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7952 f_fpr = DECL_CHAIN (f_gpr);
7953 f_ovf = DECL_CHAIN (f_fpr);
7954 f_sav = DECL_CHAIN (f_ovf);
7955
7956 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7957 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7958 valist = build_va_arg_indirect_ref (valist);
7959 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7960 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7961 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7962
7963 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7964 if (indirect_p)
7965 type = build_pointer_type (type);
7966 size = int_size_in_bytes (type);
7967 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7968
7969 nat_mode = type_natural_mode (type, NULL);
7970 switch (nat_mode)
7971 {
7972 case V8SFmode:
7973 case V8SImode:
7974 case V32QImode:
7975 case V16HImode:
7976 case V4DFmode:
7977 case V4DImode:
7978 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
7979 if (!TARGET_64BIT_MS_ABI)
7980 {
7981 container = NULL;
7982 break;
7983 }
7984
7985 default:
7986 container = construct_container (nat_mode, TYPE_MODE (type),
7987 type, 0, X86_64_REGPARM_MAX,
7988 X86_64_SSE_REGPARM_MAX, intreg,
7989 0);
7990 break;
7991 }
7992
7993 /* Pull the value out of the saved registers. */
7994
7995 addr = create_tmp_var (ptr_type_node, "addr");
7996
7997 if (container)
7998 {
7999 int needed_intregs, needed_sseregs;
8000 bool need_temp;
8001 tree int_addr, sse_addr;
8002
8003 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8004 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8005
8006 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8007
8008 need_temp = (!REG_P (container)
8009 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8010 || TYPE_ALIGN (type) > 128));
8011
8012 /* In case we are passing a structure, verify that it is a consecutive block
8013 in the register save area. If not, we need to do moves. */
8014 if (!need_temp && !REG_P (container))
8015 {
8016 /* Verify that all registers are strictly consecutive. */
8017 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8018 {
8019 int i;
8020
8021 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8022 {
8023 rtx slot = XVECEXP (container, 0, i);
8024 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8025 || INTVAL (XEXP (slot, 1)) != i * 16)
8026 need_temp = 1;
8027 }
8028 }
8029 else
8030 {
8031 int i;
8032
8033 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8034 {
8035 rtx slot = XVECEXP (container, 0, i);
8036 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8037 || INTVAL (XEXP (slot, 1)) != i * 8)
8038 need_temp = 1;
8039 }
8040 }
8041 }
8042 if (!need_temp)
8043 {
8044 int_addr = addr;
8045 sse_addr = addr;
8046 }
8047 else
8048 {
8049 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8050 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8051 }
8052
8053 /* First ensure that we fit completely in registers. */
8054 if (needed_intregs)
8055 {
8056 t = build_int_cst (TREE_TYPE (gpr),
8057 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8058 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8059 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8060 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8061 gimplify_and_add (t, pre_p);
8062 }
8063 if (needed_sseregs)
8064 {
8065 t = build_int_cst (TREE_TYPE (fpr),
8066 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8067 + X86_64_REGPARM_MAX * 8);
8068 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8069 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8070 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8071 gimplify_and_add (t, pre_p);
8072 }
8073
8074 /* Compute index to start of area used for integer regs. */
8075 if (needed_intregs)
8076 {
8077 /* int_addr = gpr + sav; */
8078 t = fold_build_pointer_plus (sav, gpr);
8079 gimplify_assign (int_addr, t, pre_p);
8080 }
8081 if (needed_sseregs)
8082 {
8083 /* sse_addr = fpr + sav; */
8084 t = fold_build_pointer_plus (sav, fpr);
8085 gimplify_assign (sse_addr, t, pre_p);
8086 }
8087 if (need_temp)
8088 {
8089 int i, prev_size = 0;
8090 tree temp = create_tmp_var (type, "va_arg_tmp");
8091
8092 /* addr = &temp; */
8093 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8094 gimplify_assign (addr, t, pre_p);
8095
8096 for (i = 0; i < XVECLEN (container, 0); i++)
8097 {
8098 rtx slot = XVECEXP (container, 0, i);
8099 rtx reg = XEXP (slot, 0);
8100 enum machine_mode mode = GET_MODE (reg);
8101 tree piece_type;
8102 tree addr_type;
8103 tree daddr_type;
8104 tree src_addr, src;
8105 int src_offset;
8106 tree dest_addr, dest;
8107 int cur_size = GET_MODE_SIZE (mode);
8108
8109 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8110 prev_size = INTVAL (XEXP (slot, 1));
8111 if (prev_size + cur_size > size)
8112 {
8113 cur_size = size - prev_size;
8114 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8115 if (mode == BLKmode)
8116 mode = QImode;
8117 }
8118 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8119 if (mode == GET_MODE (reg))
8120 addr_type = build_pointer_type (piece_type);
8121 else
8122 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8123 true);
8124 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8125 true);
8126
8127 if (SSE_REGNO_P (REGNO (reg)))
8128 {
8129 src_addr = sse_addr;
8130 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8131 }
8132 else
8133 {
8134 src_addr = int_addr;
8135 src_offset = REGNO (reg) * 8;
8136 }
8137 src_addr = fold_convert (addr_type, src_addr);
8138 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8139
8140 dest_addr = fold_convert (daddr_type, addr);
8141 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8142 if (cur_size == GET_MODE_SIZE (mode))
8143 {
8144 src = build_va_arg_indirect_ref (src_addr);
8145 dest = build_va_arg_indirect_ref (dest_addr);
8146
8147 gimplify_assign (dest, src, pre_p);
8148 }
8149 else
8150 {
8151 tree copy
8152 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8153 3, dest_addr, src_addr,
8154 size_int (cur_size));
8155 gimplify_and_add (copy, pre_p);
8156 }
8157 prev_size += cur_size;
8158 }
8159 }
8160
8161 if (needed_intregs)
8162 {
8163 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8164 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8165 gimplify_assign (gpr, t, pre_p);
8166 }
8167
8168 if (needed_sseregs)
8169 {
8170 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8171 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8172 gimplify_assign (fpr, t, pre_p);
8173 }
8174
8175 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8176
8177 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8178 }
8179
8180 /* ... otherwise out of the overflow area. */
8181
8182 /* When the caller aligns a parameter on the stack, a parameter
8183 alignment beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8184 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8185 caller. */
8186 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8187 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8188 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8189
8190 /* Care for on-stack alignment if needed. */
8191 if (arg_boundary <= 64 || size == 0)
8192 t = ovf;
8193 else
8194 {
8195 HOST_WIDE_INT align = arg_boundary / 8;
8196 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8197 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8198 build_int_cst (TREE_TYPE (t), -align));
8199 }
8200
8201 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8202 gimplify_assign (addr, t, pre_p);
8203
8204 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8205 gimplify_assign (unshare_expr (ovf), t, pre_p);
8206
8207 if (container)
8208 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8209
8210 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8211 addr = fold_convert (ptrtype, addr);
8212
8213 if (indirect_p)
8214 addr = build_va_arg_indirect_ref (addr);
8215 return build_va_arg_indirect_ref (addr);
8216 }
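
/* Rough shape of the GIMPLE the function above emits for
   "va_arg (ap, int)" (illustrative pseudo-C, not literal output):

       if (ap->gp_offset >= 48) goto stack;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto done;
     stack:
       addr = ap->overflow_arg_area;
       ap->overflow_arg_area += 8;
     done:
       result = *(int *) addr;

   The temporary-copy path with memcpy is only needed when the value is
   split between register classes or is over-aligned.  */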
8217 \f
8218 /* Return true if OPNUM's MEM should be matched
8219 in movabs* patterns. */
8220
8221 bool
8222 ix86_check_movabs (rtx insn, int opnum)
8223 {
8224 rtx set, mem;
8225
8226 set = PATTERN (insn);
8227 if (GET_CODE (set) == PARALLEL)
8228 set = XVECEXP (set, 0, 0);
8229 gcc_assert (GET_CODE (set) == SET);
8230 mem = XEXP (set, opnum);
8231 while (GET_CODE (mem) == SUBREG)
8232 mem = SUBREG_REG (mem);
8233 gcc_assert (MEM_P (mem));
8234 return volatile_ok || !MEM_VOLATILE_P (mem);
8235 }
8236 \f
8237 /* Initialize the table of extra 80387 mathematical constants. */
8238
8239 static void
8240 init_ext_80387_constants (void)
8241 {
8242 static const char * cst[5] =
8243 {
8244 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8245 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8246 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8247 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8248 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8249 };
8250 int i;
8251
8252 for (i = 0; i < 5; i++)
8253 {
8254 real_from_string (&ext_80387_constants_table[i], cst[i]);
8255 /* Ensure each constant is rounded to XFmode precision. */
8256 real_convert (&ext_80387_constants_table[i],
8257 XFmode, &ext_80387_constants_table[i]);
8258 }
8259
8260 ext_80387_constants_init = 1;
8261 }
8262
8263 /* Return non-zero if the constant is something that
8264 can be loaded with a special instruction. */
8265
8266 int
8267 standard_80387_constant_p (rtx x)
8268 {
8269 enum machine_mode mode = GET_MODE (x);
8270
8271 REAL_VALUE_TYPE r;
8272
8273 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8274 return -1;
8275
8276 if (x == CONST0_RTX (mode))
8277 return 1;
8278 if (x == CONST1_RTX (mode))
8279 return 2;
8280
8281 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8282
8283 /* For XFmode constants, try to find a special 80387 instruction when
8284 optimizing for size or on those CPUs that benefit from them. */
8285 if (mode == XFmode
8286 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8287 {
8288 int i;
8289
8290 if (! ext_80387_constants_init)
8291 init_ext_80387_constants ();
8292
8293 for (i = 0; i < 5; i++)
8294 if (real_identical (&r, &ext_80387_constants_table[i]))
8295 return i + 3;
8296 }
8297
8298 /* Load of the constant -0.0 or -1.0 will be split as
8299 fldz;fchs or fld1;fchs sequence. */
8300 if (real_isnegzero (&r))
8301 return 8;
8302 if (real_identical (&r, &dconstm1))
8303 return 9;
8304
8305 return 0;
8306 }
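
/* Summary of the return codes above (taken from the code): 0 means no
   special instruction, 1 is fldz (+0.0), 2 is fld1 (1.0), 3..7 index
   the table filled in by init_ext_80387_constants (fldlg2, fldln2,
   fldl2e, fldl2t, fldpi), and 8/9 mark -0.0 and -1.0, which are later
   split into fldz;fchs and fld1;fchs.  */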
8307
8308 /* Return the opcode of the special instruction to be used to load
8309 the constant X. */
8310
8311 const char *
8312 standard_80387_constant_opcode (rtx x)
8313 {
8314 switch (standard_80387_constant_p (x))
8315 {
8316 case 1:
8317 return "fldz";
8318 case 2:
8319 return "fld1";
8320 case 3:
8321 return "fldlg2";
8322 case 4:
8323 return "fldln2";
8324 case 5:
8325 return "fldl2e";
8326 case 6:
8327 return "fldl2t";
8328 case 7:
8329 return "fldpi";
8330 case 8:
8331 case 9:
8332 return "#";
8333 default:
8334 gcc_unreachable ();
8335 }
8336 }
8337
8338 /* Return the CONST_DOUBLE representing the 80387 constant that is
8339 loaded by the specified special instruction. The argument IDX
8340 matches the return value from standard_80387_constant_p. */
8341
8342 rtx
8343 standard_80387_constant_rtx (int idx)
8344 {
8345 int i;
8346
8347 if (! ext_80387_constants_init)
8348 init_ext_80387_constants ();
8349
8350 switch (idx)
8351 {
8352 case 3:
8353 case 4:
8354 case 5:
8355 case 6:
8356 case 7:
8357 i = idx - 3;
8358 break;
8359
8360 default:
8361 gcc_unreachable ();
8362 }
8363
8364 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8365 XFmode);
8366 }
8367
8368 /* Return 1 if X is all 0s and 2 if X is all 1s
8369 in a supported SSE/AVX vector mode. */
8370
8371 int
8372 standard_sse_constant_p (rtx x)
8373 {
8374 enum machine_mode mode = GET_MODE (x);
8375
8376 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8377 return 1;
8378 if (vector_all_ones_operand (x, mode))
8379 switch (mode)
8380 {
8381 case V16QImode:
8382 case V8HImode:
8383 case V4SImode:
8384 case V2DImode:
8385 if (TARGET_SSE2)
8386 return 2;
8387 case V32QImode:
8388 case V16HImode:
8389 case V8SImode:
8390 case V4DImode:
8391 if (TARGET_AVX2)
8392 return 2;
8393 default:
8394 break;
8395 }
8396
8397 return 0;
8398 }
8399
8400 /* Return the opcode of the special instruction to be used to load
8401 the constant X. */
8402
8403 const char *
8404 standard_sse_constant_opcode (rtx insn, rtx x)
8405 {
8406 switch (standard_sse_constant_p (x))
8407 {
8408 case 1:
8409 switch (get_attr_mode (insn))
8410 {
8411 case MODE_TI:
8412 return "%vpxor\t%0, %d0";
8413 case MODE_V2DF:
8414 return "%vxorpd\t%0, %d0";
8415 case MODE_V4SF:
8416 return "%vxorps\t%0, %d0";
8417
8418 case MODE_OI:
8419 return "vpxor\t%x0, %x0, %x0";
8420 case MODE_V4DF:
8421 return "vxorpd\t%x0, %x0, %x0";
8422 case MODE_V8SF:
8423 return "vxorps\t%x0, %x0, %x0";
8424
8425 default:
8426 break;
8427 }
8428
8429 case 2:
8430 if (TARGET_AVX)
8431 return "vpcmpeqd\t%0, %0, %0";
8432 else
8433 return "pcmpeqd\t%0, %0";
8434
8435 default:
8436 break;
8437 }
8438 gcc_unreachable ();
8439 }
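
/* Example of the mapping above (from the code): an all-zero V4SFmode
   constant in an SSE register is materialized with "xorps %xmm0, %xmm0"
   (or the VEX form under AVX), and an all-ones V4SImode constant with
   "pcmpeqd %xmm0, %xmm0" / "vpcmpeqd", avoiding a constant-pool load in
   both cases.  */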
8440
8441 /* Returns true if OP contains a symbol reference. */
8442
8443 bool
8444 symbolic_reference_mentioned_p (rtx op)
8445 {
8446 const char *fmt;
8447 int i;
8448
8449 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8450 return true;
8451
8452 fmt = GET_RTX_FORMAT (GET_CODE (op));
8453 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8454 {
8455 if (fmt[i] == 'E')
8456 {
8457 int j;
8458
8459 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8460 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8461 return true;
8462 }
8463
8464 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8465 return true;
8466 }
8467
8468 return false;
8469 }
8470
8471 /* Return true if it is appropriate to emit `ret' instructions in the
8472 body of a function. Do this only if the epilogue is simple, needing a
8473 couple of insns. Prior to reloading, we can't tell how many registers
8474 must be saved, so return false then. Return false if there is no frame
8475 marker to de-allocate. */
8476
8477 bool
8478 ix86_can_use_return_insn_p (void)
8479 {
8480 struct ix86_frame frame;
8481
8482 if (! reload_completed || frame_pointer_needed)
8483 return 0;
8484
8485 /* Don't allow more than 32k pop, since that's all we can do
8486 with one instruction. */
8487 if (crtl->args.pops_args && crtl->args.size >= 32768)
8488 return 0;
8489
8490 ix86_compute_frame_layout (&frame);
8491 return (frame.stack_pointer_offset == UNITS_PER_WORD
8492 && (frame.nregs + frame.nsseregs) == 0);
8493 }
8494 \f
8495 /* Value should be nonzero if functions must have frame pointers.
8496 Zero means the frame pointer need not be set up (and parms may
8497 be accessed via the stack pointer) in functions that seem suitable. */
8498
8499 static bool
8500 ix86_frame_pointer_required (void)
8501 {
8502 /* If we accessed previous frames, then the generated code expects
8503 to be able to access the saved ebp value in our frame. */
8504 if (cfun->machine->accesses_prev_frame)
8505 return true;
8506
8507 /* Several x86 OSes need a frame pointer for other reasons,
8508 usually pertaining to setjmp. */
8509 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8510 return true;
8511
8512 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8513 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8514 return true;
8515
8516 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
8517 stack allocation is 4GB. */
8518 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8519 return true;
8520
8521 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8522 turns off the frame pointer by default. Turn it back on now if
8523 we've not got a leaf function. */
8524 if (TARGET_OMIT_LEAF_FRAME_POINTER
8525 && (!crtl->is_leaf
8526 || ix86_current_function_calls_tls_descriptor))
8527 return true;
8528
8529 if (crtl->profile && !flag_fentry)
8530 return true;
8531
8532 return false;
8533 }
8534
8535 /* Record that the current function accesses previous call frames. */
8536
8537 void
8538 ix86_setup_frame_addresses (void)
8539 {
8540 cfun->machine->accesses_prev_frame = 1;
8541 }
8542 \f
8543 #ifndef USE_HIDDEN_LINKONCE
8544 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8545 # define USE_HIDDEN_LINKONCE 1
8546 # else
8547 # define USE_HIDDEN_LINKONCE 0
8548 # endif
8549 #endif
8550
8551 static int pic_labels_used;
8552
8553 /* Fills in the label name that should be used for a pc thunk for
8554 the given register. */
8555
8556 static void
8557 get_pc_thunk_name (char name[32], unsigned int regno)
8558 {
8559 gcc_assert (!TARGET_64BIT);
8560
8561 if (USE_HIDDEN_LINKONCE)
8562 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8563 else
8564 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8565 }
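/* For illustration: with USE_HIDDEN_LINKONCE and regno == BX_REG this is
   expected to produce the well-known name "__x86.get_pc_thunk.bx"; without
   hidden linkonce support an ordinary internal label (the "LPR" prefix plus
   the register number) is generated instead.  */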
8566
8567
8568 /* This function generates code for -fpic that loads %ebx with
8569 the return address of the caller and then returns. */
8570
8571 static void
8572 ix86_code_end (void)
8573 {
8574 rtx xops[2];
8575 int regno;
8576
8577 for (regno = AX_REG; regno <= SP_REG; regno++)
8578 {
8579 char name[32];
8580 tree decl;
8581
8582 if (!(pic_labels_used & (1 << regno)))
8583 continue;
8584
8585 get_pc_thunk_name (name, regno);
8586
8587 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8588 get_identifier (name),
8589 build_function_type_list (void_type_node, NULL_TREE));
8590 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8591 NULL_TREE, void_type_node);
8592 TREE_PUBLIC (decl) = 1;
8593 TREE_STATIC (decl) = 1;
8594 DECL_IGNORED_P (decl) = 1;
8595
8596 #if TARGET_MACHO
8597 if (TARGET_MACHO)
8598 {
8599 switch_to_section (darwin_sections[text_coal_section]);
8600 fputs ("\t.weak_definition\t", asm_out_file);
8601 assemble_name (asm_out_file, name);
8602 fputs ("\n\t.private_extern\t", asm_out_file);
8603 assemble_name (asm_out_file, name);
8604 putc ('\n', asm_out_file);
8605 ASM_OUTPUT_LABEL (asm_out_file, name);
8606 DECL_WEAK (decl) = 1;
8607 }
8608 else
8609 #endif
8610 if (USE_HIDDEN_LINKONCE)
8611 {
8612 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8613
8614 targetm.asm_out.unique_section (decl, 0);
8615 switch_to_section (get_named_section (decl, NULL, 0));
8616
8617 targetm.asm_out.globalize_label (asm_out_file, name);
8618 fputs ("\t.hidden\t", asm_out_file);
8619 assemble_name (asm_out_file, name);
8620 putc ('\n', asm_out_file);
8621 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8622 }
8623 else
8624 {
8625 switch_to_section (text_section);
8626 ASM_OUTPUT_LABEL (asm_out_file, name);
8627 }
8628
8629 DECL_INITIAL (decl) = make_node (BLOCK);
8630 current_function_decl = decl;
8631 init_function_start (decl);
8632 first_function_block_is_cold = false;
8633 /* Make sure unwind info is emitted for the thunk if needed. */
8634 final_start_function (emit_barrier (), asm_out_file, 1);
8635
8636 /* Pad stack IP move with 4 instructions (two NOPs count
8637 as one instruction). */
8638 if (TARGET_PAD_SHORT_FUNCTION)
8639 {
8640 int i = 8;
8641
8642 while (i--)
8643 fputs ("\tnop\n", asm_out_file);
8644 }
8645
8646 xops[0] = gen_rtx_REG (Pmode, regno);
8647 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8648 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8649 fputs ("\tret\n", asm_out_file);
8650 final_end_function ();
8651 init_insn_lengths ();
8652 free_after_compilation (cfun);
8653 set_cfun (NULL);
8654 current_function_decl = NULL;
8655 }
8656
8657 if (flag_split_stack)
8658 file_end_indicate_split_stack ();
8659 }
8660
8661 /* Emit code for the SET_GOT patterns. */
8662
8663 const char *
8664 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8665 {
8666 rtx xops[3];
8667
8668 xops[0] = dest;
8669
8670 if (TARGET_VXWORKS_RTP && flag_pic)
8671 {
8672 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8673 xops[2] = gen_rtx_MEM (Pmode,
8674 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8675 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8676
8677 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8678 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8679 an unadorned address. */
8680 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8681 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8682 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8683 return "";
8684 }
8685
8686 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8687
8688 if (!flag_pic)
8689 {
8690 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8691
8692 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8693
8694 #if TARGET_MACHO
8695 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8696 is what will be referenced by the Mach-O PIC subsystem. */
8697 if (!label)
8698 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8699 #endif
8700
8701 targetm.asm_out.internal_label (asm_out_file, "L",
8702 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8703 }
8704 else
8705 {
8706 char name[32];
8707 get_pc_thunk_name (name, REGNO (dest));
8708 pic_labels_used |= 1 << REGNO (dest);
8709
8710 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8711 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8712 output_asm_insn ("call\t%X2", xops);
8713 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8714 is what will be referenced by the Mach-O PIC subsystem. */
8715 #if TARGET_MACHO
8716 if (!label)
8717 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8718 else
8719 targetm.asm_out.internal_label (asm_out_file, "L",
8720 CODE_LABEL_NUMBER (label));
8721 #endif
8722 }
8723
8724 if (!TARGET_MACHO)
8725 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8726
8727 return "";
8728 }
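/* For illustration: in the ordinary 32-bit PIC case (no VxWorks RTP, no
   Mach-O) the expected output is roughly

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. a call to the pc thunk selected above followed by an add of
   GOT_SYMBOL_NAME, assuming the destination register is %ebx.  */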
8729
8730 /* Generate a "push" pattern for input ARG. */
8731
8732 static rtx
8733 gen_push (rtx arg)
8734 {
8735 struct machine_function *m = cfun->machine;
8736
8737 if (m->fs.cfa_reg == stack_pointer_rtx)
8738 m->fs.cfa_offset += UNITS_PER_WORD;
8739 m->fs.sp_offset += UNITS_PER_WORD;
8740
8741 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8742 arg = gen_rtx_REG (word_mode, REGNO (arg));
8743
8744 return gen_rtx_SET (VOIDmode,
8745 gen_rtx_MEM (word_mode,
8746 gen_rtx_PRE_DEC (Pmode,
8747 stack_pointer_rtx)),
8748 arg);
8749 }
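/* For illustration: on a 64-bit target a gen_push of %rdi should yield
   roughly (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di)).  Note that
   the cfa_offset/sp_offset bookkeeping above is updated even though no insn
   is emitted here; only the pattern is built, for the caller to emit.  */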
8750
8751 /* Generate a "pop" pattern for input ARG. */
8752
8753 static rtx
8754 gen_pop (rtx arg)
8755 {
8756 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8757 arg = gen_rtx_REG (word_mode, REGNO (arg));
8758
8759 return gen_rtx_SET (VOIDmode,
8760 arg,
8761 gen_rtx_MEM (word_mode,
8762 gen_rtx_POST_INC (Pmode,
8763 stack_pointer_rtx)));
8764 }
8765
8766 /* Return the number of an unused call-clobbered register available for
8767 the entire function, or INVALID_REGNUM if there is none. */
8768
8769 static unsigned int
8770 ix86_select_alt_pic_regnum (void)
8771 {
8772 if (crtl->is_leaf
8773 && !crtl->profile
8774 && !ix86_current_function_calls_tls_descriptor)
8775 {
8776 int i, drap;
8777 /* Can't use the same register for both PIC and DRAP. */
8778 if (crtl->drap_reg)
8779 drap = REGNO (crtl->drap_reg);
8780 else
8781 drap = -1;
8782 for (i = 2; i >= 0; --i)
8783 if (i != drap && !df_regs_ever_live_p (i))
8784 return i;
8785 }
8786
8787 return INVALID_REGNUM;
8788 }
8789
8790 /* Return TRUE if we need to save REGNO. */
8791
8792 static bool
8793 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8794 {
8795 if (pic_offset_table_rtx
8796 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8797 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8798 || crtl->profile
8799 || crtl->calls_eh_return
8800 || crtl->uses_const_pool))
8801 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8802
8803 if (crtl->calls_eh_return && maybe_eh_return)
8804 {
8805 unsigned i;
8806 for (i = 0; ; i++)
8807 {
8808 unsigned test = EH_RETURN_DATA_REGNO (i);
8809 if (test == INVALID_REGNUM)
8810 break;
8811 if (test == regno)
8812 return true;
8813 }
8814 }
8815
8816 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8817 return true;
8818
8819 return (df_regs_ever_live_p (regno)
8820 && !call_used_regs[regno]
8821 && !fixed_regs[regno]
8822 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8823 }
8824
8825 /* Return number of saved general purpose registers. */
8826
8827 static int
8828 ix86_nsaved_regs (void)
8829 {
8830 int nregs = 0;
8831 int regno;
8832
8833 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8834 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8835 nregs ++;
8836 return nregs;
8837 }
8838
8839 /* Return number of saved SSE registers. */
8840
8841 static int
8842 ix86_nsaved_sseregs (void)
8843 {
8844 int nregs = 0;
8845 int regno;
8846
8847 if (!TARGET_64BIT_MS_ABI)
8848 return 0;
8849 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8850 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8851 nregs ++;
8852 return nregs;
8853 }
8854
8855 /* Given FROM and TO register numbers, say whether this elimination is
8856 allowed. If stack alignment is needed, we can only replace argument
8857 pointer with hard frame pointer, or replace frame pointer with stack
8858 pointer. Otherwise, frame pointer elimination is automatically
8859 handled and all other eliminations are valid. */
8860
8861 static bool
8862 ix86_can_eliminate (const int from, const int to)
8863 {
8864 if (stack_realign_fp)
8865 return ((from == ARG_POINTER_REGNUM
8866 && to == HARD_FRAME_POINTER_REGNUM)
8867 || (from == FRAME_POINTER_REGNUM
8868 && to == STACK_POINTER_REGNUM));
8869 else
8870 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8871 }
8872
8873 /* Return the offset between two registers, one to be eliminated, and the other
8874 its replacement, at the start of a routine. */
8875
8876 HOST_WIDE_INT
8877 ix86_initial_elimination_offset (int from, int to)
8878 {
8879 struct ix86_frame frame;
8880 ix86_compute_frame_layout (&frame);
8881
8882 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8883 return frame.hard_frame_pointer_offset;
8884 else if (from == FRAME_POINTER_REGNUM
8885 && to == HARD_FRAME_POINTER_REGNUM)
8886 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8887 else
8888 {
8889 gcc_assert (to == STACK_POINTER_REGNUM);
8890
8891 if (from == ARG_POINTER_REGNUM)
8892 return frame.stack_pointer_offset;
8893
8894 gcc_assert (from == FRAME_POINTER_REGNUM);
8895 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8896 }
8897 }
8898
8899 /* In a dynamically-aligned function, we can't know the offset from
8900 stack pointer to frame pointer, so we must ensure that setjmp
8901 eliminates fp against the hard fp (%ebp) rather than trying to
8902 index from %esp up to the top of the frame across a gap that is
8903 of unknown (at compile-time) size. */
8904 static rtx
8905 ix86_builtin_setjmp_frame_value (void)
8906 {
8907 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8908 }
8909
8910 /* When using -fsplit-stack, the allocation routines set a field in
8911 the TCB to the bottom of the stack plus this much space, measured
8912 in bytes. */
8913
8914 #define SPLIT_STACK_AVAILABLE 256
8915
8916 /* Fill structure ix86_frame about frame of currently computed function. */
8917
8918 static void
8919 ix86_compute_frame_layout (struct ix86_frame *frame)
8920 {
8921 unsigned HOST_WIDE_INT stack_alignment_needed;
8922 HOST_WIDE_INT offset;
8923 unsigned HOST_WIDE_INT preferred_alignment;
8924 HOST_WIDE_INT size = get_frame_size ();
8925 HOST_WIDE_INT to_allocate;
8926
8927 frame->nregs = ix86_nsaved_regs ();
8928 frame->nsseregs = ix86_nsaved_sseregs ();
8929
8930 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8931 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8932
8933 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
8934 except in function prologues and leaf functions. */
8935 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8936 && (!crtl->is_leaf || cfun->calls_alloca != 0
8937 || ix86_current_function_calls_tls_descriptor))
8938 {
8939 preferred_alignment = 16;
8940 stack_alignment_needed = 16;
8941 crtl->preferred_stack_boundary = 128;
8942 crtl->stack_alignment_needed = 128;
8943 }
8944
8945 gcc_assert (!size || stack_alignment_needed);
8946 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8947 gcc_assert (preferred_alignment <= stack_alignment_needed);
8948
8949 /* For SEH we have to limit the amount of code movement into the prologue.
8950 At present we do this via a BLOCKAGE, at which point there's very little
8951 scheduling that can be done, which means that there's very little point
8952 in doing anything except PUSHs. */
8953 if (TARGET_SEH)
8954 cfun->machine->use_fast_prologue_epilogue = false;
8955
8956 /* During reload iterations the number of registers saved can change.
8957 Recompute the value as needed. Do not recompute when the number of registers
8958 didn't change, as reload calls this function multiple times and does not
8959 expect the decision to change within a single iteration. */
8960 else if (!optimize_function_for_size_p (cfun)
8961 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8962 {
8963 int count = frame->nregs;
8964 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8965
8966 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8967
8968 /* The fast prologue uses move instead of push to save registers. This
8969 is significantly longer, but also executes faster as modern hardware
8970 can execute the moves in parallel, but can't do that for push/pop.
8971
8972 Be careful about choosing which prologue to emit: when the function takes
8973 many instructions to execute, we may use the slow version, as well as when
8974 the function is known to be outside a hot spot (this is known only with
8975 profile feedback). Weight the size of the function by the number of
8976 registers to save, as it is cheap to use one or two push instructions but
8977 very slow to use many of them. */
8978 if (count)
8979 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8980 if (node->frequency < NODE_FREQUENCY_NORMAL
8981 || (flag_branch_probabilities
8982 && node->frequency < NODE_FREQUENCY_HOT))
8983 cfun->machine->use_fast_prologue_epilogue = false;
8984 else
8985 cfun->machine->use_fast_prologue_epilogue
8986 = !expensive_function_p (count);
8987 }
8988
8989 frame->save_regs_using_mov
8990 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8991 /* If static stack checking is enabled and done with probes,
8992 the registers need to be saved before allocating the frame. */
8993 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8994
8995 /* Skip return address. */
8996 offset = UNITS_PER_WORD;
8997
8998 /* Skip pushed static chain. */
8999 if (ix86_static_chain_on_stack)
9000 offset += UNITS_PER_WORD;
9001
9002 /* Skip saved base pointer. */
9003 if (frame_pointer_needed)
9004 offset += UNITS_PER_WORD;
9005 frame->hfp_save_offset = offset;
9006
9007 /* The traditional frame pointer location is at the top of the frame. */
9008 frame->hard_frame_pointer_offset = offset;
9009
9010 /* Register save area */
9011 offset += frame->nregs * UNITS_PER_WORD;
9012 frame->reg_save_offset = offset;
9013
9014 /* On SEH target, registers are pushed just before the frame pointer
9015 location. */
9016 if (TARGET_SEH)
9017 frame->hard_frame_pointer_offset = offset;
9018
9019 /* Align and set SSE register save area. */
9020 if (frame->nsseregs)
9021 {
9022 /* The only ABI that has saved SSE registers (Win64) also has a
9023 16-byte aligned default stack, and thus we don't need to be
9024 within the re-aligned local stack frame to save them. */
9025 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9026 offset = (offset + 16 - 1) & -16;
9027 offset += frame->nsseregs * 16;
9028 }
9029 frame->sse_reg_save_offset = offset;
9030
9031 /* The re-aligned stack starts here. Values before this point are not
9032 directly comparable with values below this point. In order to make
9033 sure that no value happens to be the same before and after, force
9034 the alignment computation below to add a non-zero value. */
9035 if (stack_realign_fp)
9036 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9037
9038 /* Va-arg area */
9039 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9040 offset += frame->va_arg_size;
9041
9042 /* Align start of frame for local function. */
9043 if (stack_realign_fp
9044 || offset != frame->sse_reg_save_offset
9045 || size != 0
9046 || !crtl->is_leaf
9047 || cfun->calls_alloca
9048 || ix86_current_function_calls_tls_descriptor)
9049 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9050
9051 /* Frame pointer points here. */
9052 frame->frame_pointer_offset = offset;
9053
9054 offset += size;
9055
9056 /* Add outgoing arguments area. Can be skipped if we eliminated
9057 all the function calls as dead code.
9058 Skipping is however impossible when the function calls alloca, since the
9059 alloca expander assumes that the last crtl->outgoing_args_size bytes
9060 of the stack frame are unused. */
9061 if (ACCUMULATE_OUTGOING_ARGS
9062 && (!crtl->is_leaf || cfun->calls_alloca
9063 || ix86_current_function_calls_tls_descriptor))
9064 {
9065 offset += crtl->outgoing_args_size;
9066 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9067 }
9068 else
9069 frame->outgoing_arguments_size = 0;
9070
9071 /* Align stack boundary. Only needed if we're calling another function
9072 or using alloca. */
9073 if (!crtl->is_leaf || cfun->calls_alloca
9074 || ix86_current_function_calls_tls_descriptor)
9075 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9076
9077 /* We've reached end of stack frame. */
9078 frame->stack_pointer_offset = offset;
9079
9080 /* Size prologue needs to allocate. */
9081 to_allocate = offset - frame->sse_reg_save_offset;
9082
9083 if ((!to_allocate && frame->nregs <= 1)
9084 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9085 frame->save_regs_using_mov = false;
9086
9087 if (ix86_using_red_zone ()
9088 && crtl->sp_is_unchanging
9089 && crtl->is_leaf
9090 && !ix86_current_function_calls_tls_descriptor)
9091 {
9092 frame->red_zone_size = to_allocate;
9093 if (frame->save_regs_using_mov)
9094 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9095 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9096 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9097 }
9098 else
9099 frame->red_zone_size = 0;
9100 frame->stack_pointer_offset -= frame->red_zone_size;
9101
9102 /* The SEH frame pointer location is near the bottom of the frame.
9103 This is enforced by the fact that the difference between the
9104 stack pointer and the frame pointer is limited to 240 bytes in
9105 the unwind data structure. */
9106 if (TARGET_SEH)
9107 {
9108 HOST_WIDE_INT diff;
9109
9110 /* If we can leave the frame pointer where it is, do so. Also, return
9111 the establisher frame for __builtin_frame_address (0). */
9112 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9113 if (diff <= SEH_MAX_FRAME_SIZE
9114 && (diff > 240 || (diff & 15) != 0)
9115 && !crtl->accesses_prior_frames)
9116 {
9117 /* Ideally we'd determine what portion of the local stack frame
9118 (within the constraint of the lowest 240) is most heavily used.
9119 But without that complication, simply bias the frame pointer
9120 by 128 bytes so as to maximize the amount of the local stack
9121 frame that is addressable with 8-bit offsets. */
9122 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9123 }
9124 }
9125 }
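/* To summarize the layout computed above, going from the CFA towards the
   stack pointer: return address, optional pushed static chain, saved frame
   pointer, general-register save area, (aligned) SSE register save area,
   va_arg save area, aligned local variables, then the outgoing argument
   area; frame_pointer_offset and stack_pointer_offset record the last two
   boundaries, with the red zone subtracted from the final offset.  */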
9126
9127 /* This is semi-inlined memory_address_length, but simplified
9128 since we know that we're always dealing with reg+offset, and
9129 to avoid having to create and discard all that rtl. */
9130
9131 static inline int
9132 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9133 {
9134 int len = 4;
9135
9136 if (offset == 0)
9137 {
9138 /* EBP and R13 cannot be encoded without an offset. */
9139 len = (regno == BP_REG || regno == R13_REG);
9140 }
9141 else if (IN_RANGE (offset, -128, 127))
9142 len = 1;
9143
9144 /* ESP and R12 must be encoded with a SIB byte. */
9145 if (regno == SP_REG || regno == R12_REG)
9146 len++;
9147
9148 return len;
9149 }
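/* For illustration: choose_baseaddr_len (AX_REG, 0) is 0 (no displacement
   needed), choose_baseaddr_len (BP_REG, 0) is 1 (EBP always needs a
   displacement byte), choose_baseaddr_len (SP_REG, 8) is 2 (disp8 plus the
   mandatory SIB byte), and any offset outside [-128, 127] costs 4 bytes.  */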
9150
9151 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9152 The valid base registers are taken from CFUN->MACHINE->FS. */
9153
9154 static rtx
9155 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9156 {
9157 const struct machine_function *m = cfun->machine;
9158 rtx base_reg = NULL;
9159 HOST_WIDE_INT base_offset = 0;
9160
9161 if (m->use_fast_prologue_epilogue)
9162 {
9163 /* Choose the base register most likely to allow the most scheduling
9164 opportunities. Generally FP is valid throughout the function,
9165 while DRAP must be reloaded within the epilogue. But choose either
9166 over the SP due to increased encoding size. */
9167
9168 if (m->fs.fp_valid)
9169 {
9170 base_reg = hard_frame_pointer_rtx;
9171 base_offset = m->fs.fp_offset - cfa_offset;
9172 }
9173 else if (m->fs.drap_valid)
9174 {
9175 base_reg = crtl->drap_reg;
9176 base_offset = 0 - cfa_offset;
9177 }
9178 else if (m->fs.sp_valid)
9179 {
9180 base_reg = stack_pointer_rtx;
9181 base_offset = m->fs.sp_offset - cfa_offset;
9182 }
9183 }
9184 else
9185 {
9186 HOST_WIDE_INT toffset;
9187 int len = 16, tlen;
9188
9189 /* Choose the base register with the smallest address encoding.
9190 With a tie, choose FP > DRAP > SP. */
9191 if (m->fs.sp_valid)
9192 {
9193 base_reg = stack_pointer_rtx;
9194 base_offset = m->fs.sp_offset - cfa_offset;
9195 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9196 }
9197 if (m->fs.drap_valid)
9198 {
9199 toffset = 0 - cfa_offset;
9200 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9201 if (tlen <= len)
9202 {
9203 base_reg = crtl->drap_reg;
9204 base_offset = toffset;
9205 len = tlen;
9206 }
9207 }
9208 if (m->fs.fp_valid)
9209 {
9210 toffset = m->fs.fp_offset - cfa_offset;
9211 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9212 if (tlen <= len)
9213 {
9214 base_reg = hard_frame_pointer_rtx;
9215 base_offset = toffset;
9216 len = tlen;
9217 }
9218 }
9219 }
9220 gcc_assert (base_reg != NULL);
9221
9222 return plus_constant (Pmode, base_reg, base_offset);
9223 }
9224
9225 /* Emit code to save registers in the prologue. */
9226
9227 static void
9228 ix86_emit_save_regs (void)
9229 {
9230 unsigned int regno;
9231 rtx insn;
9232
9233 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9234 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9235 {
9236 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9237 RTX_FRAME_RELATED_P (insn) = 1;
9238 }
9239 }
9240
9241 /* Emit a single register save at CFA - CFA_OFFSET. */
9242
9243 static void
9244 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9245 HOST_WIDE_INT cfa_offset)
9246 {
9247 struct machine_function *m = cfun->machine;
9248 rtx reg = gen_rtx_REG (mode, regno);
9249 rtx mem, addr, base, insn;
9250
9251 addr = choose_baseaddr (cfa_offset);
9252 mem = gen_frame_mem (mode, addr);
9253
9254 /* For SSE saves, we need to indicate the 128-bit alignment. */
9255 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9256
9257 insn = emit_move_insn (mem, reg);
9258 RTX_FRAME_RELATED_P (insn) = 1;
9259
9260 base = addr;
9261 if (GET_CODE (base) == PLUS)
9262 base = XEXP (base, 0);
9263 gcc_checking_assert (REG_P (base));
9264
9265 /* When saving registers into a re-aligned local stack frame, avoid
9266 any tricky guessing by dwarf2out. */
9267 if (m->fs.realigned)
9268 {
9269 gcc_checking_assert (stack_realign_drap);
9270
9271 if (regno == REGNO (crtl->drap_reg))
9272 {
9273 /* A bit of a hack. We force the DRAP register to be saved in
9274 the re-aligned stack frame, which provides us with a copy
9275 of the CFA that will last past the prologue. Install it. */
9276 gcc_checking_assert (cfun->machine->fs.fp_valid);
9277 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9278 cfun->machine->fs.fp_offset - cfa_offset);
9279 mem = gen_rtx_MEM (mode, addr);
9280 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9281 }
9282 else
9283 {
9284 /* The frame pointer is a stable reference within the
9285 aligned frame. Use it. */
9286 gcc_checking_assert (cfun->machine->fs.fp_valid);
9287 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9288 cfun->machine->fs.fp_offset - cfa_offset);
9289 mem = gen_rtx_MEM (mode, addr);
9290 add_reg_note (insn, REG_CFA_EXPRESSION,
9291 gen_rtx_SET (VOIDmode, mem, reg));
9292 }
9293 }
9294
9295 /* The memory may not be relative to the current CFA register,
9296 which means that we may need to generate a new pattern for
9297 use by the unwind info. */
9298 else if (base != m->fs.cfa_reg)
9299 {
9300 addr = plus_constant (Pmode, m->fs.cfa_reg,
9301 m->fs.cfa_offset - cfa_offset);
9302 mem = gen_rtx_MEM (mode, addr);
9303 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9304 }
9305 }
9306
9307 /* Emit code to save registers using MOV insns.
9308 First register is stored at CFA - CFA_OFFSET. */
9309 static void
9310 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9311 {
9312 unsigned int regno;
9313
9314 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9315 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9316 {
9317 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9318 cfa_offset -= UNITS_PER_WORD;
9319 }
9320 }
9321
9322 /* Emit code to save SSE registers using MOV insns.
9323 First register is stored at CFA - CFA_OFFSET. */
9324 static void
9325 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9326 {
9327 unsigned int regno;
9328
9329 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9330 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9331 {
9332 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9333 cfa_offset -= 16;
9334 }
9335 }
9336
9337 static GTY(()) rtx queued_cfa_restores;
9338
9339 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9340 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9341 Don't add the note if the previously saved value will be left untouched
9342 within stack red-zone till return, as unwinders can find the same value
9343 in the register and on the stack. */
9344
9345 static void
9346 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9347 {
9348 if (!crtl->shrink_wrapped
9349 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9350 return;
9351
9352 if (insn)
9353 {
9354 add_reg_note (insn, REG_CFA_RESTORE, reg);
9355 RTX_FRAME_RELATED_P (insn) = 1;
9356 }
9357 else
9358 queued_cfa_restores
9359 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9360 }
9361
9362 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9363
9364 static void
9365 ix86_add_queued_cfa_restore_notes (rtx insn)
9366 {
9367 rtx last;
9368 if (!queued_cfa_restores)
9369 return;
9370 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9371 ;
9372 XEXP (last, 1) = REG_NOTES (insn);
9373 REG_NOTES (insn) = queued_cfa_restores;
9374 queued_cfa_restores = NULL_RTX;
9375 RTX_FRAME_RELATED_P (insn) = 1;
9376 }
9377
9378 /* Expand prologue or epilogue stack adjustment.
9379 The pattern exists to put a dependency on all ebp-based memory accesses.
9380 STYLE should be negative if instructions should be marked as frame related,
9381 zero if %r11 register is live and cannot be freely used and positive
9382 otherwise. */
9383
9384 static void
9385 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9386 int style, bool set_cfa)
9387 {
9388 struct machine_function *m = cfun->machine;
9389 rtx insn;
9390 bool add_frame_related_expr = false;
9391
9392 if (Pmode == SImode)
9393 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9394 else if (x86_64_immediate_operand (offset, DImode))
9395 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9396 else
9397 {
9398 rtx tmp;
9399 /* r11 is used by indirect sibcall return as well, set before the
9400 epilogue and used after the epilogue. */
9401 if (style)
9402 tmp = gen_rtx_REG (DImode, R11_REG);
9403 else
9404 {
9405 gcc_assert (src != hard_frame_pointer_rtx
9406 && dest != hard_frame_pointer_rtx);
9407 tmp = hard_frame_pointer_rtx;
9408 }
9409 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9410 if (style < 0)
9411 add_frame_related_expr = true;
9412
9413 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9414 }
9415
9416 insn = emit_insn (insn);
9417 if (style >= 0)
9418 ix86_add_queued_cfa_restore_notes (insn);
9419
9420 if (set_cfa)
9421 {
9422 rtx r;
9423
9424 gcc_assert (m->fs.cfa_reg == src);
9425 m->fs.cfa_offset += INTVAL (offset);
9426 m->fs.cfa_reg = dest;
9427
9428 r = gen_rtx_PLUS (Pmode, src, offset);
9429 r = gen_rtx_SET (VOIDmode, dest, r);
9430 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9431 RTX_FRAME_RELATED_P (insn) = 1;
9432 }
9433 else if (style < 0)
9434 {
9435 RTX_FRAME_RELATED_P (insn) = 1;
9436 if (add_frame_related_expr)
9437 {
9438 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9439 r = gen_rtx_SET (VOIDmode, dest, r);
9440 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9441 }
9442 }
9443
9444 if (dest == stack_pointer_rtx)
9445 {
9446 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9447 bool valid = m->fs.sp_valid;
9448
9449 if (src == hard_frame_pointer_rtx)
9450 {
9451 valid = m->fs.fp_valid;
9452 ooffset = m->fs.fp_offset;
9453 }
9454 else if (src == crtl->drap_reg)
9455 {
9456 valid = m->fs.drap_valid;
9457 ooffset = 0;
9458 }
9459 else
9460 {
9461 /* Else there are two possibilities: SP itself, which we set
9462 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9463 taken care of by hand along the eh_return path. */
9464 gcc_checking_assert (src == stack_pointer_rtx
9465 || offset == const0_rtx);
9466 }
9467
9468 m->fs.sp_offset = ooffset - INTVAL (offset);
9469 m->fs.sp_valid = valid;
9470 }
9471 }
9472
9473 /* Find an available register to be used as dynamic realign argument
9474 pointer register. Such a register will be written in the prologue and
9475 used at the beginning of the body, so it must not be
9476 1. parameter passing register.
9477 2. GOT pointer.
9478 We reuse static-chain register if it is available. Otherwise, we
9479 use DI for i386 and R13 for x86-64. We chose R13 since it has
9480 shorter encoding.
9481
9482 Return: the regno of chosen register. */
9483
9484 static unsigned int
9485 find_drap_reg (void)
9486 {
9487 tree decl = cfun->decl;
9488
9489 if (TARGET_64BIT)
9490 {
9491 /* Use R13 for a nested function or a function that needs a static chain.
9492 Since a function with a tail call may use any caller-saved
9493 register in the epilogue, DRAP must not use a caller-saved
9494 register in that case. */
9495 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9496 return R13_REG;
9497
9498 return R10_REG;
9499 }
9500 else
9501 {
9502 /* Use DI for a nested function or a function that needs a static chain.
9503 Since a function with a tail call may use any caller-saved
9504 register in the epilogue, DRAP must not use a caller-saved
9505 register in that case. */
9506 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9507 return DI_REG;
9508
9509 /* Reuse static chain register if it isn't used for parameter
9510 passing. */
9511 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9512 {
9513 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9514 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9515 return CX_REG;
9516 }
9517 return DI_REG;
9518 }
9519 }
9520
9521 /* Return minimum incoming stack alignment. */
9522
9523 static unsigned int
9524 ix86_minimum_incoming_stack_boundary (bool sibcall)
9525 {
9526 unsigned int incoming_stack_boundary;
9527
9528 /* Prefer the one specified at command line. */
9529 if (ix86_user_incoming_stack_boundary)
9530 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9531 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9532 if -mstackrealign is used, this isn't a sibcall check, and the
9533 estimated stack alignment is 128 bits. */
9534 else if (!sibcall
9535 && !TARGET_64BIT
9536 && ix86_force_align_arg_pointer
9537 && crtl->stack_alignment_estimated == 128)
9538 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9539 else
9540 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9541
9542 /* Incoming stack alignment can be changed on individual functions
9543 via force_align_arg_pointer attribute. We use the smallest
9544 incoming stack boundary. */
9545 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9546 && lookup_attribute (ix86_force_align_arg_pointer_string,
9547 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9548 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9549
9550 /* The incoming stack frame has to be aligned at least at
9551 parm_stack_boundary. */
9552 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9553 incoming_stack_boundary = crtl->parm_stack_boundary;
9554
9555 /* Stack at entrance of main is aligned by runtime. We use the
9556 smallest incoming stack boundary. */
9557 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9558 && DECL_NAME (current_function_decl)
9559 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9560 && DECL_FILE_SCOPE_P (current_function_decl))
9561 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9562
9563 return incoming_stack_boundary;
9564 }
9565
9566 /* Update incoming stack boundary and estimated stack alignment. */
9567
9568 static void
9569 ix86_update_stack_boundary (void)
9570 {
9571 ix86_incoming_stack_boundary
9572 = ix86_minimum_incoming_stack_boundary (false);
9573
9574 /* x86_64 vararg needs 16byte stack alignment for register save
9575 area. */
9576 if (TARGET_64BIT
9577 && cfun->stdarg
9578 && crtl->stack_alignment_estimated < 128)
9579 crtl->stack_alignment_estimated = 128;
9580 }
9581
9582 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9583 needed or an rtx for DRAP otherwise. */
9584
9585 static rtx
9586 ix86_get_drap_rtx (void)
9587 {
9588 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9589 crtl->need_drap = true;
9590
9591 if (stack_realign_drap)
9592 {
9593 /* Assign DRAP to vDRAP and return vDRAP. */
9594 unsigned int regno = find_drap_reg ();
9595 rtx drap_vreg;
9596 rtx arg_ptr;
9597 rtx seq, insn;
9598
9599 arg_ptr = gen_rtx_REG (Pmode, regno);
9600 crtl->drap_reg = arg_ptr;
9601
9602 start_sequence ();
9603 drap_vreg = copy_to_reg (arg_ptr);
9604 seq = get_insns ();
9605 end_sequence ();
9606
9607 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9608 if (!optimize)
9609 {
9610 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9611 RTX_FRAME_RELATED_P (insn) = 1;
9612 }
9613 return drap_vreg;
9614 }
9615 else
9616 return NULL;
9617 }
9618
9619 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9620
9621 static rtx
9622 ix86_internal_arg_pointer (void)
9623 {
9624 return virtual_incoming_args_rtx;
9625 }
9626
9627 struct scratch_reg {
9628 rtx reg;
9629 bool saved;
9630 };
9631
9632 /* Return a short-lived scratch register for use on function entry.
9633 In 32-bit mode, it is valid only after the registers are saved
9634 in the prologue. This register must be released by means of
9635 release_scratch_register_on_entry once it is dead. */
9636
9637 static void
9638 get_scratch_register_on_entry (struct scratch_reg *sr)
9639 {
9640 int regno;
9641
9642 sr->saved = false;
9643
9644 if (TARGET_64BIT)
9645 {
9646 /* We always use R11 in 64-bit mode. */
9647 regno = R11_REG;
9648 }
9649 else
9650 {
9651 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9652 bool fastcall_p
9653 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9654 bool thiscall_p
9655 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9656 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9657 int regparm = ix86_function_regparm (fntype, decl);
9658 int drap_regno
9659 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9660
9661 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9662 for the static chain register. */
9663 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9664 && drap_regno != AX_REG)
9665 regno = AX_REG;
9666 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9667 for the static chain register. */
9668 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9669 regno = AX_REG;
9670 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9671 regno = DX_REG;
9672 /* ecx is the static chain register. */
9673 else if (regparm < 3 && !fastcall_p && !thiscall_p
9674 && !static_chain_p
9675 && drap_regno != CX_REG)
9676 regno = CX_REG;
9677 else if (ix86_save_reg (BX_REG, true))
9678 regno = BX_REG;
9679 /* esi is the static chain register. */
9680 else if (!(regparm == 3 && static_chain_p)
9681 && ix86_save_reg (SI_REG, true))
9682 regno = SI_REG;
9683 else if (ix86_save_reg (DI_REG, true))
9684 regno = DI_REG;
9685 else
9686 {
9687 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9688 sr->saved = true;
9689 }
9690 }
9691
9692 sr->reg = gen_rtx_REG (Pmode, regno);
9693 if (sr->saved)
9694 {
9695 rtx insn = emit_insn (gen_push (sr->reg));
9696 RTX_FRAME_RELATED_P (insn) = 1;
9697 }
9698 }
9699
9700 /* Release a scratch register obtained from the preceding function. */
9701
9702 static void
9703 release_scratch_register_on_entry (struct scratch_reg *sr)
9704 {
9705 if (sr->saved)
9706 {
9707 struct machine_function *m = cfun->machine;
9708 rtx x, insn = emit_insn (gen_pop (sr->reg));
9709
9710 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9711 RTX_FRAME_RELATED_P (insn) = 1;
9712 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9713 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9714 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9715 m->fs.sp_offset -= UNITS_PER_WORD;
9716 }
9717 }
9718
9719 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
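/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12 this makes
   PROBE_INTERVAL 4096 bytes, i.e. one probe per page, though a target can
   override the exponent.  */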
9720
9721 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9722
9723 static void
9724 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9725 {
9726 /* We skip the probe for the first interval + a small dope of 4 words and
9727 probe that many bytes past the specified size to maintain a protection
9728 area at the bottom of the stack. */
9729 const int dope = 4 * UNITS_PER_WORD;
9730 rtx size_rtx = GEN_INT (size), last;
9731
9732 /* See if we have a constant small number of probes to generate. If so,
9733 that's the easy case. The run-time loop is made up of 11 insns in the
9734 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9735 for n # of intervals. */
9736 if (size <= 5 * PROBE_INTERVAL)
9737 {
9738 HOST_WIDE_INT i, adjust;
9739 bool first_probe = true;
9740
9741 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9742 values of N from 1 until it exceeds SIZE. If only one probe is
9743 needed, this will not generate any code. Then adjust and probe
9744 to PROBE_INTERVAL + SIZE. */
9745 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9746 {
9747 if (first_probe)
9748 {
9749 adjust = 2 * PROBE_INTERVAL + dope;
9750 first_probe = false;
9751 }
9752 else
9753 adjust = PROBE_INTERVAL;
9754
9755 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9756 plus_constant (Pmode, stack_pointer_rtx,
9757 -adjust)));
9758 emit_stack_probe (stack_pointer_rtx);
9759 }
9760
9761 if (first_probe)
9762 adjust = size + PROBE_INTERVAL + dope;
9763 else
9764 adjust = size + PROBE_INTERVAL - i;
9765
9766 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9767 plus_constant (Pmode, stack_pointer_rtx,
9768 -adjust)));
9769 emit_stack_probe (stack_pointer_rtx);
9770
9771 /* Adjust back to account for the additional first interval. */
9772 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9773 plus_constant (Pmode, stack_pointer_rtx,
9774 PROBE_INTERVAL + dope)));
9775 }
9776
9777 /* Otherwise, do the same as above, but in a loop. Note that we must be
9778 extra careful with variables wrapping around because we might be at
9779 the very top (or the very bottom) of the address space and we have
9780 to be able to handle this case properly; in particular, we use an
9781 equality test for the loop condition. */
9782 else
9783 {
9784 HOST_WIDE_INT rounded_size;
9785 struct scratch_reg sr;
9786
9787 get_scratch_register_on_entry (&sr);
9788
9789
9790 /* Step 1: round SIZE to the previous multiple of the interval. */
9791
9792 rounded_size = size & -PROBE_INTERVAL;
9793
9794
9795 /* Step 2: compute initial and final value of the loop counter. */
9796
9797 /* SP = SP_0 + PROBE_INTERVAL. */
9798 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9799 plus_constant (Pmode, stack_pointer_rtx,
9800 - (PROBE_INTERVAL + dope))));
9801
9802 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9803 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9804 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9805 gen_rtx_PLUS (Pmode, sr.reg,
9806 stack_pointer_rtx)));
9807
9808
9809 /* Step 3: the loop
9810
9811 while (SP != LAST_ADDR)
9812 {
9813 SP = SP + PROBE_INTERVAL
9814 probe at SP
9815 }
9816
9817 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9818 values of N from 1 until it is equal to ROUNDED_SIZE. */
9819
9820 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9821
9822
9823 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9824 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9825
9826 if (size != rounded_size)
9827 {
9828 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9829 plus_constant (Pmode, stack_pointer_rtx,
9830 rounded_size - size)));
9831 emit_stack_probe (stack_pointer_rtx);
9832 }
9833
9834 /* Adjust back to account for the additional first interval. */
9835 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9836 plus_constant (Pmode, stack_pointer_rtx,
9837 PROBE_INTERVAL + dope)));
9838
9839 release_scratch_register_on_entry (&sr);
9840 }
9841
9842 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9843
9844 /* Even if the stack pointer isn't the CFA register, we need to correctly
9845 describe the adjustments made to it, in particular differentiate the
9846 frame-related ones from the frame-unrelated ones. */
9847 if (size > 0)
9848 {
9849 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9850 XVECEXP (expr, 0, 0)
9851 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9852 plus_constant (Pmode, stack_pointer_rtx, -size));
9853 XVECEXP (expr, 0, 1)
9854 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9855 plus_constant (Pmode, stack_pointer_rtx,
9856 PROBE_INTERVAL + dope + size));
9857 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9858 RTX_FRAME_RELATED_P (last) = 1;
9859
9860 cfun->machine->fs.sp_offset += size;
9861 }
9862
9863 /* Make sure nothing is scheduled before we are done. */
9864 emit_insn (gen_blockage ());
9865 }
9866
9867 /* Adjust the stack pointer up to REG while probing it. */
9868
9869 const char *
9870 output_adjust_stack_and_probe (rtx reg)
9871 {
9872 static int labelno = 0;
9873 char loop_lab[32], end_lab[32];
9874 rtx xops[2];
9875
9876 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9877 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9878
9879 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9880
9881 /* Jump to END_LAB if SP == LAST_ADDR. */
9882 xops[0] = stack_pointer_rtx;
9883 xops[1] = reg;
9884 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9885 fputs ("\tje\t", asm_out_file);
9886 assemble_name_raw (asm_out_file, end_lab);
9887 fputc ('\n', asm_out_file);
9888
9889 /* SP = SP + PROBE_INTERVAL. */
9890 xops[1] = GEN_INT (PROBE_INTERVAL);
9891 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9892
9893 /* Probe at SP. */
9894 xops[1] = const0_rtx;
9895 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9896
9897 fprintf (asm_out_file, "\tjmp\t");
9898 assemble_name_raw (asm_out_file, loop_lab);
9899 fputc ('\n', asm_out_file);
9900
9901 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9902
9903 return "";
9904 }
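/* For illustration: on a 32-bit target with a 4096-byte probe interval the
   loop above is expected to assemble to roughly (labels shown
   schematically):

	LPSRL0:	cmpl	%eax, %esp
		je	LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	LPSRL0
	LPSRE0:

   assuming the register holding LAST_ADDR is %eax.  */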
9905
9906 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9907 inclusive. These are offsets from the current stack pointer. */
9908
9909 static void
9910 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9911 {
9912 /* See if we have a constant small number of probes to generate. If so,
9913 that's the easy case. The run-time loop is made up of 7 insns in the
9914 generic case while the compile-time loop is made up of n insns for n #
9915 of intervals. */
9916 if (size <= 7 * PROBE_INTERVAL)
9917 {
9918 HOST_WIDE_INT i;
9919
9920 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9921 it exceeds SIZE. If only one probe is needed, this will not
9922 generate any code. Then probe at FIRST + SIZE. */
9923 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9924 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9925 -(first + i)));
9926
9927 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9928 -(first + size)));
9929 }
9930
9931 /* Otherwise, do the same as above, but in a loop. Note that we must be
9932 extra careful with variables wrapping around because we might be at
9933 the very top (or the very bottom) of the address space and we have
9934 to be able to handle this case properly; in particular, we use an
9935 equality test for the loop condition. */
9936 else
9937 {
9938 HOST_WIDE_INT rounded_size, last;
9939 struct scratch_reg sr;
9940
9941 get_scratch_register_on_entry (&sr);
9942
9943
9944 /* Step 1: round SIZE to the previous multiple of the interval. */
9945
9946 rounded_size = size & -PROBE_INTERVAL;
9947
9948
9949 /* Step 2: compute initial and final value of the loop counter. */
9950
9951 /* TEST_OFFSET = FIRST. */
9952 emit_move_insn (sr.reg, GEN_INT (-first));
9953
9954 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9955 last = first + rounded_size;
9956
9957
9958 /* Step 3: the loop
9959
9960 while (TEST_ADDR != LAST_ADDR)
9961 {
9962 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9963 probe at TEST_ADDR
9964 }
9965
9966 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9967 until it is equal to ROUNDED_SIZE. */
9968
9969 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9970
9971
9972 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9973 that SIZE is equal to ROUNDED_SIZE. */
9974
9975 if (size != rounded_size)
9976 emit_stack_probe (plus_constant (Pmode,
9977 gen_rtx_PLUS (Pmode,
9978 stack_pointer_rtx,
9979 sr.reg),
9980 rounded_size - size));
9981
9982 release_scratch_register_on_entry (&sr);
9983 }
9984
9985 /* Make sure nothing is scheduled before we are done. */
9986 emit_insn (gen_blockage ());
9987 }
9988
9989 /* Probe a range of stack addresses from REG to END, inclusive. These are
9990 offsets from the current stack pointer. */
9991
9992 const char *
9993 output_probe_stack_range (rtx reg, rtx end)
9994 {
9995 static int labelno = 0;
9996 char loop_lab[32], end_lab[32];
9997 rtx xops[3];
9998
9999 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10000 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10001
10002 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10003
10004 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10005 xops[0] = reg;
10006 xops[1] = end;
10007 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10008 fputs ("\tje\t", asm_out_file);
10009 assemble_name_raw (asm_out_file, end_lab);
10010 fputc ('\n', asm_out_file);
10011
10012 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10013 xops[1] = GEN_INT (PROBE_INTERVAL);
10014 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10015
10016 /* Probe at TEST_ADDR. */
10017 xops[0] = stack_pointer_rtx;
10018 xops[1] = reg;
10019 xops[2] = const0_rtx;
10020 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10021
10022 fprintf (asm_out_file, "\tjmp\t");
10023 assemble_name_raw (asm_out_file, loop_lab);
10024 fputc ('\n', asm_out_file);
10025
10026 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10027
10028 return "";
10029 }
10030
10031 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10032 to be generated in correct form. */
10033 static void
10034 ix86_finalize_stack_realign_flags (void)
10035 {
10036 /* Check if stack realignment is really needed after reload, and
10037 store the result in cfun. */
10038 unsigned int incoming_stack_boundary
10039 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10040 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10041 unsigned int stack_realign = (incoming_stack_boundary
10042 < (crtl->is_leaf
10043 ? crtl->max_used_stack_slot_alignment
10044 : crtl->stack_alignment_needed));
10045
10046 if (crtl->stack_realign_finalized)
10047 {
10048 /* After stack_realign_needed is finalized, we can no longer
10049 change it. */
10050 gcc_assert (crtl->stack_realign_needed == stack_realign);
10051 return;
10052 }
10053
10054 /* If the only reason for frame_pointer_needed is that we conservatively
10055 assumed stack realignment might be needed, but in the end nothing that
10056 needed the stack alignment had been spilled, clear frame_pointer_needed
10057 and say we don't need stack realignment. */
10058 if (stack_realign
10059 && !crtl->need_drap
10060 && frame_pointer_needed
10061 && crtl->is_leaf
10062 && flag_omit_frame_pointer
10063 && crtl->sp_is_unchanging
10064 && !ix86_current_function_calls_tls_descriptor
10065 && !crtl->accesses_prior_frames
10066 && !cfun->calls_alloca
10067 && !crtl->calls_eh_return
10068 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10069 && !ix86_frame_pointer_required ()
10070 && get_frame_size () == 0
10071 && ix86_nsaved_sseregs () == 0
10072 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10073 {
10074 HARD_REG_SET set_up_by_prologue, prologue_used;
10075 basic_block bb;
10076
10077 CLEAR_HARD_REG_SET (prologue_used);
10078 CLEAR_HARD_REG_SET (set_up_by_prologue);
10079 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10080 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10081 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10082 HARD_FRAME_POINTER_REGNUM);
10083 FOR_EACH_BB (bb)
10084 {
10085 rtx insn;
10086 FOR_BB_INSNS (bb, insn)
10087 if (NONDEBUG_INSN_P (insn)
10088 && requires_stack_frame_p (insn, prologue_used,
10089 set_up_by_prologue))
10090 {
10091 crtl->stack_realign_needed = stack_realign;
10092 crtl->stack_realign_finalized = true;
10093 return;
10094 }
10095 }
10096
10097 frame_pointer_needed = false;
10098 stack_realign = false;
10099 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10100 crtl->stack_alignment_needed = incoming_stack_boundary;
10101 crtl->stack_alignment_estimated = incoming_stack_boundary;
10102 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10103 crtl->preferred_stack_boundary = incoming_stack_boundary;
10104 df_finish_pass (true);
10105 df_scan_alloc (NULL);
10106 df_scan_blocks ();
10107 df_compute_regs_ever_live (true);
10108 df_analyze ();
10109 }
10110
10111 crtl->stack_realign_needed = stack_realign;
10112 crtl->stack_realign_finalized = true;
10113 }
10114
10115 /* Expand the prologue into a bunch of separate insns. */
10116
10117 void
10118 ix86_expand_prologue (void)
10119 {
10120 struct machine_function *m = cfun->machine;
10121 rtx insn, t;
10122 bool pic_reg_used;
10123 struct ix86_frame frame;
10124 HOST_WIDE_INT allocate;
10125 bool int_registers_saved;
10126 bool sse_registers_saved;
10127
10128 ix86_finalize_stack_realign_flags ();
10129
10130 /* DRAP should not coexist with stack_realign_fp */
10131 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10132
10133 memset (&m->fs, 0, sizeof (m->fs));
10134
10135 /* Initialize CFA state for before the prologue. */
10136 m->fs.cfa_reg = stack_pointer_rtx;
10137 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10138
10139 /* Track SP offset to the CFA. We continue tracking this after we've
10140 swapped the CFA register away from SP. In the case of re-alignment
10141 this is fudged; we're interested in offsets within the local frame. */
10142 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10143 m->fs.sp_valid = true;
10144
10145 ix86_compute_frame_layout (&frame);
10146
10147 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10148 {
10149 /* We should have already generated an error for any use of
10150 ms_hook on a nested function. */
10151 gcc_checking_assert (!ix86_static_chain_on_stack);
10152
10153 /* Check if profiling is active and we shall use the profiling-before-prologue
10154 variant. If so, sorry. */
10155 if (crtl->profile && flag_fentry != 0)
10156 sorry ("ms_hook_prologue attribute isn%'t compatible "
10157 "with -mfentry for 32-bit");
10158
10159 /* In ix86_asm_output_function_label we emitted:
10160 8b ff movl.s %edi,%edi
10161 55 push %ebp
10162 8b ec movl.s %esp,%ebp
10163
10164 This matches the hookable function prologue in Win32 API
10165 functions in Microsoft Windows XP Service Pack 2 and newer.
10166 Wine uses this to enable Windows apps to hook the Win32 API
10167 functions provided by Wine.
10168
10169 What that means is that we've already set up the frame pointer. */
10170
10171 if (frame_pointer_needed
10172 && !(crtl->drap_reg && crtl->stack_realign_needed))
10173 {
10174 rtx push, mov;
10175
10176 /* We've decided to use the frame pointer already set up.
10177 Describe this to the unwinder by pretending that both
10178 push and mov insns happen right here.
10179
10180 Putting the unwind info here at the end of the ms_hook
10181 is done so that we can make absolutely certain we get
10182 the required byte sequence at the start of the function,
10183 rather than relying on an assembler that can produce
10184 the exact encoding required.
10185
10186 However it does mean (in the unpatched case) that we have
10187 a 1 insn window where the asynchronous unwind info is
10188 incorrect. However, if we placed the unwind info at
10189 its correct location we would have incorrect unwind info
10190 in the patched case. This is probably all moot, since
10191 I don't expect Wine to generate dwarf2 unwind info for the
10192 system libraries that use this feature. */
10193
10194 insn = emit_insn (gen_blockage ());
10195
10196 push = gen_push (hard_frame_pointer_rtx);
10197 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10198 stack_pointer_rtx);
10199 RTX_FRAME_RELATED_P (push) = 1;
10200 RTX_FRAME_RELATED_P (mov) = 1;
10201
10202 RTX_FRAME_RELATED_P (insn) = 1;
10203 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10204 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10205
10206 /* Note that gen_push incremented m->fs.cfa_offset, even
10207 though we didn't emit the push insn here. */
10208 m->fs.cfa_reg = hard_frame_pointer_rtx;
10209 m->fs.fp_offset = m->fs.cfa_offset;
10210 m->fs.fp_valid = true;
10211 }
10212 else
10213 {
10214 /* The frame pointer is not needed so pop %ebp again.
10215 This leaves us with a pristine state. */
10216 emit_insn (gen_pop (hard_frame_pointer_rtx));
10217 }
10218 }
10219
10220 /* The first insn of a function that accepts its static chain on the
10221 stack is to push the register that would be filled in by a direct
10222 call. This insn will be skipped by the trampoline. */
10223 else if (ix86_static_chain_on_stack)
10224 {
10225 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10226 emit_insn (gen_blockage ());
10227
10228 /* We don't want to interpret this push insn as a register save,
10229 only as a stack adjustment. The real copy of the register as
10230 a save will be done later, if needed. */
10231 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10232 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10233 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10234 RTX_FRAME_RELATED_P (insn) = 1;
10235 }
10236
10237 /* Emit prologue code to adjust stack alignment and set up the DRAP, in case
10238 DRAP is needed and stack realignment is really needed after reload. */
10239 if (stack_realign_drap)
10240 {
10241 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10242
10243 /* Only need to push parameter pointer reg if it is caller saved. */
10244 if (!call_used_regs[REGNO (crtl->drap_reg)])
10245 {
10246 /* Push arg pointer reg */
10247 insn = emit_insn (gen_push (crtl->drap_reg));
10248 RTX_FRAME_RELATED_P (insn) = 1;
10249 }
10250
10251 /* Grab the argument pointer. */
10252 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10253 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10254 RTX_FRAME_RELATED_P (insn) = 1;
10255 m->fs.cfa_reg = crtl->drap_reg;
10256 m->fs.cfa_offset = 0;
10257
10258 /* Align the stack. */
10259 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10260 stack_pointer_rtx,
10261 GEN_INT (-align_bytes)));
10262 RTX_FRAME_RELATED_P (insn) = 1;
10263
10264 /* Replicate the return address on the stack so that return
10265 address can be reached via (argp - 1) slot. This is needed
10266 to implement macro RETURN_ADDR_RTX and intrinsic function
10267 expand_builtin_return_addr etc. */
10268 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10269 t = gen_frame_mem (word_mode, t);
10270 insn = emit_insn (gen_push (t));
10271 RTX_FRAME_RELATED_P (insn) = 1;
10272
10273 /* For the purposes of frame and register save area addressing,
10274 we've started over with a new frame. */
10275 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10276 m->fs.realigned = true;
10277 }
10278
10279 int_registers_saved = (frame.nregs == 0);
10280 sse_registers_saved = (frame.nsseregs == 0);
10281
10282 if (frame_pointer_needed && !m->fs.fp_valid)
10283 {
10284 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10285 slower on all targets. Also sdb doesn't like it. */
10286 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10287 RTX_FRAME_RELATED_P (insn) = 1;
10288
10289 /* Push registers now, before setting the frame pointer
10290 on SEH target. */
10291 if (!int_registers_saved
10292 && TARGET_SEH
10293 && !frame.save_regs_using_mov)
10294 {
10295 ix86_emit_save_regs ();
10296 int_registers_saved = true;
10297 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10298 }
10299
10300 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10301 {
10302 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10303 RTX_FRAME_RELATED_P (insn) = 1;
10304
10305 if (m->fs.cfa_reg == stack_pointer_rtx)
10306 m->fs.cfa_reg = hard_frame_pointer_rtx;
10307 m->fs.fp_offset = m->fs.sp_offset;
10308 m->fs.fp_valid = true;
10309 }
10310 }
10311
10312 if (!int_registers_saved)
10313 {
10314 /* If saving registers via PUSH, do so now. */
10315 if (!frame.save_regs_using_mov)
10316 {
10317 ix86_emit_save_regs ();
10318 int_registers_saved = true;
10319 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10320 }
10321
10322 /* When using the red zone we may start saving registers before allocating
10323 the stack frame, saving one cycle of the prologue. However, avoid
10324 doing this if we have to probe the stack; at least on x86_64 the
10325 stack probe can turn into a call that clobbers a red zone location. */
10326 else if (ix86_using_red_zone ()
10327 && (! TARGET_STACK_PROBE
10328 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10329 {
10330 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10331 int_registers_saved = true;
10332 }
10333 }
10334
10335 if (stack_realign_fp)
10336 {
10337 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10338 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10339
10340 /* The computation of the size of the re-aligned stack frame means
10341 that we must allocate the size of the register save area before
10342 performing the actual alignment. Otherwise we cannot guarantee
10343 that there's enough storage above the realignment point. */
10344 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10345 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10346 GEN_INT (m->fs.sp_offset
10347 - frame.sse_reg_save_offset),
10348 -1, false);
10349
10350 /* Align the stack. */
10351 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10352 stack_pointer_rtx,
10353 GEN_INT (-align_bytes)));
10354
10355 /* For the purposes of register save area addressing, the stack
10356 pointer is no longer valid. As for the value of sp_offset,
10357 see ix86_compute_frame_layout, which we need to match in order
10358 to pass verification of stack_pointer_offset at the end. */
10359 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10360 m->fs.sp_valid = false;
10361 }
10362
10363 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10364
10365 if (flag_stack_usage_info)
10366 {
10367 /* We start to count from ARG_POINTER. */
10368 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10369
10370 /* If it was realigned, take into account the fake frame. */
10371 if (stack_realign_drap)
10372 {
10373 if (ix86_static_chain_on_stack)
10374 stack_size += UNITS_PER_WORD;
10375
10376 if (!call_used_regs[REGNO (crtl->drap_reg)])
10377 stack_size += UNITS_PER_WORD;
10378
10379 /* This over-estimates by 1 minimal-stack-alignment-unit but
10380 mitigates that by counting in the new return address slot. */
10381 current_function_dynamic_stack_size
10382 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10383 }
10384
10385 current_function_static_stack_size = stack_size;
10386 }
10387
10388 /* On SEH target with very large frame size, allocate an area to save
10389 SSE registers (as the very large allocation won't be described). */
10390 if (TARGET_SEH
10391 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10392 && !sse_registers_saved)
10393 {
10394 HOST_WIDE_INT sse_size =
10395 frame.sse_reg_save_offset - frame.reg_save_offset;
10396
10397 gcc_assert (int_registers_saved);
10398
10399 /* No need to do stack checking as the area will be immediately
10400 written. */
10401 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10402 GEN_INT (-sse_size), -1,
10403 m->fs.cfa_reg == stack_pointer_rtx);
10404 allocate -= sse_size;
10405 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10406 sse_registers_saved = true;
10407 }
10408
10409 /* The stack has already been decremented by the instruction calling us,
10410 so probe if the size is non-negative to preserve the protection area. */
10411 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10412 {
10413 /* We expect the registers to be saved when probes are used. */
10414 gcc_assert (int_registers_saved);
10415
10416 if (STACK_CHECK_MOVING_SP)
10417 {
10418 ix86_adjust_stack_and_probe (allocate);
10419 allocate = 0;
10420 }
10421 else
10422 {
10423 HOST_WIDE_INT size = allocate;
10424
10425 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10426 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10427
10428 if (TARGET_STACK_PROBE)
10429 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10430 else
10431 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10432 }
10433 }
10434
10435 if (allocate == 0)
10436 ;
10437 else if (!ix86_target_stack_probe ()
10438 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10439 {
10440 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10441 GEN_INT (-allocate), -1,
10442 m->fs.cfa_reg == stack_pointer_rtx);
10443 }
10444 else
10445 {
10446 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10447 rtx r10 = NULL;
10448 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10449 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10450 bool eax_live = false;
10451 bool r10_live = false;
10452
10453 if (TARGET_64BIT)
10454 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10455 if (!TARGET_64BIT_MS_ABI)
10456 eax_live = ix86_eax_live_at_start_p ();
10457
10458 /* Note that SEH directives need to continue tracking the stack
10459 pointer even after the frame pointer has been set up. */
10460 if (eax_live)
10461 {
10462 insn = emit_insn (gen_push (eax));
10463 allocate -= UNITS_PER_WORD;
10464 if (sp_is_cfa_reg || TARGET_SEH)
10465 {
10466 if (sp_is_cfa_reg)
10467 m->fs.cfa_offset += UNITS_PER_WORD;
10468 RTX_FRAME_RELATED_P (insn) = 1;
10469 }
10470 }
10471
10472 if (r10_live)
10473 {
10474 r10 = gen_rtx_REG (Pmode, R10_REG);
10475 insn = emit_insn (gen_push (r10));
10476 allocate -= UNITS_PER_WORD;
10477 if (sp_is_cfa_reg || TARGET_SEH)
10478 {
10479 if (sp_is_cfa_reg)
10480 m->fs.cfa_offset += UNITS_PER_WORD;
10481 RTX_FRAME_RELATED_P (insn) = 1;
10482 }
10483 }
10484
10485 emit_move_insn (eax, GEN_INT (allocate));
10486 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
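      /* The stack worker typically expands to a call to a ___chkstk_ms-style
	 probe routine (the exact routine depends on the target ABI); it
	 touches each page of the new area but does not itself adjust the
	 stack pointer, hence the explicit subtraction below.  */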
10487
10488 /* Use the fact that AX still contains ALLOCATE. */
10489 adjust_stack_insn = (Pmode == DImode
10490 ? gen_pro_epilogue_adjust_stack_di_sub
10491 : gen_pro_epilogue_adjust_stack_si_sub);
10492
10493 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10494 stack_pointer_rtx, eax));
10495
10496 if (sp_is_cfa_reg || TARGET_SEH)
10497 {
10498 if (sp_is_cfa_reg)
10499 m->fs.cfa_offset += allocate;
10500 RTX_FRAME_RELATED_P (insn) = 1;
10501 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10502 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10503 plus_constant (Pmode, stack_pointer_rtx,
10504 -allocate)));
10505 }
10506 m->fs.sp_offset += allocate;
10507
10508 if (r10_live && eax_live)
10509 {
10510 t = choose_baseaddr (m->fs.sp_offset - allocate);
10511 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10512 gen_frame_mem (word_mode, t));
10513 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10514 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10515 gen_frame_mem (word_mode, t));
10516 }
10517 else if (eax_live || r10_live)
10518 {
10519 t = choose_baseaddr (m->fs.sp_offset - allocate);
10520 emit_move_insn (gen_rtx_REG (word_mode,
10521 (eax_live ? AX_REG : R10_REG)),
10522 gen_frame_mem (word_mode, t));
10523 }
10524 }
10525 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10526
10527 /* If we haven't already set up the frame pointer, do so now. */
10528 if (frame_pointer_needed && !m->fs.fp_valid)
10529 {
10530 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10531 GEN_INT (frame.stack_pointer_offset
10532 - frame.hard_frame_pointer_offset));
10533 insn = emit_insn (insn);
10534 RTX_FRAME_RELATED_P (insn) = 1;
10535 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10536
10537 if (m->fs.cfa_reg == stack_pointer_rtx)
10538 m->fs.cfa_reg = hard_frame_pointer_rtx;
10539 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10540 m->fs.fp_valid = true;
10541 }
10542
10543 if (!int_registers_saved)
10544 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10545 if (!sse_registers_saved)
10546 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10547
10548 pic_reg_used = false;
10549 if (pic_offset_table_rtx
10550 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10551 || crtl->profile))
10552 {
10553 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10554
10555 if (alt_pic_reg_used != INVALID_REGNUM)
10556 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10557
10558 pic_reg_used = true;
10559 }
10560
10561 if (pic_reg_used)
10562 {
10563 if (TARGET_64BIT)
10564 {
10565 if (ix86_cmodel == CM_LARGE_PIC)
10566 {
10567 rtx label, tmp_reg;
10568
10569 gcc_assert (Pmode == DImode);
10570 label = gen_label_rtx ();
10571 emit_label (label);
10572 LABEL_PRESERVE_P (label) = 1;
10573 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10574 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10575 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10576 label));
10577 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10578 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10579 pic_offset_table_rtx, tmp_reg));
10580 }
10581 else
10582 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10583 }
10584 else
10585 {
10586 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10587 RTX_FRAME_RELATED_P (insn) = 1;
10588 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10589 }
10590 }
10591
10592 /* In the pic_reg_used case, make sure that the got load isn't deleted
10593 when mcount needs it. Blockage to avoid call movement across mcount
10594 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10595 note. */
10596 if (crtl->profile && !flag_fentry && pic_reg_used)
10597 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10598
10599 if (crtl->drap_reg && !crtl->stack_realign_needed)
10600 {
10601 /* vDRAP is set up, but after reload it turns out stack realignment
10602 isn't necessary; here we emit prologue code to set up DRAP
10603 without the stack realignment adjustment. */
10604 t = choose_baseaddr (0);
10605 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10606 }
10607
10608 /* Prevent instructions from being scheduled into the register save push
10609 sequence when access to the redzone area is done through the frame pointer.
10610 The offset between the frame pointer and the stack pointer is calculated
10611 relative to the value of the stack pointer at the end of the function
10612 prologue, and moving instructions that access redzone area via frame
10613 pointer inside push sequence violates this assumption. */
10614 if (frame_pointer_needed && frame.red_zone_size)
10615 emit_insn (gen_memory_blockage ());
10616
10617 /* Emit cld instruction if stringops are used in the function. */
10618 if (TARGET_CLD && ix86_current_function_needs_cld)
10619 emit_insn (gen_cld ());
10620
10621 /* SEH requires that the prologue end within 256 bytes of the start of
10622 the function. Prevent instruction schedules that would extend that.
10623 Further, prevent alloca modifications to the stack pointer from being
10624 combined with prologue modifications. */
10625 if (TARGET_SEH)
10626 emit_insn (gen_prologue_use (stack_pointer_rtx));
10627 }
10628
10629 /* Emit code to restore REG using a POP insn. */
10630
10631 static void
10632 ix86_emit_restore_reg_using_pop (rtx reg)
10633 {
10634 struct machine_function *m = cfun->machine;
10635 rtx insn = emit_insn (gen_pop (reg));
10636
10637 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10638 m->fs.sp_offset -= UNITS_PER_WORD;
10639
10640 if (m->fs.cfa_reg == crtl->drap_reg
10641 && REGNO (reg) == REGNO (crtl->drap_reg))
10642 {
10643 /* Previously we'd represented the CFA as an expression
10644 like *(%ebp - 8). We've just popped that value from
10645 the stack, which means we need to reset the CFA to
10646 the drap register. This will remain until we restore
10647 the stack pointer. */
10648 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10649 RTX_FRAME_RELATED_P (insn) = 1;
10650
10651 /* This means that the DRAP register is valid for addressing too. */
10652 m->fs.drap_valid = true;
10653 return;
10654 }
10655
10656 if (m->fs.cfa_reg == stack_pointer_rtx)
10657 {
10658 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10659 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10660 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10661 RTX_FRAME_RELATED_P (insn) = 1;
10662
10663 m->fs.cfa_offset -= UNITS_PER_WORD;
10664 }
10665
10666 /* When the frame pointer is the CFA, and we pop it, we are
10667 swapping back to the stack pointer as the CFA. This happens
10668 for stack frames that don't allocate other data, so we assume
10669 the stack pointer is now pointing at the return address, i.e.
10670 the function entry state, which makes the offset be 1 word. */
10671 if (reg == hard_frame_pointer_rtx)
10672 {
10673 m->fs.fp_valid = false;
10674 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10675 {
10676 m->fs.cfa_reg = stack_pointer_rtx;
10677 m->fs.cfa_offset -= UNITS_PER_WORD;
10678
10679 add_reg_note (insn, REG_CFA_DEF_CFA,
10680 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10681 GEN_INT (m->fs.cfa_offset)));
10682 RTX_FRAME_RELATED_P (insn) = 1;
10683 }
10684 }
10685 }
10686
10687 /* Emit code to restore saved registers using POP insns. */
10688
10689 static void
10690 ix86_emit_restore_regs_using_pop (void)
10691 {
10692 unsigned int regno;
10693
10694 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10695 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10696 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10697 }
10698
10699 /* Emit code and notes for the LEAVE instruction. */
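/* (leave is equivalent to "mov %ebp, %esp; pop %ebp", or the 64-bit
   analogue, which is why sp_valid, sp_offset and fp_valid are all
   updated below.)  */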
10700
10701 static void
10702 ix86_emit_leave (void)
10703 {
10704 struct machine_function *m = cfun->machine;
10705 rtx insn = emit_insn (ix86_gen_leave ());
10706
10707 ix86_add_queued_cfa_restore_notes (insn);
10708
10709 gcc_assert (m->fs.fp_valid);
10710 m->fs.sp_valid = true;
10711 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10712 m->fs.fp_valid = false;
10713
10714 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10715 {
10716 m->fs.cfa_reg = stack_pointer_rtx;
10717 m->fs.cfa_offset = m->fs.sp_offset;
10718
10719 add_reg_note (insn, REG_CFA_DEF_CFA,
10720 plus_constant (Pmode, stack_pointer_rtx,
10721 m->fs.sp_offset));
10722 RTX_FRAME_RELATED_P (insn) = 1;
10723 }
10724 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10725 m->fs.fp_offset);
10726 }
10727
10728 /* Emit code to restore saved registers using MOV insns.
10729 First register is restored from CFA - CFA_OFFSET. */
10730 static void
10731 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10732 bool maybe_eh_return)
10733 {
10734 struct machine_function *m = cfun->machine;
10735 unsigned int regno;
10736
10737 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10738 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10739 {
10740 rtx reg = gen_rtx_REG (word_mode, regno);
10741 rtx insn, mem;
10742
10743 mem = choose_baseaddr (cfa_offset);
10744 mem = gen_frame_mem (word_mode, mem);
10745 insn = emit_move_insn (reg, mem);
10746
10747 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10748 {
10749 /* Previously we'd represented the CFA as an expression
10750 like *(%ebp - 8). We've just loaded that value from
10751 the stack, which means we need to reset the CFA to
10752 the drap register. This will remain until we restore
10753 the stack pointer. */
10754 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10755 RTX_FRAME_RELATED_P (insn) = 1;
10756
10757 /* This means that the DRAP register is valid for addressing. */
10758 m->fs.drap_valid = true;
10759 }
10760 else
10761 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10762
10763 cfa_offset -= UNITS_PER_WORD;
10764 }
10765 }
10766
10767 /* Emit code to restore saved SSE registers using MOV insns.
10768 First register is restored from CFA - CFA_OFFSET. */
10769 static void
10770 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10771 bool maybe_eh_return)
10772 {
10773 unsigned int regno;
10774
10775 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10776 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10777 {
10778 rtx reg = gen_rtx_REG (V4SFmode, regno);
10779 rtx mem;
10780
10781 mem = choose_baseaddr (cfa_offset);
10782 mem = gen_rtx_MEM (V4SFmode, mem);
10783 set_mem_align (mem, 128);
10784 emit_move_insn (reg, mem);
10785
10786 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10787
10788 cfa_offset -= 16;
10789 }
10790 }
10791
10792 /* Restore function stack, frame, and registers. */
10793
10794 void
10795 ix86_expand_epilogue (int style)
10796 {
10797 struct machine_function *m = cfun->machine;
10798 struct machine_frame_state frame_state_save = m->fs;
10799 struct ix86_frame frame;
10800 bool restore_regs_via_mov;
10801 bool using_drap;
10802
10803 ix86_finalize_stack_realign_flags ();
10804 ix86_compute_frame_layout (&frame);
10805
10806 m->fs.sp_valid = (!frame_pointer_needed
10807 || (crtl->sp_is_unchanging
10808 && !stack_realign_fp));
10809 gcc_assert (!m->fs.sp_valid
10810 || m->fs.sp_offset == frame.stack_pointer_offset);
10811
10812 /* The FP must be valid if the frame pointer is present. */
10813 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10814 gcc_assert (!m->fs.fp_valid
10815 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10816
10817 /* We must have *some* valid pointer to the stack frame. */
10818 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10819
10820 /* The DRAP is never valid at this point. */
10821 gcc_assert (!m->fs.drap_valid);
10822
10823 /* See the comment about red zone and frame
10824 pointer usage in ix86_expand_prologue. */
10825 if (frame_pointer_needed && frame.red_zone_size)
10826 emit_insn (gen_memory_blockage ());
10827
10828 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10829 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10830
10831 /* Determine the CFA offset of the end of the red-zone. */
10832 m->fs.red_zone_offset = 0;
10833 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10834 {
10835 /* The red-zone begins below the return address. */
10836 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10837
10838 /* When the register save area is in the aligned portion of
10839 the stack, determine the maximum runtime displacement that
10840 matches up with the aligned frame. */
10841 if (stack_realign_drap)
10842 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10843 + UNITS_PER_WORD);
10844 }
10845
10846 /* Special care must be taken for the normal return case of a function
10847 using eh_return: the eax and edx registers are marked as saved, but
10848 not restored along this path. Adjust the save location to match. */
10849 if (crtl->calls_eh_return && style != 2)
10850 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10851
10852 /* EH_RETURN requires the use of moves to function properly. */
10853 if (crtl->calls_eh_return)
10854 restore_regs_via_mov = true;
10855 /* SEH requires the use of pops to identify the epilogue. */
10856 else if (TARGET_SEH)
10857 restore_regs_via_mov = false;
10858 /* If we're only restoring one register and sp is not valid, then
10859 use a move instruction to restore the register, since it's
10860 less work than reloading sp and popping the register.
10861 else if (!m->fs.sp_valid && frame.nregs <= 1)
10862 restore_regs_via_mov = true;
10863 else if (TARGET_EPILOGUE_USING_MOVE
10864 && cfun->machine->use_fast_prologue_epilogue
10865 && (frame.nregs > 1
10866 || m->fs.sp_offset != frame.reg_save_offset))
10867 restore_regs_via_mov = true;
10868 else if (frame_pointer_needed
10869 && !frame.nregs
10870 && m->fs.sp_offset != frame.reg_save_offset)
10871 restore_regs_via_mov = true;
10872 else if (frame_pointer_needed
10873 && TARGET_USE_LEAVE
10874 && cfun->machine->use_fast_prologue_epilogue
10875 && frame.nregs == 1)
10876 restore_regs_via_mov = true;
10877 else
10878 restore_regs_via_mov = false;
10879
10880 if (restore_regs_via_mov || frame.nsseregs)
10881 {
10882 /* Ensure that the entire register save area is addressable via
10883 the stack pointer, if we will restore via sp. */
10884 if (TARGET_64BIT
10885 && m->fs.sp_offset > 0x7fffffff
10886 && !(m->fs.fp_valid || m->fs.drap_valid)
10887 && (frame.nsseregs + frame.nregs) != 0)
10888 {
10889 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10890 GEN_INT (m->fs.sp_offset
10891 - frame.sse_reg_save_offset),
10892 style,
10893 m->fs.cfa_reg == stack_pointer_rtx);
10894 }
10895 }
10896
10897 /* If there are any SSE registers to restore, then we have to do it
10898 via moves, since there's obviously no pop for SSE regs. */
10899 if (frame.nsseregs)
10900 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10901 style == 2);
10902
10903 if (restore_regs_via_mov)
10904 {
10905 rtx t;
10906
10907 if (frame.nregs)
10908 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10909
10910 /* eh_return epilogues need %ecx added to the stack pointer. */
10911 if (style == 2)
10912 {
10913 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10914
10915 /* Stack align doesn't work with eh_return. */
10916 gcc_assert (!stack_realign_drap);
10917 /* Neither do regparm nested functions. */
10918 gcc_assert (!ix86_static_chain_on_stack);
10919
10920 if (frame_pointer_needed)
10921 {
10922 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10923 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10924 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10925
10926 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10927 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10928
10929 /* Note that we use SA as a temporary CFA, as the return
10930 address is at the proper place relative to it. We
10931 pretend this happens at the FP restore insn because
10932 prior to this insn the FP would be stored at the wrong
10933 offset relative to SA, and after this insn we have no
10934 other reasonable register to use for the CFA. We don't
10935 bother resetting the CFA to the SP for the duration of
10936 the return insn. */
10937 add_reg_note (insn, REG_CFA_DEF_CFA,
10938 plus_constant (Pmode, sa, UNITS_PER_WORD));
10939 ix86_add_queued_cfa_restore_notes (insn);
10940 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10941 RTX_FRAME_RELATED_P (insn) = 1;
10942
10943 m->fs.cfa_reg = sa;
10944 m->fs.cfa_offset = UNITS_PER_WORD;
10945 m->fs.fp_valid = false;
10946
10947 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10948 const0_rtx, style, false);
10949 }
10950 else
10951 {
10952 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10953 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10954 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10955 ix86_add_queued_cfa_restore_notes (insn);
10956
10957 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10958 if (m->fs.cfa_offset != UNITS_PER_WORD)
10959 {
10960 m->fs.cfa_offset = UNITS_PER_WORD;
10961 add_reg_note (insn, REG_CFA_DEF_CFA,
10962 plus_constant (Pmode, stack_pointer_rtx,
10963 UNITS_PER_WORD));
10964 RTX_FRAME_RELATED_P (insn) = 1;
10965 }
10966 }
10967 m->fs.sp_offset = UNITS_PER_WORD;
10968 m->fs.sp_valid = true;
10969 }
10970 }
10971 else
10972 {
10973 /* SEH requires that the function end with (1) a stack adjustment
10974 if necessary, (2) a sequence of pops, and (3) a return or
10975 jump instruction. Prevent insns from the function body from
10976 being scheduled into this sequence. */
10977 if (TARGET_SEH)
10978 {
10979 /* Prevent a catch region from being adjacent to the standard
10980 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10981 several other flags that would be interesting to test are
10982 set up yet. */
10983 if (flag_non_call_exceptions)
10984 emit_insn (gen_nops (const1_rtx));
10985 else
10986 emit_insn (gen_blockage ());
10987 }
10988
10989 /* First step is to deallocate the stack frame so that we can
10990 pop the registers. Also do it on SEH target for very large
10991 frame as the emitted instructions aren't allowed by the ABI in
10992 epilogues. */
10993 if (!m->fs.sp_valid
10994 || (TARGET_SEH
10995 && (m->fs.sp_offset - frame.reg_save_offset
10996 >= SEH_MAX_FRAME_SIZE)))
10997 {
10998 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10999 GEN_INT (m->fs.fp_offset
11000 - frame.reg_save_offset),
11001 style, false);
11002 }
11003 else if (m->fs.sp_offset != frame.reg_save_offset)
11004 {
11005 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11006 GEN_INT (m->fs.sp_offset
11007 - frame.reg_save_offset),
11008 style,
11009 m->fs.cfa_reg == stack_pointer_rtx);
11010 }
11011
11012 ix86_emit_restore_regs_using_pop ();
11013 }
11014
11015 /* If we used a frame pointer and haven't already got rid of it,
11016 then do so now. */
11017 if (m->fs.fp_valid)
11018 {
11019 /* If the stack pointer is valid and pointing at the frame
11020 pointer store address, then we only need a pop. */
11021 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11022 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11023 /* Leave results in shorter dependency chains on CPUs that are
11024 able to grok it fast. */
11025 else if (TARGET_USE_LEAVE
11026 || optimize_function_for_size_p (cfun)
11027 || !cfun->machine->use_fast_prologue_epilogue)
11028 ix86_emit_leave ();
11029 else
11030 {
11031 pro_epilogue_adjust_stack (stack_pointer_rtx,
11032 hard_frame_pointer_rtx,
11033 const0_rtx, style, !using_drap);
11034 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11035 }
11036 }
11037
11038 if (using_drap)
11039 {
11040 int param_ptr_offset = UNITS_PER_WORD;
11041 rtx insn;
11042
11043 gcc_assert (stack_realign_drap);
11044
11045 if (ix86_static_chain_on_stack)
11046 param_ptr_offset += UNITS_PER_WORD;
11047 if (!call_used_regs[REGNO (crtl->drap_reg)])
11048 param_ptr_offset += UNITS_PER_WORD;
11049
11050 insn = emit_insn (gen_rtx_SET
11051 (VOIDmode, stack_pointer_rtx,
11052 gen_rtx_PLUS (Pmode,
11053 crtl->drap_reg,
11054 GEN_INT (-param_ptr_offset))));
11055 m->fs.cfa_reg = stack_pointer_rtx;
11056 m->fs.cfa_offset = param_ptr_offset;
11057 m->fs.sp_offset = param_ptr_offset;
11058 m->fs.realigned = false;
11059
11060 add_reg_note (insn, REG_CFA_DEF_CFA,
11061 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11062 GEN_INT (param_ptr_offset)));
11063 RTX_FRAME_RELATED_P (insn) = 1;
11064
11065 if (!call_used_regs[REGNO (crtl->drap_reg)])
11066 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11067 }
11068
11069 /* At this point the stack pointer must be valid, and we must have
11070 restored all of the registers. We may not have deallocated the
11071 entire stack frame. We've delayed this until now because it may
11072 be possible to merge the local stack deallocation with the
11073 deallocation forced by ix86_static_chain_on_stack. */
11074 gcc_assert (m->fs.sp_valid);
11075 gcc_assert (!m->fs.fp_valid);
11076 gcc_assert (!m->fs.realigned);
11077 if (m->fs.sp_offset != UNITS_PER_WORD)
11078 {
11079 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11080 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11081 style, true);
11082 }
11083 else
11084 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11085
11086 /* Sibcall epilogues don't want a return instruction. */
11087 if (style == 0)
11088 {
11089 m->fs = frame_state_save;
11090 return;
11091 }
11092
11093 if (crtl->args.pops_args && crtl->args.size)
11094 {
11095 rtx popc = GEN_INT (crtl->args.pops_args);
11096
11097 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11098 address, do an explicit add, and jump indirectly to the caller. */
11099
11100 if (crtl->args.pops_args >= 65536)
11101 {
11102 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11103 rtx insn;
11104
11105 /* There is no "pascal" calling convention in any 64bit ABI. */
11106 gcc_assert (!TARGET_64BIT);
11107
11108 insn = emit_insn (gen_pop (ecx));
11109 m->fs.cfa_offset -= UNITS_PER_WORD;
11110 m->fs.sp_offset -= UNITS_PER_WORD;
11111
11112 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11113 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11114 add_reg_note (insn, REG_CFA_REGISTER,
11115 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11116 RTX_FRAME_RELATED_P (insn) = 1;
11117
11118 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11119 popc, -1, true);
11120 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11121 }
11122 else
11123 emit_jump_insn (gen_simple_return_pop_internal (popc));
11124 }
11125 else
11126 emit_jump_insn (gen_simple_return_internal ());
11127
11128 /* Restore the state back to the state from the prologue,
11129 so that it's correct for the next epilogue. */
11130 m->fs = frame_state_save;
11131 }
11132
11133 /* Reset from the function's potential modifications. */
11134
11135 static void
11136 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11137 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11138 {
11139 if (pic_offset_table_rtx)
11140 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11141 #if TARGET_MACHO
11142 /* Mach-O doesn't support labels at the end of objects, so if
11143 it looks like we might want one, insert a NOP. */
11144 {
11145 rtx insn = get_last_insn ();
11146 rtx deleted_debug_label = NULL_RTX;
11147 while (insn
11148 && NOTE_P (insn)
11149 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11150 {
11151 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11152 notes; instead set their CODE_LABEL_NUMBER to -1.
11153 Otherwise there would be code generation differences
11154 between -g and -g0. */
11155 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11156 deleted_debug_label = insn;
11157 insn = PREV_INSN (insn);
11158 }
11159 if (insn
11160 && (LABEL_P (insn)
11161 || (NOTE_P (insn)
11162 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11163 fputs ("\tnop\n", file);
11164 else if (deleted_debug_label)
11165 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11166 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11167 CODE_LABEL_NUMBER (insn) = -1;
11168 }
11169 #endif
11170
11171 }
11172
11173 /* Return a scratch register to use in the split stack prologue. The
11174 split stack prologue is used for -fsplit-stack. It consists of the first
11175 instructions in the function, even before the regular prologue.
11176 The scratch register can be any caller-saved register which is not
11177 used for parameters or for the static chain. */
11178
11179 static unsigned int
11180 split_stack_prologue_scratch_regno (void)
11181 {
11182 if (TARGET_64BIT)
11183 return R11_REG;
11184 else
11185 {
11186 bool is_fastcall, is_thiscall;
11187 int regparm;
11188
11189 is_fastcall = (lookup_attribute ("fastcall",
11190 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11191 != NULL);
11192 is_thiscall = (lookup_attribute ("thiscall",
11193 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11194 != NULL);
11195 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11196
11197 if (is_fastcall)
11198 {
11199 if (DECL_STATIC_CHAIN (cfun->decl))
11200 {
11201 sorry ("-fsplit-stack does not support fastcall with "
11202 "nested function");
11203 return INVALID_REGNUM;
11204 }
11205 return AX_REG;
11206 }
11207 else if (is_thiscall)
11208 {
11209 if (!DECL_STATIC_CHAIN (cfun->decl))
11210 return DX_REG;
11211 return AX_REG;
11212 }
11213 else if (regparm < 3)
11214 {
11215 if (!DECL_STATIC_CHAIN (cfun->decl))
11216 return CX_REG;
11217 else
11218 {
11219 if (regparm >= 2)
11220 {
11221 sorry ("-fsplit-stack does not support 2 register "
11222 "parameters for a nested function");
11223 return INVALID_REGNUM;
11224 }
11225 return DX_REG;
11226 }
11227 }
11228 else
11229 {
11230 /* FIXME: We could make this work by pushing a register
11231 around the addition and comparison. */
11232 sorry ("-fsplit-stack does not support 3 register parameters");
11233 return INVALID_REGNUM;
11234 }
11235 }
11236 }
11237
11238 /* A SYMBOL_REF for the function which allocates new stack space for
11239 -fsplit-stack. */
11240
11241 static GTY(()) rtx split_stack_fn;
11242
11243 /* A SYMBOL_REF for the more-stack function to call when using the large
11244 model. */
11245
11246 static GTY(()) rtx split_stack_fn_large;
11247
11248 /* Handle -fsplit-stack. These are the first instructions in the
11249 function, even before the regular prologue. */
11250
11251 void
11252 ix86_expand_split_stack_prologue (void)
11253 {
11254 struct ix86_frame frame;
11255 HOST_WIDE_INT allocate;
11256 unsigned HOST_WIDE_INT args_size;
11257 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11258 rtx scratch_reg = NULL_RTX;
11259 rtx varargs_label = NULL_RTX;
11260 rtx fn;
11261
11262 gcc_assert (flag_split_stack && reload_completed);
11263
11264 ix86_finalize_stack_realign_flags ();
11265 ix86_compute_frame_layout (&frame);
11266 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11267
11268 /* This is the label we will branch to if we have enough stack
11269 space. We expect the basic block reordering pass to reverse this
11270 branch if optimizing, so that we branch in the unlikely case. */
11271 label = gen_label_rtx ();
11272
11273 /* We need to compare the stack pointer minus the frame size with
11274 the stack boundary in the TCB. The stack boundary always gives
11275 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11276 can compare directly. Otherwise we need to do an addition. */
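/* A rough sketch of the 64-bit comparison (the TCB slot offset shown is
   purely illustrative; the real location comes from UNSPEC_STACK_CHECK):
	lea    -FRAME(%rsp), %r11   # only if FRAME >= SPLIT_STACK_AVAILABLE
	cmp    %fs:0x70, %r11       # stack boundary in the TCB
	jae    .Lhave_enough_stack
   with the call to __morestack emitted on the fall-through path.  */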
11277
11278 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11279 UNSPEC_STACK_CHECK);
11280 limit = gen_rtx_CONST (Pmode, limit);
11281 limit = gen_rtx_MEM (Pmode, limit);
11282 if (allocate < SPLIT_STACK_AVAILABLE)
11283 current = stack_pointer_rtx;
11284 else
11285 {
11286 unsigned int scratch_regno;
11287 rtx offset;
11288
11289 /* We need a scratch register to hold the stack pointer minus
11290 the required frame size. Since this is the very start of the
11291 function, the scratch register can be any caller-saved
11292 register which is not used for parameters. */
11293 offset = GEN_INT (- allocate);
11294 scratch_regno = split_stack_prologue_scratch_regno ();
11295 if (scratch_regno == INVALID_REGNUM)
11296 return;
11297 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11298 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11299 {
11300 /* We don't use ix86_gen_add3 in this case because it will
11301 want to split to lea, but when not optimizing the insn
11302 will not be split after this point. */
11303 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11304 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11305 offset)));
11306 }
11307 else
11308 {
11309 emit_move_insn (scratch_reg, offset);
11310 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11311 stack_pointer_rtx));
11312 }
11313 current = scratch_reg;
11314 }
11315
11316 ix86_expand_branch (GEU, current, limit, label);
11317 jump_insn = get_last_insn ();
11318 JUMP_LABEL (jump_insn) = label;
11319
11320 /* Mark the jump as very likely to be taken. */
11321 add_reg_note (jump_insn, REG_BR_PROB,
11322 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11323
11324 if (split_stack_fn == NULL_RTX)
11325 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11326 fn = split_stack_fn;
11327
11328 /* Get more stack space. We pass in the desired stack space and the
11329 size of the arguments to copy to the new stack. In 32-bit mode
11330 we push the parameters; __morestack will return on a new stack
11331 anyhow. In 64-bit mode we pass the parameters in r10 and
11332 r11. */
11333 allocate_rtx = GEN_INT (allocate);
11334 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11335 call_fusage = NULL_RTX;
11336 if (TARGET_64BIT)
11337 {
11338 rtx reg10, reg11;
11339
11340 reg10 = gen_rtx_REG (Pmode, R10_REG);
11341 reg11 = gen_rtx_REG (Pmode, R11_REG);
11342
11343 /* If this function uses a static chain, it will be in %r10.
11344 Preserve it across the call to __morestack. */
11345 if (DECL_STATIC_CHAIN (cfun->decl))
11346 {
11347 rtx rax;
11348
11349 rax = gen_rtx_REG (word_mode, AX_REG);
11350 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11351 use_reg (&call_fusage, rax);
11352 }
11353
11354 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11355 {
11356 HOST_WIDE_INT argval;
11357
11358 gcc_assert (Pmode == DImode);
11359 /* When using the large model we need to load the address
11360 into a register, and we've run out of registers. So we
11361 switch to a different calling convention, and we call a
11362 different function: __morestack_large. We pass the
11363 argument size in the upper 32 bits of r10 and pass the
11364 frame size in the lower 32 bits. */
11365 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11366 gcc_assert ((args_size & 0xffffffff) == args_size);
11367
11368 if (split_stack_fn_large == NULL_RTX)
11369 split_stack_fn_large =
11370 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11371
11372 if (ix86_cmodel == CM_LARGE_PIC)
11373 {
11374 rtx label, x;
11375
11376 label = gen_label_rtx ();
11377 emit_label (label);
11378 LABEL_PRESERVE_P (label) = 1;
11379 emit_insn (gen_set_rip_rex64 (reg10, label));
11380 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11381 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11382 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11383 UNSPEC_GOT);
11384 x = gen_rtx_CONST (Pmode, x);
11385 emit_move_insn (reg11, x);
11386 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11387 x = gen_const_mem (Pmode, x);
11388 emit_move_insn (reg11, x);
11389 }
11390 else
11391 emit_move_insn (reg11, split_stack_fn_large);
11392
11393 fn = reg11;
11394
11395 argval = ((args_size << 16) << 16) + allocate;
11396 emit_move_insn (reg10, GEN_INT (argval));
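	  /* For example (values purely illustrative): args_size == 0x20 and
	     allocate == 0x1000 give argval == 0x0000002000001000, i.e. the
	     argument size in bits 63:32 of %r10 and the frame size in
	     bits 31:0.  */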
11397 }
11398 else
11399 {
11400 emit_move_insn (reg10, allocate_rtx);
11401 emit_move_insn (reg11, GEN_INT (args_size));
11402 use_reg (&call_fusage, reg11);
11403 }
11404
11405 use_reg (&call_fusage, reg10);
11406 }
11407 else
11408 {
11409 emit_insn (gen_push (GEN_INT (args_size)));
11410 emit_insn (gen_push (allocate_rtx));
11411 }
11412 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11413 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11414 NULL_RTX, false);
11415 add_function_usage_to (call_insn, call_fusage);
11416
11417 /* In order to make call/return prediction work right, we now need
11418 to execute a return instruction. See
11419 libgcc/config/i386/morestack.S for the details on how this works.
11420
11421 For flow purposes gcc must not see this as a return
11422 instruction--we need control flow to continue at the subsequent
11423 label. Therefore, we use an unspec. */
11424 gcc_assert (crtl->args.pops_args < 65536);
11425 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11426
11427 /* If we are in 64-bit mode and this function uses a static chain,
11428 we saved %r10 in %rax before calling __morestack. */
11429 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11430 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11431 gen_rtx_REG (word_mode, AX_REG));
11432
11433 /* If this function calls va_start, we need to store a pointer to
11434 the arguments on the old stack, because they may not have been
11435 all copied to the new stack. At this point the old stack can be
11436 found at the frame pointer value used by __morestack, because
11437 __morestack has set that up before calling back to us. Here we
11438 store that pointer in a scratch register, and in
11439 ix86_expand_prologue we store the scratch register in a stack
11440 slot. */
11441 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11442 {
11443 unsigned int scratch_regno;
11444 rtx frame_reg;
11445 int words;
11446
11447 scratch_regno = split_stack_prologue_scratch_regno ();
11448 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11449 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11450
11451 /* 64-bit:
11452 fp -> old fp value
11453 return address within this function
11454 return address of caller of this function
11455 stack arguments
11456 So we add three words to get to the stack arguments.
11457
11458 32-bit:
11459 fp -> old fp value
11460 return address within this function
11461 first argument to __morestack
11462 second argument to __morestack
11463 return address of caller of this function
11464 stack arguments
11465 So we add five words to get to the stack arguments.
11466 */
11467 words = TARGET_64BIT ? 3 : 5;
11468 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11469 gen_rtx_PLUS (Pmode, frame_reg,
11470 GEN_INT (words * UNITS_PER_WORD))));
11471
11472 varargs_label = gen_label_rtx ();
11473 emit_jump_insn (gen_jump (varargs_label));
11474 JUMP_LABEL (get_last_insn ()) = varargs_label;
11475
11476 emit_barrier ();
11477 }
11478
11479 emit_label (label);
11480 LABEL_NUSES (label) = 1;
11481
11482 /* If this function calls va_start, we now have to set the scratch
11483 register for the case where we do not call __morestack. In this
11484 case we need to set it based on the stack pointer. */
11485 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11486 {
11487 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11488 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11489 GEN_INT (UNITS_PER_WORD))));
11490
11491 emit_label (varargs_label);
11492 LABEL_NUSES (varargs_label) = 1;
11493 }
11494 }
11495
11496 /* We may have to tell the dataflow pass that the split stack prologue
11497 is initializing a scratch register. */
11498
11499 static void
11500 ix86_live_on_entry (bitmap regs)
11501 {
11502 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11503 {
11504 gcc_assert (flag_split_stack);
11505 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11506 }
11507 }
11508 \f
11509 /* Determine if op is a suitable SUBREG RTX for an address. */
11510
11511 static bool
11512 ix86_address_subreg_operand (rtx op)
11513 {
11514 enum machine_mode mode;
11515
11516 if (!REG_P (op))
11517 return false;
11518
11519 mode = GET_MODE (op);
11520
11521 if (GET_MODE_CLASS (mode) != MODE_INT)
11522 return false;
11523
11524 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11525 failures when the register is one word out of a two word structure. */
11526 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11527 return false;
11528
11529 /* Allow only SUBREGs of non-eliminable hard registers. */
11530 return register_no_elim_operand (op, mode);
11531 }
11532
11533 /* Extract the parts of an RTL expression that is a valid memory address
11534 for an instruction. Return 0 if the structure of the address is
11535 grossly off. Return -1 if the address contains ASHIFT, so it is not
11536 strictly valid, but still used for computing the length of the lea instruction. */
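/* For example (illustrative only), an address such as that of a[i] with
   4-byte elements typically arrives as

     (plus (plus (mult (reg index) (const_int 4))
		 (reg base))
	   (const_int disp))

   and decomposes into out->base = base, out->index = index,
   out->scale = 4, out->disp = disp and out->seg = SEG_DEFAULT.  */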
11537
11538 int
11539 ix86_decompose_address (rtx addr, struct ix86_address *out)
11540 {
11541 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11542 rtx base_reg, index_reg;
11543 HOST_WIDE_INT scale = 1;
11544 rtx scale_rtx = NULL_RTX;
11545 rtx tmp;
11546 int retval = 1;
11547 enum ix86_address_seg seg = SEG_DEFAULT;
11548
11549 /* Allow zero-extended SImode addresses;
11550 they will be emitted with an addr32 prefix. */
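  /* E.g. (illustrative) a DImode address of the form
     (zero_extend:DI (reg:SI %eax)) ends up using the 32-bit register
     form "(%eax)", which the assembler encodes with the 0x67 address
     size (addr32) prefix.  */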
11551 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11552 {
11553 if (GET_CODE (addr) == ZERO_EXTEND
11554 && GET_MODE (XEXP (addr, 0)) == SImode)
11555 {
11556 addr = XEXP (addr, 0);
11557 if (CONST_INT_P (addr))
11558 return 0;
11559 }
11560 else if (GET_CODE (addr) == AND
11561 && const_32bit_mask (XEXP (addr, 1), DImode))
11562 {
11563 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11564 if (addr == NULL_RTX)
11565 return 0;
11566
11567 if (CONST_INT_P (addr))
11568 return 0;
11569 }
11570 }
11571
11572 /* Allow SImode subregs of DImode addresses;
11573 they will be emitted with an addr32 prefix. */
11574 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11575 {
11576 if (GET_CODE (addr) == SUBREG
11577 && GET_MODE (SUBREG_REG (addr)) == DImode)
11578 {
11579 addr = SUBREG_REG (addr);
11580 if (CONST_INT_P (addr))
11581 return 0;
11582 }
11583 }
11584
11585 if (REG_P (addr))
11586 base = addr;
11587 else if (GET_CODE (addr) == SUBREG)
11588 {
11589 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11590 base = addr;
11591 else
11592 return 0;
11593 }
11594 else if (GET_CODE (addr) == PLUS)
11595 {
11596 rtx addends[4], op;
11597 int n = 0, i;
11598
11599 op = addr;
11600 do
11601 {
11602 if (n >= 4)
11603 return 0;
11604 addends[n++] = XEXP (op, 1);
11605 op = XEXP (op, 0);
11606 }
11607 while (GET_CODE (op) == PLUS);
11608 if (n >= 4)
11609 return 0;
11610 addends[n] = op;
11611
11612 for (i = n; i >= 0; --i)
11613 {
11614 op = addends[i];
11615 switch (GET_CODE (op))
11616 {
11617 case MULT:
11618 if (index)
11619 return 0;
11620 index = XEXP (op, 0);
11621 scale_rtx = XEXP (op, 1);
11622 break;
11623
11624 case ASHIFT:
11625 if (index)
11626 return 0;
11627 index = XEXP (op, 0);
11628 tmp = XEXP (op, 1);
11629 if (!CONST_INT_P (tmp))
11630 return 0;
11631 scale = INTVAL (tmp);
11632 if ((unsigned HOST_WIDE_INT) scale > 3)
11633 return 0;
11634 scale = 1 << scale;
11635 break;
11636
11637 case ZERO_EXTEND:
11638 op = XEXP (op, 0);
11639 if (GET_CODE (op) != UNSPEC)
11640 return 0;
11641 /* FALLTHRU */
11642
11643 case UNSPEC:
11644 if (XINT (op, 1) == UNSPEC_TP
11645 && TARGET_TLS_DIRECT_SEG_REFS
11646 && seg == SEG_DEFAULT)
11647 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11648 else
11649 return 0;
11650 break;
11651
11652 case SUBREG:
11653 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11654 return 0;
11655 /* FALLTHRU */
11656
11657 case REG:
11658 if (!base)
11659 base = op;
11660 else if (!index)
11661 index = op;
11662 else
11663 return 0;
11664 break;
11665
11666 case CONST:
11667 case CONST_INT:
11668 case SYMBOL_REF:
11669 case LABEL_REF:
11670 if (disp)
11671 return 0;
11672 disp = op;
11673 break;
11674
11675 default:
11676 return 0;
11677 }
11678 }
11679 }
11680 else if (GET_CODE (addr) == MULT)
11681 {
11682 index = XEXP (addr, 0); /* index*scale */
11683 scale_rtx = XEXP (addr, 1);
11684 }
11685 else if (GET_CODE (addr) == ASHIFT)
11686 {
11687 /* We're called for lea too, which implements ashift on occasion. */
11688 index = XEXP (addr, 0);
11689 tmp = XEXP (addr, 1);
11690 if (!CONST_INT_P (tmp))
11691 return 0;
11692 scale = INTVAL (tmp);
11693 if ((unsigned HOST_WIDE_INT) scale > 3)
11694 return 0;
11695 scale = 1 << scale;
11696 retval = -1;
11697 }
11698 else if (CONST_INT_P (addr))
11699 {
11700 if (!x86_64_immediate_operand (addr, VOIDmode))
11701 return 0;
11702
11703 /* Constant addresses are sign-extended to 64 bits, so we have to
11704 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11705 if (TARGET_X32
11706 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11707 return 0;
11708
11709 disp = addr;
11710 }
11711 else
11712 disp = addr; /* displacement */
11713
11714 if (index)
11715 {
11716 if (REG_P (index))
11717 ;
11718 else if (GET_CODE (index) == SUBREG
11719 && ix86_address_subreg_operand (SUBREG_REG (index)))
11720 ;
11721 else
11722 return 0;
11723 }
11724
11725 /* Address override works only on the (%reg) part of %fs:(%reg). */
11726 if (seg != SEG_DEFAULT
11727 && ((base && GET_MODE (base) != word_mode)
11728 || (index && GET_MODE (index) != word_mode)))
11729 return 0;
11730
11731 /* Extract the integral value of scale. */
11732 if (scale_rtx)
11733 {
11734 if (!CONST_INT_P (scale_rtx))
11735 return 0;
11736 scale = INTVAL (scale_rtx);
11737 }
11738
11739 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11740 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11741
11742 /* Avoid useless 0 displacement. */
11743 if (disp == const0_rtx && (base || index))
11744 disp = NULL_RTX;
11745
11746 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11747 if (base_reg && index_reg && scale == 1
11748 && (index_reg == arg_pointer_rtx
11749 || index_reg == frame_pointer_rtx
11750 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11751 {
11752 rtx tmp;
11753 tmp = base, base = index, index = tmp;
11754 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11755 }
11756
11757 /* Special case: %ebp cannot be encoded as a base without a displacement.
11758 Similarly %r13. */
11759 if (!disp
11760 && base_reg
11761 && (base_reg == hard_frame_pointer_rtx
11762 || base_reg == frame_pointer_rtx
11763 || base_reg == arg_pointer_rtx
11764 || (REG_P (base_reg)
11765 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11766 || REGNO (base_reg) == R13_REG))))
11767 disp = const0_rtx;
11768
11769 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11770 Avoid this by transforming to [%esi+0].
11771 Reload calls address legitimization without cfun defined, so we need
11772 to test cfun for being non-NULL. */
11773 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11774 && base_reg && !index_reg && !disp
11775 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11776 disp = const0_rtx;
11777
11778 /* Special case: encode reg+reg instead of reg*2. */
11779 if (!base && index && scale == 2)
11780 base = index, base_reg = index_reg, scale = 1;
11781
11782 /* Special case: scaling cannot be encoded without base or displacement. */
11783 if (!base && !disp && index && scale != 1)
11784 disp = const0_rtx;
11785
11786 out->base = base;
11787 out->index = index;
11788 out->disp = disp;
11789 out->scale = scale;
11790 out->seg = seg;
11791
11792 return retval;
11793 }
11794 \f
11795 /* Return cost of the memory address x.
11796 For i386, it is better to use a complex address than let gcc copy
11797 the address into a reg and make a new pseudo. But not if the address
11798 requires two regs - that would mean more pseudos with longer
11799 lifetimes. */
11800 static int
11801 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11802 addr_space_t as ATTRIBUTE_UNUSED,
11803 bool speed ATTRIBUTE_UNUSED)
11804 {
11805 struct ix86_address parts;
11806 int cost = 1;
11807 int ok = ix86_decompose_address (x, &parts);
11808
11809 gcc_assert (ok);
11810
11811 if (parts.base && GET_CODE (parts.base) == SUBREG)
11812 parts.base = SUBREG_REG (parts.base);
11813 if (parts.index && GET_CODE (parts.index) == SUBREG)
11814 parts.index = SUBREG_REG (parts.index);
11815
11816 /* Attempt to minimize number of registers in the address. */
11817 if ((parts.base
11818 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11819 || (parts.index
11820 && (!REG_P (parts.index)
11821 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11822 cost++;
11823
11824 if (parts.base
11825 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11826 && parts.index
11827 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11828 && parts.base != parts.index)
11829 cost++;
11830
11831 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11832 since its predecode logic can't detect the length of such instructions
11833 and they degenerate to vector decoding. Increase the cost of such
11834 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11835 to split such addresses or even refuse them at all.
11836
11837 Following addressing modes are affected:
11838 [base+scale*index]
11839 [scale*index+disp]
11840 [base+index]
11841
11842 The first and last case may be avoidable by explicitly coding the zero in
11843 the memory address, but I don't have an AMD-K6 machine handy to check this
11844 theory. */
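/* In AT&T syntax, with purely illustrative operands, those three forms
   correspond to (%eax,%ecx,2), 4(,%ecx,2) and (%eax,%ecx) respectively.  */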
11845
11846 if (TARGET_K6
11847 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11848 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11849 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11850 cost += 10;
11851
11852 return cost;
11853 }
11854 \f
11855 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11856 this is used to form addresses to local data when -fPIC is in
11857 use. */
11858
11859 static bool
11860 darwin_local_data_pic (rtx disp)
11861 {
11862 return (GET_CODE (disp) == UNSPEC
11863 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11864 }
11865
11866 /* Determine if a given RTX is a valid constant. We already know this
11867 satisfies CONSTANT_P. */
11868
11869 static bool
11870 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11871 {
11872 switch (GET_CODE (x))
11873 {
11874 case CONST:
11875 x = XEXP (x, 0);
11876
11877 if (GET_CODE (x) == PLUS)
11878 {
11879 if (!CONST_INT_P (XEXP (x, 1)))
11880 return false;
11881 x = XEXP (x, 0);
11882 }
11883
11884 if (TARGET_MACHO && darwin_local_data_pic (x))
11885 return true;
11886
11887 /* Only some unspecs are valid as "constants". */
11888 if (GET_CODE (x) == UNSPEC)
11889 switch (XINT (x, 1))
11890 {
11891 case UNSPEC_GOT:
11892 case UNSPEC_GOTOFF:
11893 case UNSPEC_PLTOFF:
11894 return TARGET_64BIT;
11895 case UNSPEC_TPOFF:
11896 case UNSPEC_NTPOFF:
11897 x = XVECEXP (x, 0, 0);
11898 return (GET_CODE (x) == SYMBOL_REF
11899 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11900 case UNSPEC_DTPOFF:
11901 x = XVECEXP (x, 0, 0);
11902 return (GET_CODE (x) == SYMBOL_REF
11903 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11904 default:
11905 return false;
11906 }
11907
11908 /* We must have drilled down to a symbol. */
11909 if (GET_CODE (x) == LABEL_REF)
11910 return true;
11911 if (GET_CODE (x) != SYMBOL_REF)
11912 return false;
11913 /* FALLTHRU */
11914
11915 case SYMBOL_REF:
11916 /* TLS symbols are never valid. */
11917 if (SYMBOL_REF_TLS_MODEL (x))
11918 return false;
11919
11920 /* DLLIMPORT symbols are never valid. */
11921 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11922 && SYMBOL_REF_DLLIMPORT_P (x))
11923 return false;
11924
11925 #if TARGET_MACHO
11926 /* mdynamic-no-pic */
11927 if (MACHO_DYNAMIC_NO_PIC_P)
11928 return machopic_symbol_defined_p (x);
11929 #endif
11930 break;
11931
11932 case CONST_DOUBLE:
11933 if (GET_MODE (x) == TImode
11934 && x != CONST0_RTX (TImode)
11935 && !TARGET_64BIT)
11936 return false;
11937 break;
11938
11939 case CONST_VECTOR:
11940 if (!standard_sse_constant_p (x))
11941 return false;
11942
11943 default:
11944 break;
11945 }
11946
11947 /* Otherwise we handle everything else in the move patterns. */
11948 return true;
11949 }
11950
11951 /* Determine if it's legal to put X into the constant pool. This
11952 is not possible for the address of thread-local symbols, which
11953 is checked above. */
11954
11955 static bool
11956 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11957 {
11958 /* We can always put integral constants and vectors in memory. */
11959 switch (GET_CODE (x))
11960 {
11961 case CONST_INT:
11962 case CONST_DOUBLE:
11963 case CONST_VECTOR:
11964 return false;
11965
11966 default:
11967 break;
11968 }
11969 return !ix86_legitimate_constant_p (mode, x);
11970 }
11971
11972
11973 /* Nonzero if the constant value X is a legitimate general operand
11974 when generating PIC code. It is given that flag_pic is on and
11975 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11976
11977 bool
11978 legitimate_pic_operand_p (rtx x)
11979 {
11980 rtx inner;
11981
11982 switch (GET_CODE (x))
11983 {
11984 case CONST:
11985 inner = XEXP (x, 0);
11986 if (GET_CODE (inner) == PLUS
11987 && CONST_INT_P (XEXP (inner, 1)))
11988 inner = XEXP (inner, 0);
11989
11990 /* Only some unspecs are valid as "constants". */
11991 if (GET_CODE (inner) == UNSPEC)
11992 switch (XINT (inner, 1))
11993 {
11994 case UNSPEC_GOT:
11995 case UNSPEC_GOTOFF:
11996 case UNSPEC_PLTOFF:
11997 return TARGET_64BIT;
11998 case UNSPEC_TPOFF:
11999 x = XVECEXP (inner, 0, 0);
12000 return (GET_CODE (x) == SYMBOL_REF
12001 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12002 case UNSPEC_MACHOPIC_OFFSET:
12003 return legitimate_pic_address_disp_p (x);
12004 default:
12005 return false;
12006 }
12007 /* FALLTHRU */
12008
12009 case SYMBOL_REF:
12010 case LABEL_REF:
12011 return legitimate_pic_address_disp_p (x);
12012
12013 default:
12014 return true;
12015 }
12016 }
12017
12018 /* Determine if a given CONST RTX is a valid memory displacement
12019 in PIC mode. */
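/* A hedged illustration (assumed typical shapes, not an exhaustive list):
   in 32-bit PIC code a displacement accepted below usually looks like
     (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF))
   or the same unspec wrapped in a PLUS with a CONST_INT offset, while a
   bare (symbol_ref "foo") is rejected because it lacks the UNSPEC
   wrapper.  */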
12020
12021 bool
12022 legitimate_pic_address_disp_p (rtx disp)
12023 {
12024 bool saw_plus;
12025
12026 /* In 64bit mode we can allow direct addresses of symbols and labels
12027 when they are not dynamic symbols. */
12028 if (TARGET_64BIT)
12029 {
12030 rtx op0 = disp, op1;
12031
12032 switch (GET_CODE (disp))
12033 {
12034 case LABEL_REF:
12035 return true;
12036
12037 case CONST:
12038 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12039 break;
12040 op0 = XEXP (XEXP (disp, 0), 0);
12041 op1 = XEXP (XEXP (disp, 0), 1);
12042 if (!CONST_INT_P (op1)
12043 || INTVAL (op1) >= 16*1024*1024
12044 || INTVAL (op1) < -16*1024*1024)
12045 break;
12046 if (GET_CODE (op0) == LABEL_REF)
12047 return true;
12048 if (GET_CODE (op0) == CONST
12049 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12050 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12051 return true;
12052 if (GET_CODE (op0) == UNSPEC
12053 && XINT (op0, 1) == UNSPEC_PCREL)
12054 return true;
12055 if (GET_CODE (op0) != SYMBOL_REF)
12056 break;
12057 /* FALLTHRU */
12058
12059 case SYMBOL_REF:
12060 /* TLS references should always be enclosed in UNSPEC. */
12061 if (SYMBOL_REF_TLS_MODEL (op0))
12062 return false;
12063 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12064 && ix86_cmodel != CM_LARGE_PIC)
12065 return true;
12066 break;
12067
12068 default:
12069 break;
12070 }
12071 }
12072 if (GET_CODE (disp) != CONST)
12073 return false;
12074 disp = XEXP (disp, 0);
12075
12076 if (TARGET_64BIT)
12077 {
12078 /* It is unsafe to allow PLUS expressions here; that would exceed the
12079 allowed distance from the GOT. We should not need these anyway. */
12080 if (GET_CODE (disp) != UNSPEC
12081 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12082 && XINT (disp, 1) != UNSPEC_GOTOFF
12083 && XINT (disp, 1) != UNSPEC_PCREL
12084 && XINT (disp, 1) != UNSPEC_PLTOFF))
12085 return false;
12086
12087 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12088 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12089 return false;
12090 return true;
12091 }
12092
12093 saw_plus = false;
12094 if (GET_CODE (disp) == PLUS)
12095 {
12096 if (!CONST_INT_P (XEXP (disp, 1)))
12097 return false;
12098 disp = XEXP (disp, 0);
12099 saw_plus = true;
12100 }
12101
12102 if (TARGET_MACHO && darwin_local_data_pic (disp))
12103 return true;
12104
12105 if (GET_CODE (disp) != UNSPEC)
12106 return false;
12107
12108 switch (XINT (disp, 1))
12109 {
12110 case UNSPEC_GOT:
12111 if (saw_plus)
12112 return false;
12113 /* We need to check for both symbols and labels because VxWorks loads
12114 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12115 details. */
12116 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12117 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12118 case UNSPEC_GOTOFF:
12119 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12120 While the ABI also specifies a 32bit relocation, we don't produce it
12121 in the small PIC model at all. */
12122 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12123 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12124 && !TARGET_64BIT)
12125 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12126 return false;
12127 case UNSPEC_GOTTPOFF:
12128 case UNSPEC_GOTNTPOFF:
12129 case UNSPEC_INDNTPOFF:
12130 if (saw_plus)
12131 return false;
12132 disp = XVECEXP (disp, 0, 0);
12133 return (GET_CODE (disp) == SYMBOL_REF
12134 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12135 case UNSPEC_NTPOFF:
12136 disp = XVECEXP (disp, 0, 0);
12137 return (GET_CODE (disp) == SYMBOL_REF
12138 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12139 case UNSPEC_DTPOFF:
12140 disp = XVECEXP (disp, 0, 0);
12141 return (GET_CODE (disp) == SYMBOL_REF
12142 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12143 }
12144
12145 return false;
12146 }
12147
12148 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12149 replace the input X, or the original X if no replacement is called for.
12150 The output parameter *WIN is 1 if the calling macro should goto WIN,
12151 0 if it should not. */
12152
12153 bool
12154 ix86_legitimize_reload_address (rtx x,
12155 enum machine_mode mode ATTRIBUTE_UNUSED,
12156 int opnum, int type,
12157 int ind_levels ATTRIBUTE_UNUSED)
12158 {
12159 /* Reload can generate:
12160
12161 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12162 (reg:DI 97))
12163 (reg:DI 2 cx))
12164
12165 This RTX is rejected by ix86_legitimate_address_p because base
12166 register 97 (a pseudo) fails the strict check. Following this rejection,
12167 reload pushes all three components into separate registers,
12168 creating an invalid memory address RTX.
12169
12170 The code below reloads only the invalid part of the
12171 memory address RTX. */
12172
12173 if (GET_CODE (x) == PLUS
12174 && REG_P (XEXP (x, 1))
12175 && GET_CODE (XEXP (x, 0)) == PLUS
12176 && REG_P (XEXP (XEXP (x, 0), 1)))
12177 {
12178 rtx base, index;
12179 bool something_reloaded = false;
12180
12181 base = XEXP (XEXP (x, 0), 1);
12182 if (!REG_OK_FOR_BASE_STRICT_P (base))
12183 {
12184 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12185 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12186 opnum, (enum reload_type) type);
12187 something_reloaded = true;
12188 }
12189
12190 index = XEXP (x, 1);
12191 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12192 {
12193 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12194 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12195 opnum, (enum reload_type) type);
12196 something_reloaded = true;
12197 }
12198
12199 gcc_assert (something_reloaded);
12200 return true;
12201 }
12202
12203 return false;
12204 }
12205
12206 /* Recognizes RTL expressions that are valid memory addresses for an
12207 instruction. The MODE argument is the machine mode for the MEM
12208 expression that wants to use this address.
12209
12210 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12211 convert common non-canonical forms to canonical form so that they will
12212 be recognized. */
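/* Illustrative sketch only: the canonical addresses accepted below follow
   the x86 base + index*scale + disp shape, e.g.
     (plus (reg) (const_int 4))                 base + disp
     (plus (reg) (mult (reg) (const_int 4)))    base + index*4
   whereas (mult (reg) (const_int 3)) would be rejected because 3 is not
   one of the valid scale factors 1, 2, 4 or 8.  */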
12213
12214 static bool
12215 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12216 rtx addr, bool strict)
12217 {
12218 struct ix86_address parts;
12219 rtx base, index, disp;
12220 HOST_WIDE_INT scale;
12221
12222 if (ix86_decompose_address (addr, &parts) <= 0)
12223 /* Decomposition failed. */
12224 return false;
12225
12226 base = parts.base;
12227 index = parts.index;
12228 disp = parts.disp;
12229 scale = parts.scale;
12230
12231 /* Validate base register. */
12232 if (base)
12233 {
12234 rtx reg;
12235
12236 if (REG_P (base))
12237 reg = base;
12238 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12239 reg = SUBREG_REG (base);
12240 else
12241 /* Base is not a register. */
12242 return false;
12243
12244 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12245 return false;
12246
12247 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12248 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12249 /* Base is not valid. */
12250 return false;
12251 }
12252
12253 /* Validate index register. */
12254 if (index)
12255 {
12256 rtx reg;
12257
12258 if (REG_P (index))
12259 reg = index;
12260 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12261 reg = SUBREG_REG (index);
12262 else
12263 /* Index is not a register. */
12264 return false;
12265
12266 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12267 return false;
12268
12269 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12270 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12271 /* Index is not valid. */
12272 return false;
12273 }
12274
12275 /* Index and base should have the same mode. */
12276 if (base && index
12277 && GET_MODE (base) != GET_MODE (index))
12278 return false;
12279
12280 /* Validate scale factor. */
12281 if (scale != 1)
12282 {
12283 if (!index)
12284 /* Scale without index. */
12285 return false;
12286
12287 if (scale != 2 && scale != 4 && scale != 8)
12288 /* Scale is not a valid multiplier. */
12289 return false;
12290 }
12291
12292 /* Validate displacement. */
12293 if (disp)
12294 {
12295 if (GET_CODE (disp) == CONST
12296 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12297 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12298 switch (XINT (XEXP (disp, 0), 1))
12299 {
12300 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12301 used. While the ABI also specifies 32bit relocations, we don't produce
12302 them at all and use IP-relative addressing instead. */
12303 case UNSPEC_GOT:
12304 case UNSPEC_GOTOFF:
12305 gcc_assert (flag_pic);
12306 if (!TARGET_64BIT)
12307 goto is_legitimate_pic;
12308
12309 /* 64bit address unspec. */
12310 return false;
12311
12312 case UNSPEC_GOTPCREL:
12313 case UNSPEC_PCREL:
12314 gcc_assert (flag_pic);
12315 goto is_legitimate_pic;
12316
12317 case UNSPEC_GOTTPOFF:
12318 case UNSPEC_GOTNTPOFF:
12319 case UNSPEC_INDNTPOFF:
12320 case UNSPEC_NTPOFF:
12321 case UNSPEC_DTPOFF:
12322 break;
12323
12324 case UNSPEC_STACK_CHECK:
12325 gcc_assert (flag_split_stack);
12326 break;
12327
12328 default:
12329 /* Invalid address unspec. */
12330 return false;
12331 }
12332
12333 else if (SYMBOLIC_CONST (disp)
12334 && (flag_pic
12335 || (TARGET_MACHO
12336 #if TARGET_MACHO
12337 && MACHOPIC_INDIRECT
12338 && !machopic_operand_p (disp)
12339 #endif
12340 )))
12341 {
12342
12343 is_legitimate_pic:
12344 if (TARGET_64BIT && (index || base))
12345 {
12346 /* foo@dtpoff(%rX) is ok. */
12347 if (GET_CODE (disp) != CONST
12348 || GET_CODE (XEXP (disp, 0)) != PLUS
12349 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12350 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12351 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12352 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12353 /* Non-constant pic memory reference. */
12354 return false;
12355 }
12356 else if ((!TARGET_MACHO || flag_pic)
12357 && ! legitimate_pic_address_disp_p (disp))
12358 /* Displacement is an invalid pic construct. */
12359 return false;
12360 #if TARGET_MACHO
12361 else if (MACHO_DYNAMIC_NO_PIC_P
12362 && !ix86_legitimate_constant_p (Pmode, disp))
12363 /* Displacement must be referenced via a non-lazy pointer. */
12364 return false;
12365 #endif
12366
12367 /* This code used to verify that a symbolic pic displacement
12368 includes the pic_offset_table_rtx register.
12369
12370 While this is a good idea, unfortunately these constructs may
12371 be created by the "adds using lea" optimization for incorrect
12372 code like:
12373
12374 int a;
12375 int foo(int i)
12376 {
12377 return *(&a+i);
12378 }
12379
12380 This code is nonsensical, but results in addressing the
12381 GOT table with pic_offset_table_rtx as the base. We can't
12382 just refuse it easily, since it gets matched by the
12383 "addsi3" pattern, which is later split into an lea when the
12384 output register differs from the input. While this
12385 could be handled by a separate addsi pattern for this case
12386 that never results in an lea, disabling this test seems to be
12387 the easier and correct fix for the crash. */
12388 }
12389 else if (GET_CODE (disp) != LABEL_REF
12390 && !CONST_INT_P (disp)
12391 && (GET_CODE (disp) != CONST
12392 || !ix86_legitimate_constant_p (Pmode, disp))
12393 && (GET_CODE (disp) != SYMBOL_REF
12394 || !ix86_legitimate_constant_p (Pmode, disp)))
12395 /* Displacement is not constant. */
12396 return false;
12397 else if (TARGET_64BIT
12398 && !x86_64_immediate_operand (disp, VOIDmode))
12399 /* Displacement is out of range. */
12400 return false;
12401 }
12402
12403 /* Everything looks valid. */
12404 return true;
12405 }
12406
12407 /* Determine if a given RTX is a valid constant address. */
12408
12409 bool
12410 constant_address_p (rtx x)
12411 {
12412 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12413 }
12414 \f
12415 /* Return a unique alias set for the GOT. */
12416
12417 static alias_set_type
12418 ix86_GOT_alias_set (void)
12419 {
12420 static alias_set_type set = -1;
12421 if (set == -1)
12422 set = new_alias_set ();
12423 return set;
12424 }
12425
12426 /* Return a legitimate reference for ORIG (an address) using the
12427 register REG. If REG is 0, a new pseudo is generated.
12428
12429 There are two types of references that must be handled:
12430
12431 1. Global data references must load the address from the GOT, via
12432 the PIC reg. An insn is emitted to do this load, and the reg is
12433 returned.
12434
12435 2. Static data references, constant pool addresses, and code labels
12436 compute the address as an offset from the GOT, whose base is in
12437 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12438 differentiate them from global data objects. The returned
12439 address is the PIC reg + an unspec constant.
12440
12441 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12442 reg also appears in the address. */
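/* A hedged example (32-bit ELF, small PIC model assumed): a global symbol
   is typically reached through the GOT, roughly
     movl  foo@GOT(%ebx), %eax      # case 1: load the address from the GOT
   while a local/static symbol is addressed relative to the GOT base,
     leal  bar@GOTOFF(%ebx), %eax   # case 2: PIC reg + unspec constant
   The exact sequences depend on the target, ABI and code model.  */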
12443
12444 static rtx
12445 legitimize_pic_address (rtx orig, rtx reg)
12446 {
12447 rtx addr = orig;
12448 rtx new_rtx = orig;
12449
12450 #if TARGET_MACHO
12451 if (TARGET_MACHO && !TARGET_64BIT)
12452 {
12453 if (reg == 0)
12454 reg = gen_reg_rtx (Pmode);
12455 /* Use the generic Mach-O PIC machinery. */
12456 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12457 }
12458 #endif
12459
12460 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12461 new_rtx = addr;
12462 else if (TARGET_64BIT
12463 && ix86_cmodel != CM_SMALL_PIC
12464 && gotoff_operand (addr, Pmode))
12465 {
12466 rtx tmpreg;
12467 /* This symbol may be referenced via a displacement from the PIC
12468 base address (@GOTOFF). */
12469
12470 if (reload_in_progress)
12471 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12472 if (GET_CODE (addr) == CONST)
12473 addr = XEXP (addr, 0);
12474 if (GET_CODE (addr) == PLUS)
12475 {
12476 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12477 UNSPEC_GOTOFF);
12478 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12479 }
12480 else
12481 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12482 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12483 if (!reg)
12484 tmpreg = gen_reg_rtx (Pmode);
12485 else
12486 tmpreg = reg;
12487 emit_move_insn (tmpreg, new_rtx);
12488
12489 if (reg != 0)
12490 {
12491 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12492 tmpreg, 1, OPTAB_DIRECT);
12493 new_rtx = reg;
12494 }
12495 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12496 }
12497 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12498 {
12499 /* This symbol may be referenced via a displacement from the PIC
12500 base address (@GOTOFF). */
12501
12502 if (reload_in_progress)
12503 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12504 if (GET_CODE (addr) == CONST)
12505 addr = XEXP (addr, 0);
12506 if (GET_CODE (addr) == PLUS)
12507 {
12508 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12509 UNSPEC_GOTOFF);
12510 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12511 }
12512 else
12513 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12514 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12515 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12516
12517 if (reg != 0)
12518 {
12519 emit_move_insn (reg, new_rtx);
12520 new_rtx = reg;
12521 }
12522 }
12523 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12524 /* We can't use @GOTOFF for text labels on VxWorks;
12525 see gotoff_operand. */
12526 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12527 {
12528 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12529 {
12530 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12531 return legitimize_dllimport_symbol (addr, true);
12532 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12533 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12534 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12535 {
12536 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12537 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12538 }
12539 }
12540
12541 /* For x64 PE-COFF there is no GOT table, so we use the address
12542 directly. */
12543 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12544 {
12545 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12546 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12547
12548 if (reg == 0)
12549 reg = gen_reg_rtx (Pmode);
12550 emit_move_insn (reg, new_rtx);
12551 new_rtx = reg;
12552 }
12553 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12554 {
12555 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12556 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12557 new_rtx = gen_const_mem (Pmode, new_rtx);
12558 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12559
12560 if (reg == 0)
12561 reg = gen_reg_rtx (Pmode);
12562 /* Use gen_movsi directly, otherwise the address is loaded
12563 into a register for CSE. We don't want to CSE these addresses;
12564 instead we CSE addresses loaded from the GOT table, so skip this. */
12565 emit_insn (gen_movsi (reg, new_rtx));
12566 new_rtx = reg;
12567 }
12568 else
12569 {
12570 /* This symbol must be referenced via a load from the
12571 Global Offset Table (@GOT). */
12572
12573 if (reload_in_progress)
12574 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12575 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12576 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12577 if (TARGET_64BIT)
12578 new_rtx = force_reg (Pmode, new_rtx);
12579 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12580 new_rtx = gen_const_mem (Pmode, new_rtx);
12581 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12582
12583 if (reg == 0)
12584 reg = gen_reg_rtx (Pmode);
12585 emit_move_insn (reg, new_rtx);
12586 new_rtx = reg;
12587 }
12588 }
12589 else
12590 {
12591 if (CONST_INT_P (addr)
12592 && !x86_64_immediate_operand (addr, VOIDmode))
12593 {
12594 if (reg)
12595 {
12596 emit_move_insn (reg, addr);
12597 new_rtx = reg;
12598 }
12599 else
12600 new_rtx = force_reg (Pmode, addr);
12601 }
12602 else if (GET_CODE (addr) == CONST)
12603 {
12604 addr = XEXP (addr, 0);
12605
12606 /* We must match what we generated before. Assume the only
12607 unspecs that can get here are ours; not that we could do
12608 anything with them anyway.... */
12609 if (GET_CODE (addr) == UNSPEC
12610 || (GET_CODE (addr) == PLUS
12611 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12612 return orig;
12613 gcc_assert (GET_CODE (addr) == PLUS);
12614 }
12615 if (GET_CODE (addr) == PLUS)
12616 {
12617 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12618
12619 /* Check first to see if this is a constant offset from a @GOTOFF
12620 symbol reference. */
12621 if (gotoff_operand (op0, Pmode)
12622 && CONST_INT_P (op1))
12623 {
12624 if (!TARGET_64BIT)
12625 {
12626 if (reload_in_progress)
12627 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12628 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12629 UNSPEC_GOTOFF);
12630 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12631 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12632 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12633
12634 if (reg != 0)
12635 {
12636 emit_move_insn (reg, new_rtx);
12637 new_rtx = reg;
12638 }
12639 }
12640 else
12641 {
12642 if (INTVAL (op1) < -16*1024*1024
12643 || INTVAL (op1) >= 16*1024*1024)
12644 {
12645 if (!x86_64_immediate_operand (op1, Pmode))
12646 op1 = force_reg (Pmode, op1);
12647 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12648 }
12649 }
12650 }
12651 else
12652 {
12653 rtx base = legitimize_pic_address (op0, reg);
12654 enum machine_mode mode = GET_MODE (base);
12655 new_rtx
12656 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12657
12658 if (CONST_INT_P (new_rtx))
12659 {
12660 if (INTVAL (new_rtx) < -16*1024*1024
12661 || INTVAL (new_rtx) >= 16*1024*1024)
12662 {
12663 if (!x86_64_immediate_operand (new_rtx, mode))
12664 new_rtx = force_reg (mode, new_rtx);
12665 new_rtx
12666 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12667 }
12668 else
12669 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12670 }
12671 else
12672 {
12673 if (GET_CODE (new_rtx) == PLUS
12674 && CONSTANT_P (XEXP (new_rtx, 1)))
12675 {
12676 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12677 new_rtx = XEXP (new_rtx, 1);
12678 }
12679 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12680 }
12681 }
12682 }
12683 }
12684 return new_rtx;
12685 }
12686 \f
12687 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12688
12689 static rtx
12690 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12691 {
12692 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12693
12694 if (GET_MODE (tp) != tp_mode)
12695 {
12696 gcc_assert (GET_MODE (tp) == SImode);
12697 gcc_assert (tp_mode == DImode);
12698
12699 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12700 }
12701
12702 if (to_reg)
12703 tp = copy_to_mode_reg (tp_mode, tp);
12704
12705 return tp;
12706 }
12707
12708 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12709
12710 static GTY(()) rtx ix86_tls_symbol;
12711
12712 static rtx
12713 ix86_tls_get_addr (void)
12714 {
12715 if (!ix86_tls_symbol)
12716 {
12717 const char *sym
12718 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12719 ? "___tls_get_addr" : "__tls_get_addr");
12720
12721 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12722 }
12723
12724 return ix86_tls_symbol;
12725 }
12726
12727 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12728
12729 static GTY(()) rtx ix86_tls_module_base_symbol;
12730
12731 rtx
12732 ix86_tls_module_base (void)
12733 {
12734 if (!ix86_tls_module_base_symbol)
12735 {
12736 ix86_tls_module_base_symbol
12737 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12738
12739 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12740 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12741 }
12742
12743 return ix86_tls_module_base_symbol;
12744 }
12745
12746 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12747 false if we expect this to be used for a memory address and true if
12748 we expect to load the address into a register. */
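/* Rough, non-authoritative illustration of what the models expand to on
   32-bit GNU/Linux (details vary with TARGET_GNU2_TLS, 64-bit mode and
   the PIC setting):
     global-dynamic:  leal x@tlsgd(,%ebx,1), %eax; call ___tls_get_addr
     local-dynamic:   leal x@tlsldm(%ebx), %eax;   call ___tls_get_addr
     initial-exec:    movl x@gotntpoff(%ebx), %eax; access %gs:(%eax)
     local-exec:      movl %gs:0, %eax;            access x@ntpoff(%eax)  */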
12749
12750 static rtx
12751 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12752 {
12753 rtx dest, base, off;
12754 rtx pic = NULL_RTX, tp = NULL_RTX;
12755 enum machine_mode tp_mode = Pmode;
12756 int type;
12757
12758 switch (model)
12759 {
12760 case TLS_MODEL_GLOBAL_DYNAMIC:
12761 dest = gen_reg_rtx (Pmode);
12762
12763 if (!TARGET_64BIT)
12764 {
12765 if (flag_pic)
12766 pic = pic_offset_table_rtx;
12767 else
12768 {
12769 pic = gen_reg_rtx (Pmode);
12770 emit_insn (gen_set_got (pic));
12771 }
12772 }
12773
12774 if (TARGET_GNU2_TLS)
12775 {
12776 if (TARGET_64BIT)
12777 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12778 else
12779 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12780
12781 tp = get_thread_pointer (Pmode, true);
12782 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12783
12784 if (GET_MODE (x) != Pmode)
12785 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12786
12787 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12788 }
12789 else
12790 {
12791 rtx caddr = ix86_tls_get_addr ();
12792
12793 if (TARGET_64BIT)
12794 {
12795 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12796 rtx insns;
12797
12798 start_sequence ();
12799 emit_call_insn
12800 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12801 insns = get_insns ();
12802 end_sequence ();
12803
12804 if (GET_MODE (x) != Pmode)
12805 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12806
12807 RTL_CONST_CALL_P (insns) = 1;
12808 emit_libcall_block (insns, dest, rax, x);
12809 }
12810 else
12811 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12812 }
12813 break;
12814
12815 case TLS_MODEL_LOCAL_DYNAMIC:
12816 base = gen_reg_rtx (Pmode);
12817
12818 if (!TARGET_64BIT)
12819 {
12820 if (flag_pic)
12821 pic = pic_offset_table_rtx;
12822 else
12823 {
12824 pic = gen_reg_rtx (Pmode);
12825 emit_insn (gen_set_got (pic));
12826 }
12827 }
12828
12829 if (TARGET_GNU2_TLS)
12830 {
12831 rtx tmp = ix86_tls_module_base ();
12832
12833 if (TARGET_64BIT)
12834 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12835 else
12836 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12837
12838 tp = get_thread_pointer (Pmode, true);
12839 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12840 gen_rtx_MINUS (Pmode, tmp, tp));
12841 }
12842 else
12843 {
12844 rtx caddr = ix86_tls_get_addr ();
12845
12846 if (TARGET_64BIT)
12847 {
12848 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12849 rtx insns, eqv;
12850
12851 start_sequence ();
12852 emit_call_insn
12853 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12854 insns = get_insns ();
12855 end_sequence ();
12856
12857 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12858 share the LD_BASE result with other LD model accesses. */
12859 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12860 UNSPEC_TLS_LD_BASE);
12861
12862 RTL_CONST_CALL_P (insns) = 1;
12863 emit_libcall_block (insns, base, rax, eqv);
12864 }
12865 else
12866 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12867 }
12868
12869 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12870 off = gen_rtx_CONST (Pmode, off);
12871
12872 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12873
12874 if (TARGET_GNU2_TLS)
12875 {
12876 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12877
12878 if (GET_MODE (x) != Pmode)
12879 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12880
12881 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12882 }
12883 break;
12884
12885 case TLS_MODEL_INITIAL_EXEC:
12886 if (TARGET_64BIT)
12887 {
12888 if (TARGET_SUN_TLS && !TARGET_X32)
12889 {
12890 /* The Sun linker took the AMD64 TLS spec literally
12891 and can only handle %rax as the destination of the
12892 initial-exec code sequence. */
12893
12894 dest = gen_reg_rtx (DImode);
12895 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12896 return dest;
12897 }
12898
12899 /* Generate DImode references to avoid %fs:(%reg32)
12900 problems and a linker IE->LE relaxation bug. */
12901 tp_mode = DImode;
12902 pic = NULL;
12903 type = UNSPEC_GOTNTPOFF;
12904 }
12905 else if (flag_pic)
12906 {
12907 if (reload_in_progress)
12908 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12909 pic = pic_offset_table_rtx;
12910 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12911 }
12912 else if (!TARGET_ANY_GNU_TLS)
12913 {
12914 pic = gen_reg_rtx (Pmode);
12915 emit_insn (gen_set_got (pic));
12916 type = UNSPEC_GOTTPOFF;
12917 }
12918 else
12919 {
12920 pic = NULL;
12921 type = UNSPEC_INDNTPOFF;
12922 }
12923
12924 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12925 off = gen_rtx_CONST (tp_mode, off);
12926 if (pic)
12927 off = gen_rtx_PLUS (tp_mode, pic, off);
12928 off = gen_const_mem (tp_mode, off);
12929 set_mem_alias_set (off, ix86_GOT_alias_set ());
12930
12931 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12932 {
12933 base = get_thread_pointer (tp_mode,
12934 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12935 off = force_reg (tp_mode, off);
12936 return gen_rtx_PLUS (tp_mode, base, off);
12937 }
12938 else
12939 {
12940 base = get_thread_pointer (Pmode, true);
12941 dest = gen_reg_rtx (Pmode);
12942 emit_insn (ix86_gen_sub3 (dest, base, off));
12943 }
12944 break;
12945
12946 case TLS_MODEL_LOCAL_EXEC:
12947 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12948 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12949 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12950 off = gen_rtx_CONST (Pmode, off);
12951
12952 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12953 {
12954 base = get_thread_pointer (Pmode,
12955 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12956 return gen_rtx_PLUS (Pmode, base, off);
12957 }
12958 else
12959 {
12960 base = get_thread_pointer (Pmode, true);
12961 dest = gen_reg_rtx (Pmode);
12962 emit_insn (ix86_gen_sub3 (dest, base, off));
12963 }
12964 break;
12965
12966 default:
12967 gcc_unreachable ();
12968 }
12969
12970 return dest;
12971 }
12972
12973 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12974 to symbol DECL. */
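/* Hedged example: for a dllimport'ed variable "foo" this builds an
   artificial decl whose RTL is a memory reference to "__imp__foo" (or
   "__imp_foo" when no user label prefix applies, see the prefix choice
   below), so uses of foo become indirect loads through the import
   address table slot.  */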
12975
12976 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12977 htab_t dllimport_map;
12978
12979 static tree
12980 get_dllimport_decl (tree decl)
12981 {
12982 struct tree_map *h, in;
12983 void **loc;
12984 const char *name;
12985 const char *prefix;
12986 size_t namelen, prefixlen;
12987 char *imp_name;
12988 tree to;
12989 rtx rtl;
12990
12991 if (!dllimport_map)
12992 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12993
12994 in.hash = htab_hash_pointer (decl);
12995 in.base.from = decl;
12996 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12997 h = (struct tree_map *) *loc;
12998 if (h)
12999 return h->to;
13000
13001 *loc = h = ggc_alloc_tree_map ();
13002 h->hash = in.hash;
13003 h->base.from = decl;
13004 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13005 VAR_DECL, NULL, ptr_type_node);
13006 DECL_ARTIFICIAL (to) = 1;
13007 DECL_IGNORED_P (to) = 1;
13008 DECL_EXTERNAL (to) = 1;
13009 TREE_READONLY (to) = 1;
13010
13011 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13012 name = targetm.strip_name_encoding (name);
13013 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13014 ? "*__imp_" : "*__imp__";
13015 namelen = strlen (name);
13016 prefixlen = strlen (prefix);
13017 imp_name = (char *) alloca (namelen + prefixlen + 1);
13018 memcpy (imp_name, prefix, prefixlen);
13019 memcpy (imp_name + prefixlen, name, namelen + 1);
13020
13021 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13022 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13023 SET_SYMBOL_REF_DECL (rtl, to);
13024 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13025
13026 rtl = gen_const_mem (Pmode, rtl);
13027 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13028
13029 SET_DECL_RTL (to, rtl);
13030 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13031
13032 return to;
13033 }
13034
13035 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13036 true if we require the result be a register. */
13037
13038 static rtx
13039 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13040 {
13041 tree imp_decl;
13042 rtx x;
13043
13044 gcc_assert (SYMBOL_REF_DECL (symbol));
13045 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13046
13047 x = DECL_RTL (imp_decl);
13048 if (want_reg)
13049 x = force_reg (Pmode, x);
13050 return x;
13051 }
13052
13053 /* Try machine-dependent ways of modifying an illegitimate address
13054 to be legitimate. If we find one, return the new, valid address.
13055 This macro is used in only one place: `memory_address' in explow.c.
13056
13057 OLDX is the address as it was before break_out_memory_refs was called.
13058 In some cases it is useful to look at this to decide what needs to be done.
13059
13060 It is always safe for this macro to do nothing. It exists to recognize
13061 opportunities to optimize the output.
13062
13063 For the 80386, we handle X+REG by loading X into a register R and
13064 using R+REG. R will go in a general reg and indexing will be used.
13065 However, if REG is a broken-out memory address or multiplication,
13066 nothing needs to be done because REG can certainly go in a general reg.
13067
13068 When -fpic is used, special handling is needed for symbolic references.
13069 See comments by legitimize_pic_address in i386.c for details. */
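/* Illustrative sketch of the canonicalization done below: e.g.
     (plus (ashift (reg) (const_int 2)) (reg))
   is rewritten as
     (plus (mult (reg) (const_int 4)) (reg))
   so the result matches the base + index*scale forms that
   ix86_legitimate_address_p accepts.  */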
13070
13071 static rtx
13072 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13073 enum machine_mode mode)
13074 {
13075 int changed = 0;
13076 unsigned log;
13077
13078 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13079 if (log)
13080 return legitimize_tls_address (x, (enum tls_model) log, false);
13081 if (GET_CODE (x) == CONST
13082 && GET_CODE (XEXP (x, 0)) == PLUS
13083 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13084 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13085 {
13086 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13087 (enum tls_model) log, false);
13088 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13089 }
13090
13091 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13092 {
13093 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13094 return legitimize_dllimport_symbol (x, true);
13095 if (GET_CODE (x) == CONST
13096 && GET_CODE (XEXP (x, 0)) == PLUS
13097 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13098 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13099 {
13100 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13101 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13102 }
13103 }
13104
13105 if (flag_pic && SYMBOLIC_CONST (x))
13106 return legitimize_pic_address (x, 0);
13107
13108 #if TARGET_MACHO
13109 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13110 return machopic_indirect_data_reference (x, 0);
13111 #endif
13112
13113 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13114 if (GET_CODE (x) == ASHIFT
13115 && CONST_INT_P (XEXP (x, 1))
13116 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13117 {
13118 changed = 1;
13119 log = INTVAL (XEXP (x, 1));
13120 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13121 GEN_INT (1 << log));
13122 }
13123
13124 if (GET_CODE (x) == PLUS)
13125 {
13126 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13127
13128 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13129 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13130 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13131 {
13132 changed = 1;
13133 log = INTVAL (XEXP (XEXP (x, 0), 1));
13134 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13135 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13136 GEN_INT (1 << log));
13137 }
13138
13139 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13140 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13141 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13142 {
13143 changed = 1;
13144 log = INTVAL (XEXP (XEXP (x, 1), 1));
13145 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13146 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13147 GEN_INT (1 << log));
13148 }
13149
13150 /* Put multiply first if it isn't already. */
13151 if (GET_CODE (XEXP (x, 1)) == MULT)
13152 {
13153 rtx tmp = XEXP (x, 0);
13154 XEXP (x, 0) = XEXP (x, 1);
13155 XEXP (x, 1) = tmp;
13156 changed = 1;
13157 }
13158
13159 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13160 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13161 created by virtual register instantiation, register elimination, and
13162 similar optimizations. */
13163 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13164 {
13165 changed = 1;
13166 x = gen_rtx_PLUS (Pmode,
13167 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13168 XEXP (XEXP (x, 1), 0)),
13169 XEXP (XEXP (x, 1), 1));
13170 }
13171
13172 /* Canonicalize
13173 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13174 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13175 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13176 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13177 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13178 && CONSTANT_P (XEXP (x, 1)))
13179 {
13180 rtx constant;
13181 rtx other = NULL_RTX;
13182
13183 if (CONST_INT_P (XEXP (x, 1)))
13184 {
13185 constant = XEXP (x, 1);
13186 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13187 }
13188 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13189 {
13190 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13191 other = XEXP (x, 1);
13192 }
13193 else
13194 constant = 0;
13195
13196 if (constant)
13197 {
13198 changed = 1;
13199 x = gen_rtx_PLUS (Pmode,
13200 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13201 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13202 plus_constant (Pmode, other,
13203 INTVAL (constant)));
13204 }
13205 }
13206
13207 if (changed && ix86_legitimate_address_p (mode, x, false))
13208 return x;
13209
13210 if (GET_CODE (XEXP (x, 0)) == MULT)
13211 {
13212 changed = 1;
13213 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13214 }
13215
13216 if (GET_CODE (XEXP (x, 1)) == MULT)
13217 {
13218 changed = 1;
13219 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13220 }
13221
13222 if (changed
13223 && REG_P (XEXP (x, 1))
13224 && REG_P (XEXP (x, 0)))
13225 return x;
13226
13227 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13228 {
13229 changed = 1;
13230 x = legitimize_pic_address (x, 0);
13231 }
13232
13233 if (changed && ix86_legitimate_address_p (mode, x, false))
13234 return x;
13235
13236 if (REG_P (XEXP (x, 0)))
13237 {
13238 rtx temp = gen_reg_rtx (Pmode);
13239 rtx val = force_operand (XEXP (x, 1), temp);
13240 if (val != temp)
13241 {
13242 val = convert_to_mode (Pmode, val, 1);
13243 emit_move_insn (temp, val);
13244 }
13245
13246 XEXP (x, 1) = temp;
13247 return x;
13248 }
13249
13250 else if (REG_P (XEXP (x, 1)))
13251 {
13252 rtx temp = gen_reg_rtx (Pmode);
13253 rtx val = force_operand (XEXP (x, 0), temp);
13254 if (val != temp)
13255 {
13256 val = convert_to_mode (Pmode, val, 1);
13257 emit_move_insn (temp, val);
13258 }
13259
13260 XEXP (x, 0) = temp;
13261 return x;
13262 }
13263 }
13264
13265 return x;
13266 }
13267 \f
13268 /* Print an integer constant expression in assembler syntax. Addition
13269 and subtraction are the only arithmetic that may appear in these
13270 expressions. FILE is the stdio stream to write to, X is the rtx, and
13271 CODE is the operand print code from the output string. */
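/* Hedged example of the output: for
     (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF))
   this prints "foo@GOTOFF", and a 64-bit GOT reference is printed as
   "foo@GOTPCREL(%rip)" in AT&T syntax (see the UNSPEC cases below).  */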
13272
13273 static void
13274 output_pic_addr_const (FILE *file, rtx x, int code)
13275 {
13276 char buf[256];
13277
13278 switch (GET_CODE (x))
13279 {
13280 case PC:
13281 gcc_assert (flag_pic);
13282 putc ('.', file);
13283 break;
13284
13285 case SYMBOL_REF:
13286 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13287 output_addr_const (file, x);
13288 else
13289 {
13290 const char *name = XSTR (x, 0);
13291
13292 /* Mark the decl as referenced so that cgraph will
13293 output the function. */
13294 if (SYMBOL_REF_DECL (x))
13295 mark_decl_referenced (SYMBOL_REF_DECL (x));
13296
13297 #if TARGET_MACHO
13298 if (MACHOPIC_INDIRECT
13299 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13300 name = machopic_indirection_name (x, /*stub_p=*/true);
13301 #endif
13302 assemble_name (file, name);
13303 }
13304 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13305 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13306 fputs ("@PLT", file);
13307 break;
13308
13309 case LABEL_REF:
13310 x = XEXP (x, 0);
13311 /* FALLTHRU */
13312 case CODE_LABEL:
13313 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13314 assemble_name (asm_out_file, buf);
13315 break;
13316
13317 case CONST_INT:
13318 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13319 break;
13320
13321 case CONST:
13322 /* This used to output parentheses around the expression,
13323 but that does not work on the 386 (either ATT or BSD assembler). */
13324 output_pic_addr_const (file, XEXP (x, 0), code);
13325 break;
13326
13327 case CONST_DOUBLE:
13328 if (GET_MODE (x) == VOIDmode)
13329 {
13330 /* We can use %d if the number is <32 bits and positive. */
13331 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13332 fprintf (file, "0x%lx%08lx",
13333 (unsigned long) CONST_DOUBLE_HIGH (x),
13334 (unsigned long) CONST_DOUBLE_LOW (x));
13335 else
13336 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13337 }
13338 else
13339 /* We can't handle floating point constants;
13340 TARGET_PRINT_OPERAND must handle them. */
13341 output_operand_lossage ("floating constant misused");
13342 break;
13343
13344 case PLUS:
13345 /* Some assemblers need integer constants to appear first. */
13346 if (CONST_INT_P (XEXP (x, 0)))
13347 {
13348 output_pic_addr_const (file, XEXP (x, 0), code);
13349 putc ('+', file);
13350 output_pic_addr_const (file, XEXP (x, 1), code);
13351 }
13352 else
13353 {
13354 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13355 output_pic_addr_const (file, XEXP (x, 1), code);
13356 putc ('+', file);
13357 output_pic_addr_const (file, XEXP (x, 0), code);
13358 }
13359 break;
13360
13361 case MINUS:
13362 if (!TARGET_MACHO)
13363 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13364 output_pic_addr_const (file, XEXP (x, 0), code);
13365 putc ('-', file);
13366 output_pic_addr_const (file, XEXP (x, 1), code);
13367 if (!TARGET_MACHO)
13368 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13369 break;
13370
13371 case UNSPEC:
13372 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13373 {
13374 bool f = i386_asm_output_addr_const_extra (file, x);
13375 gcc_assert (f);
13376 break;
13377 }
13378
13379 gcc_assert (XVECLEN (x, 0) == 1);
13380 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13381 switch (XINT (x, 1))
13382 {
13383 case UNSPEC_GOT:
13384 fputs ("@GOT", file);
13385 break;
13386 case UNSPEC_GOTOFF:
13387 fputs ("@GOTOFF", file);
13388 break;
13389 case UNSPEC_PLTOFF:
13390 fputs ("@PLTOFF", file);
13391 break;
13392 case UNSPEC_PCREL:
13393 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13394 "(%rip)" : "[rip]", file);
13395 break;
13396 case UNSPEC_GOTPCREL:
13397 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13398 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13399 break;
13400 case UNSPEC_GOTTPOFF:
13401 /* FIXME: This might be @TPOFF in Sun ld too. */
13402 fputs ("@gottpoff", file);
13403 break;
13404 case UNSPEC_TPOFF:
13405 fputs ("@tpoff", file);
13406 break;
13407 case UNSPEC_NTPOFF:
13408 if (TARGET_64BIT)
13409 fputs ("@tpoff", file);
13410 else
13411 fputs ("@ntpoff", file);
13412 break;
13413 case UNSPEC_DTPOFF:
13414 fputs ("@dtpoff", file);
13415 break;
13416 case UNSPEC_GOTNTPOFF:
13417 if (TARGET_64BIT)
13418 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13419 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13420 else
13421 fputs ("@gotntpoff", file);
13422 break;
13423 case UNSPEC_INDNTPOFF:
13424 fputs ("@indntpoff", file);
13425 break;
13426 #if TARGET_MACHO
13427 case UNSPEC_MACHOPIC_OFFSET:
13428 putc ('-', file);
13429 machopic_output_function_base_name (file);
13430 break;
13431 #endif
13432 default:
13433 output_operand_lossage ("invalid UNSPEC as operand");
13434 break;
13435 }
13436 break;
13437
13438 default:
13439 output_operand_lossage ("invalid expression as operand");
13440 }
13441 }
13442
13443 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13444 We need to emit DTP-relative relocations. */
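/* Hedged illustration: assuming ASM_LONG expands to ".long", a 4-byte
   entry comes out as roughly ".long x@dtpoff" and an 8-byte entry as
   ".long x@dtpoff, 0".  */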
13445
13446 static void ATTRIBUTE_UNUSED
13447 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13448 {
13449 fputs (ASM_LONG, file);
13450 output_addr_const (file, x);
13451 fputs ("@dtpoff", file);
13452 switch (size)
13453 {
13454 case 4:
13455 break;
13456 case 8:
13457 fputs (", 0", file);
13458 break;
13459 default:
13460 gcc_unreachable ();
13461 }
13462 }
13463
13464 /* Return true if X is a representation of the PIC register. This copes
13465 with calls from ix86_find_base_term, where the register might have
13466 been replaced by a cselib value. */
13467
13468 static bool
13469 ix86_pic_register_p (rtx x)
13470 {
13471 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13472 return (pic_offset_table_rtx
13473 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13474 else
13475 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13476 }
13477
13478 /* Helper function for ix86_delegitimize_address.
13479 Attempt to delegitimize TLS local-exec accesses. */
13480
13481 static rtx
13482 ix86_delegitimize_tls_address (rtx orig_x)
13483 {
13484 rtx x = orig_x, unspec;
13485 struct ix86_address addr;
13486
13487 if (!TARGET_TLS_DIRECT_SEG_REFS)
13488 return orig_x;
13489 if (MEM_P (x))
13490 x = XEXP (x, 0);
13491 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13492 return orig_x;
13493 if (ix86_decompose_address (x, &addr) == 0
13494 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13495 || addr.disp == NULL_RTX
13496 || GET_CODE (addr.disp) != CONST)
13497 return orig_x;
13498 unspec = XEXP (addr.disp, 0);
13499 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13500 unspec = XEXP (unspec, 0);
13501 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13502 return orig_x;
13503 x = XVECEXP (unspec, 0, 0);
13504 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13505 if (unspec != XEXP (addr.disp, 0))
13506 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13507 if (addr.index)
13508 {
13509 rtx idx = addr.index;
13510 if (addr.scale != 1)
13511 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13512 x = gen_rtx_PLUS (Pmode, idx, x);
13513 }
13514 if (addr.base)
13515 x = gen_rtx_PLUS (Pmode, addr.base, x);
13516 if (MEM_P (orig_x))
13517 x = replace_equiv_address_nv (orig_x, x);
13518 return x;
13519 }
13520
13521 /* In the name of slightly smaller debug output, and to cater to
13522 general assembler lossage, recognize PIC+GOTOFF and turn it back
13523 into a direct symbol reference.
13524
13525 On Darwin, this is necessary to avoid a crash, because Darwin
13526 has a different PIC label for each routine but the DWARF debugging
13527 information is not associated with any particular routine, so it's
13528 necessary to remove references to the PIC label from RTL stored by
13529 the DWARF output code. */
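/* Sketch of the 32-bit transformation (illustrative only): an address like
     (plus (reg:SI ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is turned back into a plain (symbol_ref "foo"), re-attaching any
   constant or register addend that was combined with it.  */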
13530
13531 static rtx
13532 ix86_delegitimize_address (rtx x)
13533 {
13534 rtx orig_x = delegitimize_mem_from_attrs (x);
13535 /* addend is NULL or some rtx if x is something+GOTOFF where
13536 something doesn't include the PIC register. */
13537 rtx addend = NULL_RTX;
13538 /* reg_addend is NULL or a multiple of some register. */
13539 rtx reg_addend = NULL_RTX;
13540 /* const_addend is NULL or a const_int. */
13541 rtx const_addend = NULL_RTX;
13542 /* This is the result, or NULL. */
13543 rtx result = NULL_RTX;
13544
13545 x = orig_x;
13546
13547 if (MEM_P (x))
13548 x = XEXP (x, 0);
13549
13550 if (TARGET_64BIT)
13551 {
13552 if (GET_CODE (x) == CONST
13553 && GET_CODE (XEXP (x, 0)) == PLUS
13554 && GET_MODE (XEXP (x, 0)) == Pmode
13555 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13556 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13557 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13558 {
13559 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13560 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13561 if (MEM_P (orig_x))
13562 x = replace_equiv_address_nv (orig_x, x);
13563 return x;
13564 }
13565 if (GET_CODE (x) != CONST
13566 || GET_CODE (XEXP (x, 0)) != UNSPEC
13567 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13568 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13569 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13570 return ix86_delegitimize_tls_address (orig_x);
13571 x = XVECEXP (XEXP (x, 0), 0, 0);
13572 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13573 {
13574 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13575 GET_MODE (x), 0);
13576 if (x == NULL_RTX)
13577 return orig_x;
13578 }
13579 return x;
13580 }
13581
13582 if (GET_CODE (x) != PLUS
13583 || GET_CODE (XEXP (x, 1)) != CONST)
13584 return ix86_delegitimize_tls_address (orig_x);
13585
13586 if (ix86_pic_register_p (XEXP (x, 0)))
13587 /* %ebx + GOT/GOTOFF */
13588 ;
13589 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13590 {
13591 /* %ebx + %reg * scale + GOT/GOTOFF */
13592 reg_addend = XEXP (x, 0);
13593 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13594 reg_addend = XEXP (reg_addend, 1);
13595 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13596 reg_addend = XEXP (reg_addend, 0);
13597 else
13598 {
13599 reg_addend = NULL_RTX;
13600 addend = XEXP (x, 0);
13601 }
13602 }
13603 else
13604 addend = XEXP (x, 0);
13605
13606 x = XEXP (XEXP (x, 1), 0);
13607 if (GET_CODE (x) == PLUS
13608 && CONST_INT_P (XEXP (x, 1)))
13609 {
13610 const_addend = XEXP (x, 1);
13611 x = XEXP (x, 0);
13612 }
13613
13614 if (GET_CODE (x) == UNSPEC
13615 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13616 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13617 result = XVECEXP (x, 0, 0);
13618
13619 if (TARGET_MACHO && darwin_local_data_pic (x)
13620 && !MEM_P (orig_x))
13621 result = XVECEXP (x, 0, 0);
13622
13623 if (! result)
13624 return ix86_delegitimize_tls_address (orig_x);
13625
13626 if (const_addend)
13627 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13628 if (reg_addend)
13629 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13630 if (addend)
13631 {
13632 /* If the rest of original X doesn't involve the PIC register, add
13633 addend and subtract pic_offset_table_rtx. This can happen e.g.
13634 for code like:
13635 leal (%ebx, %ecx, 4), %ecx
13636 ...
13637 movl foo@GOTOFF(%ecx), %edx
13638 in which case we return (%ecx - %ebx) + foo. */
13639 if (pic_offset_table_rtx)
13640 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13641 pic_offset_table_rtx),
13642 result);
13643 else
13644 return orig_x;
13645 }
13646 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13647 {
13648 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13649 if (result == NULL_RTX)
13650 return orig_x;
13651 }
13652 return result;
13653 }
13654
13655 /* If X is a machine specific address (i.e. a symbol or label being
13656 referenced as a displacement from the GOT implemented using an
13657 UNSPEC), then return the base term. Otherwise return X. */
13658
13659 rtx
13660 ix86_find_base_term (rtx x)
13661 {
13662 rtx term;
13663
13664 if (TARGET_64BIT)
13665 {
13666 if (GET_CODE (x) != CONST)
13667 return x;
13668 term = XEXP (x, 0);
13669 if (GET_CODE (term) == PLUS
13670 && (CONST_INT_P (XEXP (term, 1))
13671 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13672 term = XEXP (term, 0);
13673 if (GET_CODE (term) != UNSPEC
13674 || (XINT (term, 1) != UNSPEC_GOTPCREL
13675 && XINT (term, 1) != UNSPEC_PCREL))
13676 return x;
13677
13678 return XVECEXP (term, 0, 0);
13679 }
13680
13681 return ix86_delegitimize_address (x);
13682 }
13683 \f
13684 static void
13685 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13686 bool fp, FILE *file)
13687 {
13688 const char *suffix;
13689
13690 if (mode == CCFPmode || mode == CCFPUmode)
13691 {
13692 code = ix86_fp_compare_code_to_integer (code);
13693 mode = CCmode;
13694 }
13695 if (reverse)
13696 code = reverse_condition (code);
13697
13698 switch (code)
13699 {
13700 case EQ:
13701 switch (mode)
13702 {
13703 case CCAmode:
13704 suffix = "a";
13705 break;
13706
13707 case CCCmode:
13708 suffix = "c";
13709 break;
13710
13711 case CCOmode:
13712 suffix = "o";
13713 break;
13714
13715 case CCSmode:
13716 suffix = "s";
13717 break;
13718
13719 default:
13720 suffix = "e";
13721 }
13722 break;
13723 case NE:
13724 switch (mode)
13725 {
13726 case CCAmode:
13727 suffix = "na";
13728 break;
13729
13730 case CCCmode:
13731 suffix = "nc";
13732 break;
13733
13734 case CCOmode:
13735 suffix = "no";
13736 break;
13737
13738 case CCSmode:
13739 suffix = "ns";
13740 break;
13741
13742 default:
13743 suffix = "ne";
13744 }
13745 break;
13746 case GT:
13747 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13748 suffix = "g";
13749 break;
13750 case GTU:
13751 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13752 Those same assemblers have the same but opposite lossage on cmov. */
13753 if (mode == CCmode)
13754 suffix = fp ? "nbe" : "a";
13755 else if (mode == CCCmode)
13756 suffix = "b";
13757 else
13758 gcc_unreachable ();
13759 break;
13760 case LT:
13761 switch (mode)
13762 {
13763 case CCNOmode:
13764 case CCGOCmode:
13765 suffix = "s";
13766 break;
13767
13768 case CCmode:
13769 case CCGCmode:
13770 suffix = "l";
13771 break;
13772
13773 default:
13774 gcc_unreachable ();
13775 }
13776 break;
13777 case LTU:
13778 gcc_assert (mode == CCmode || mode == CCCmode);
13779 suffix = "b";
13780 break;
13781 case GE:
13782 switch (mode)
13783 {
13784 case CCNOmode:
13785 case CCGOCmode:
13786 suffix = "ns";
13787 break;
13788
13789 case CCmode:
13790 case CCGCmode:
13791 suffix = "ge";
13792 break;
13793
13794 default:
13795 gcc_unreachable ();
13796 }
13797 break;
13798 case GEU:
13799 /* ??? As above. */
13800 gcc_assert (mode == CCmode || mode == CCCmode);
13801 suffix = fp ? "nb" : "ae";
13802 break;
13803 case LE:
13804 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13805 suffix = "le";
13806 break;
13807 case LEU:
13808 /* ??? As above. */
13809 if (mode == CCmode)
13810 suffix = "be";
13811 else if (mode == CCCmode)
13812 suffix = fp ? "nb" : "ae";
13813 else
13814 gcc_unreachable ();
13815 break;
13816 case UNORDERED:
13817 suffix = fp ? "u" : "p";
13818 break;
13819 case ORDERED:
13820 suffix = fp ? "nu" : "np";
13821 break;
13822 default:
13823 gcc_unreachable ();
13824 }
13825 fputs (suffix, file);
13826 }
13827
13828 /* Print the name of register X to FILE based on its machine mode and number.
13829 If CODE is 'w', pretend the mode is HImode.
13830 If CODE is 'b', pretend the mode is QImode.
13831 If CODE is 'k', pretend the mode is SImode.
13832 If CODE is 'q', pretend the mode is DImode.
13833 If CODE is 'x', pretend the mode is V4SFmode.
13834 If CODE is 't', pretend the mode is V8SFmode.
13835 If CODE is 'h', pretend the reg is the 'high' byte register.
13836 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13837 If CODE is 'd', duplicate the operand for AVX instruction.
13838 */
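/* Illustrative only (register and encoding details are simplified): with
   CODE == 'k' a DImode %rdi is printed as "%edi" in AT&T syntax, with
   CODE == 'h' register ax is printed as "%ah", and the AMD extended
   registers r8-r15 use the "r%d[bwd]" names handled further below.  */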
13839
13840 void
13841 print_reg (rtx x, int code, FILE *file)
13842 {
13843 const char *reg;
13844 unsigned int regno;
13845 bool duplicated = code == 'd' && TARGET_AVX;
13846
13847 if (ASSEMBLER_DIALECT == ASM_ATT)
13848 putc ('%', file);
13849
13850 if (x == pc_rtx)
13851 {
13852 gcc_assert (TARGET_64BIT);
13853 fputs ("rip", file);
13854 return;
13855 }
13856
13857 regno = true_regnum (x);
13858 gcc_assert (regno != ARG_POINTER_REGNUM
13859 && regno != FRAME_POINTER_REGNUM
13860 && regno != FLAGS_REG
13861 && regno != FPSR_REG
13862 && regno != FPCR_REG);
13863
13864 if (code == 'w' || MMX_REG_P (x))
13865 code = 2;
13866 else if (code == 'b')
13867 code = 1;
13868 else if (code == 'k')
13869 code = 4;
13870 else if (code == 'q')
13871 code = 8;
13872 else if (code == 'y')
13873 code = 3;
13874 else if (code == 'h')
13875 code = 0;
13876 else if (code == 'x')
13877 code = 16;
13878 else if (code == 't')
13879 code = 32;
13880 else
13881 code = GET_MODE_SIZE (GET_MODE (x));
13882
13883 /* Irritatingly, AMD extended registers use a different naming convention
13884 from the normal registers: "r%d[bwd]". */
13885 if (REX_INT_REGNO_P (regno))
13886 {
13887 gcc_assert (TARGET_64BIT);
13888 putc ('r', file);
13889 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13890 switch (code)
13891 {
13892 case 0:
13893 error ("extended registers have no high halves");
13894 break;
13895 case 1:
13896 putc ('b', file);
13897 break;
13898 case 2:
13899 putc ('w', file);
13900 break;
13901 case 4:
13902 putc ('d', file);
13903 break;
13904 case 8:
13905 /* no suffix */
13906 break;
13907 default:
13908 error ("unsupported operand size for extended register");
13909 break;
13910 }
13911 return;
13912 }
13913
13914 reg = NULL;
13915 switch (code)
13916 {
13917 case 3:
13918 if (STACK_TOP_P (x))
13919 {
13920 reg = "st(0)";
13921 break;
13922 }
13923 /* FALLTHRU */
13924 case 8:
13925 case 4:
13926 case 12:
13927 if (! ANY_FP_REG_P (x))
13928 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13929 /* FALLTHRU */
13930 case 16:
13931 case 2:
13932 normal:
13933 reg = hi_reg_name[regno];
13934 break;
13935 case 1:
13936 if (regno >= ARRAY_SIZE (qi_reg_name))
13937 goto normal;
13938 reg = qi_reg_name[regno];
13939 break;
13940 case 0:
13941 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13942 goto normal;
13943 reg = qi_high_reg_name[regno];
13944 break;
13945 case 32:
13946 if (SSE_REG_P (x))
13947 {
13948 gcc_assert (!duplicated);
13949 putc ('y', file);
13950 fputs (hi_reg_name[regno] + 1, file);
13951 return;
13952 }
13953 break;
13954 default:
13955 gcc_unreachable ();
13956 }
13957
13958 fputs (reg, file);
13959 if (duplicated)
13960 {
13961 if (ASSEMBLER_DIALECT == ASM_ATT)
13962 fprintf (file, ", %%%s", reg);
13963 else
13964 fprintf (file, ", %s", reg);
13965 }
13966 }
13967
13968 /* Locate some local-dynamic symbol still in use by this function
13969 so that we can print its name in some tls_local_dynamic_base
13970 pattern. */
13971
13972 static int
13973 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13974 {
13975 rtx x = *px;
13976
13977 if (GET_CODE (x) == SYMBOL_REF
13978 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13979 {
13980 cfun->machine->some_ld_name = XSTR (x, 0);
13981 return 1;
13982 }
13983
13984 return 0;
13985 }
13986
13987 static const char *
13988 get_some_local_dynamic_name (void)
13989 {
13990 rtx insn;
13991
13992 if (cfun->machine->some_ld_name)
13993 return cfun->machine->some_ld_name;
13994
13995 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13996 if (NONDEBUG_INSN_P (insn)
13997 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13998 return cfun->machine->some_ld_name;
13999
14000 return NULL;
14001 }
14002
14003 /* Meaning of CODE:
14004 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14005 C -- print opcode suffix for set/cmov insn.
14006 c -- like C, but print reversed condition
14007 F,f -- likewise, but for floating-point.
14008 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14009 otherwise nothing
14010 R -- print the prefix for register names.
14011 z -- print the opcode suffix for the size of the current operand.
14012 Z -- likewise, with special suffixes for x87 instructions.
14013 * -- print a star (in certain assembler syntax)
14014 A -- print an absolute memory reference.
14015 E -- print address with DImode register names if TARGET_64BIT.
14016 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14017 s -- print a shift double count, followed by the assembler's argument
14018 delimiter.
14019 b -- print the QImode name of the register for the indicated operand.
14020 %b0 would print %al if operands[0] is reg 0.
14021 w -- likewise, print the HImode name of the register.
14022 k -- likewise, print the SImode name of the register.
14023 q -- likewise, print the DImode name of the register.
14024 x -- likewise, print the V4SFmode name of the register.
14025 t -- likewise, print the V8SFmode name of the register.
14026 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14027 y -- print "st(0)" instead of "st" as a register.
14028 d -- print duplicated register operand for AVX instruction.
14029 D -- print condition for SSE cmp instruction.
14030 P -- if PIC, print an @PLT suffix.
14031 p -- print raw symbol name.
14032 X -- don't print any sort of PIC '@' suffix for a symbol.
14033 & -- print some in-use local-dynamic symbol name.
14034 H -- print a memory address offset by 8; used for sse high-parts
14035 Y -- print condition for XOP pcom* instruction.
14036 + -- print a branch hint as 'cs' or 'ds' prefix
14037 ; -- print a semicolon (after prefixes due to bug in older gas).
14038 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14039 @ -- print a segment register of thread base pointer load
14040 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14041 */
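/* Illustrative sketch (not in the original source): in an output template
   such as "add%z0\t{%1, %0|%0, %1}", "%z0" expands to the size suffix of
   operands[0] -- "l" for SImode, giving "addl" in AT&T syntax -- while
   "%b1" prints the QImode register name of operands[1], e.g. "%al".  */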
14042
14043 void
14044 ix86_print_operand (FILE *file, rtx x, int code)
14045 {
14046 if (code)
14047 {
14048 switch (code)
14049 {
14050 case 'A':
14051 switch (ASSEMBLER_DIALECT)
14052 {
14053 case ASM_ATT:
14054 putc ('*', file);
14055 break;
14056
14057 case ASM_INTEL:
14058 /* Intel syntax. For absolute addresses, registers should not
14059 be surrounded by brackets. */
14060 if (!REG_P (x))
14061 {
14062 putc ('[', file);
14063 ix86_print_operand (file, x, 0);
14064 putc (']', file);
14065 return;
14066 }
14067 break;
14068
14069 default:
14070 gcc_unreachable ();
14071 }
14072
14073 ix86_print_operand (file, x, 0);
14074 return;
14075
14076 case 'E':
14077 /* Wrap address in an UNSPEC to declare special handling. */
14078 if (TARGET_64BIT)
14079 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14080
14081 output_address (x);
14082 return;
14083
14084 case 'L':
14085 if (ASSEMBLER_DIALECT == ASM_ATT)
14086 putc ('l', file);
14087 return;
14088
14089 case 'W':
14090 if (ASSEMBLER_DIALECT == ASM_ATT)
14091 putc ('w', file);
14092 return;
14093
14094 case 'B':
14095 if (ASSEMBLER_DIALECT == ASM_ATT)
14096 putc ('b', file);
14097 return;
14098
14099 case 'Q':
14100 if (ASSEMBLER_DIALECT == ASM_ATT)
14101 putc ('l', file);
14102 return;
14103
14104 case 'S':
14105 if (ASSEMBLER_DIALECT == ASM_ATT)
14106 putc ('s', file);
14107 return;
14108
14109 case 'T':
14110 if (ASSEMBLER_DIALECT == ASM_ATT)
14111 putc ('t', file);
14112 return;
14113
14114 case 'O':
14115 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14116 if (ASSEMBLER_DIALECT != ASM_ATT)
14117 return;
14118
14119 switch (GET_MODE_SIZE (GET_MODE (x)))
14120 {
14121 case 2:
14122 putc ('w', file);
14123 break;
14124
14125 case 4:
14126 putc ('l', file);
14127 break;
14128
14129 case 8:
14130 putc ('q', file);
14131 break;
14132
14133 default:
14134 output_operand_lossage
14135 ("invalid operand size for operand code 'O'");
14136 return;
14137 }
14138
14139 putc ('.', file);
14140 #endif
14141 return;
14142
14143 case 'z':
14144 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14145 {
14146 /* Opcodes don't get size suffixes when using Intel syntax. */
14147 if (ASSEMBLER_DIALECT == ASM_INTEL)
14148 return;
14149
14150 switch (GET_MODE_SIZE (GET_MODE (x)))
14151 {
14152 case 1:
14153 putc ('b', file);
14154 return;
14155
14156 case 2:
14157 putc ('w', file);
14158 return;
14159
14160 case 4:
14161 putc ('l', file);
14162 return;
14163
14164 case 8:
14165 putc ('q', file);
14166 return;
14167
14168 default:
14169 output_operand_lossage
14170 ("invalid operand size for operand code 'z'");
14171 return;
14172 }
14173 }
14174
14175 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14176 warning
14177 (0, "non-integer operand used with operand code 'z'");
14178 /* FALLTHRU */
14179
14180 case 'Z':
14181 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14182 if (ASSEMBLER_DIALECT == ASM_INTEL)
14183 return;
14184
14185 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14186 {
14187 switch (GET_MODE_SIZE (GET_MODE (x)))
14188 {
14189 case 2:
14190 #ifdef HAVE_AS_IX86_FILDS
14191 putc ('s', file);
14192 #endif
14193 return;
14194
14195 case 4:
14196 putc ('l', file);
14197 return;
14198
14199 case 8:
14200 #ifdef HAVE_AS_IX86_FILDQ
14201 putc ('q', file);
14202 #else
14203 fputs ("ll", file);
14204 #endif
14205 return;
14206
14207 default:
14208 break;
14209 }
14210 }
14211 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14212 {
14213 /* 387 opcodes don't get size suffixes
14214 if the operands are registers. */
14215 if (STACK_REG_P (x))
14216 return;
14217
14218 switch (GET_MODE_SIZE (GET_MODE (x)))
14219 {
14220 case 4:
14221 putc ('s', file);
14222 return;
14223
14224 case 8:
14225 putc ('l', file);
14226 return;
14227
14228 case 12:
14229 case 16:
14230 putc ('t', file);
14231 return;
14232
14233 default:
14234 break;
14235 }
14236 }
14237 else
14238 {
14239 output_operand_lossage
14240 ("invalid operand type used with operand code 'Z'");
14241 return;
14242 }
14243
14244 output_operand_lossage
14245 ("invalid operand size for operand code 'Z'");
14246 return;
14247
14248 case 'd':
14249 case 'b':
14250 case 'w':
14251 case 'k':
14252 case 'q':
14253 case 'h':
14254 case 't':
14255 case 'y':
14256 case 'x':
14257 case 'X':
14258 case 'P':
14259 case 'p':
14260 break;
14261
14262 case 's':
14263 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14264 {
14265 ix86_print_operand (file, x, 0);
14266 fputs (", ", file);
14267 }
14268 return;
14269
14270 case 'Y':
14271 switch (GET_CODE (x))
14272 {
14273 case NE:
14274 fputs ("neq", file);
14275 break;
14276 case EQ:
14277 fputs ("eq", file);
14278 break;
14279 case GE:
14280 case GEU:
14281 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14282 break;
14283 case GT:
14284 case GTU:
14285 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14286 break;
14287 case LE:
14288 case LEU:
14289 fputs ("le", file);
14290 break;
14291 case LT:
14292 case LTU:
14293 fputs ("lt", file);
14294 break;
14295 case UNORDERED:
14296 fputs ("unord", file);
14297 break;
14298 case ORDERED:
14299 fputs ("ord", file);
14300 break;
14301 case UNEQ:
14302 fputs ("ueq", file);
14303 break;
14304 case UNGE:
14305 fputs ("nlt", file);
14306 break;
14307 case UNGT:
14308 fputs ("nle", file);
14309 break;
14310 case UNLE:
14311 fputs ("ule", file);
14312 break;
14313 case UNLT:
14314 fputs ("ult", file);
14315 break;
14316 case LTGT:
14317 fputs ("une", file);
14318 break;
14319 default:
14320 output_operand_lossage ("operand is not a condition code, "
14321 "invalid operand code 'Y'");
14322 return;
14323 }
14324 return;
14325
14326 case 'D':
14327 /* A little bit of braindamage here. The SSE compare instructions
14328 use completely different names for the comparisons than the
14329 fp conditional moves do. */
14330 switch (GET_CODE (x))
14331 {
14332 case UNEQ:
14333 if (TARGET_AVX)
14334 {
14335 fputs ("eq_us", file);
14336 break;
14337 }
14338 case EQ:
14339 fputs ("eq", file);
14340 break;
14341 case UNLT:
14342 if (TARGET_AVX)
14343 {
14344 fputs ("nge", file);
14345 break;
14346 }
14347 case LT:
14348 fputs ("lt", file);
14349 break;
14350 case UNLE:
14351 if (TARGET_AVX)
14352 {
14353 fputs ("ngt", file);
14354 break;
14355 }
14356 case LE:
14357 fputs ("le", file);
14358 break;
14359 case UNORDERED:
14360 fputs ("unord", file);
14361 break;
14362 case LTGT:
14363 if (TARGET_AVX)
14364 {
14365 fputs ("neq_oq", file);
14366 break;
14367 }
14368 case NE:
14369 fputs ("neq", file);
14370 break;
14371 case GE:
14372 if (TARGET_AVX)
14373 {
14374 fputs ("ge", file);
14375 break;
14376 }
14377 case UNGE:
14378 fputs ("nlt", file);
14379 break;
14380 case GT:
14381 if (TARGET_AVX)
14382 {
14383 fputs ("gt", file);
14384 break;
14385 }
14386 case UNGT:
14387 fputs ("nle", file);
14388 break;
14389 case ORDERED:
14390 fputs ("ord", file);
14391 break;
14392 default:
14393 output_operand_lossage ("operand is not a condition code, "
14394 "invalid operand code 'D'");
14395 return;
14396 }
14397 return;
14398
14399 case 'F':
14400 case 'f':
14401 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14402 if (ASSEMBLER_DIALECT == ASM_ATT)
14403 putc ('.', file);
14404 #endif
14405
14406 case 'C':
14407 case 'c':
14408 if (!COMPARISON_P (x))
14409 {
14410 output_operand_lossage ("operand is not a condition code, "
14411 "invalid operand code '%c'", code);
14412 return;
14413 }
14414 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14415 code == 'c' || code == 'f',
14416 code == 'F' || code == 'f',
14417 file);
14418 return;
14419
14420 case 'H':
14421 if (!offsettable_memref_p (x))
14422 {
14423 output_operand_lossage ("operand is not an offsettable memory "
14424 "reference, invalid operand code 'H'");
14425 return;
14426 }
14427 /* It doesn't actually matter what mode we use here, as we're
14428 only going to use this for printing. */
14429 x = adjust_address_nv (x, DImode, 8);
14430 break;
14431
14432 case 'K':
14433 gcc_assert (CONST_INT_P (x));
14434
14435 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14436 #ifdef HAVE_AS_IX86_HLE
14437 fputs ("xacquire ", file);
14438 #else
14439 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14440 #endif
14441 else if (INTVAL (x) & IX86_HLE_RELEASE)
14442 #ifdef HAVE_AS_IX86_HLE
14443 fputs ("xrelease ", file);
14444 #else
14445 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14446 #endif
14447 /* We do not want to print the value of the operand. */
14448 return;
14449
14450 case '*':
14451 if (ASSEMBLER_DIALECT == ASM_ATT)
14452 putc ('*', file);
14453 return;
14454
14455 case '&':
14456 {
14457 const char *name = get_some_local_dynamic_name ();
14458 if (name == NULL)
14459 output_operand_lossage ("'%%&' used without any "
14460 "local dynamic TLS references");
14461 else
14462 assemble_name (file, name);
14463 return;
14464 }
14465
14466 case '+':
14467 {
14468 rtx x;
14469
14470 if (!optimize
14471 || optimize_function_for_size_p (cfun)
14472 || !TARGET_BRANCH_PREDICTION_HINTS)
14473 return;
14474
14475 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14476 if (x)
14477 {
14478 int pred_val = INTVAL (XEXP (x, 0));
14479
14480 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14481 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14482 {
14483 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14484 bool cputaken
14485 = final_forward_branch_p (current_output_insn) == 0;
14486
14487 /* Emit hints only where the default branch prediction
14488 heuristics would fail. */
14489 if (taken != cputaken)
14490 {
14491 /* We use 3e (DS) prefix for taken branches and
14492 2e (CS) prefix for not taken branches. */
14493 if (taken)
14494 fputs ("ds ; ", file);
14495 else
14496 fputs ("cs ; ", file);
14497 }
14498 }
14499 }
14500 return;
14501 }
14502
14503 case ';':
14504 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14505 putc (';', file);
14506 #endif
14507 return;
14508
14509 case '@':
14510 if (ASSEMBLER_DIALECT == ASM_ATT)
14511 putc ('%', file);
14512
14513 /* The kernel uses a different segment register for performance
14514 reasons; a system call would not have to trash the userspace
14515 segment register, which would be expensive. */
14516 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14517 fputs ("fs", file);
14518 else
14519 fputs ("gs", file);
14520 return;
14521
14522 case '~':
14523 putc (TARGET_AVX2 ? 'i' : 'f', file);
14524 return;
14525
14526 case '^':
14527 if (TARGET_64BIT && Pmode != word_mode)
14528 fputs ("addr32 ", file);
14529 return;
14530
14531 default:
14532 output_operand_lossage ("invalid operand code '%c'", code);
14533 }
14534 }
14535
14536 if (REG_P (x))
14537 print_reg (x, code, file);
14538
14539 else if (MEM_P (x))
14540 {
14541 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14542 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14543 && GET_MODE (x) != BLKmode)
14544 {
14545 const char * size;
14546 switch (GET_MODE_SIZE (GET_MODE (x)))
14547 {
14548 case 1: size = "BYTE"; break;
14549 case 2: size = "WORD"; break;
14550 case 4: size = "DWORD"; break;
14551 case 8: size = "QWORD"; break;
14552 case 12: size = "TBYTE"; break;
14553 case 16:
14554 if (GET_MODE (x) == XFmode)
14555 size = "TBYTE";
14556 else
14557 size = "XMMWORD";
14558 break;
14559 case 32: size = "YMMWORD"; break;
14560 default:
14561 gcc_unreachable ();
14562 }
14563
14564 /* Check for explicit size override (codes 'b', 'w', 'k',
14565 'q' and 'x') */
14566 if (code == 'b')
14567 size = "BYTE";
14568 else if (code == 'w')
14569 size = "WORD";
14570 else if (code == 'k')
14571 size = "DWORD";
14572 else if (code == 'q')
14573 size = "QWORD";
14574 else if (code == 'x')
14575 size = "XMMWORD";
14576
14577 fputs (size, file);
14578 fputs (" PTR ", file);
14579 }
14580
14581 x = XEXP (x, 0);
14582 /* Avoid (%rip) for call operands. */
14583 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14584 && !CONST_INT_P (x))
14585 output_addr_const (file, x);
14586 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14587 output_operand_lossage ("invalid constraints for operand");
14588 else
14589 output_address (x);
14590 }
14591
14592 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14593 {
14594 REAL_VALUE_TYPE r;
14595 long l;
14596
14597 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14598 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14599
14600 if (ASSEMBLER_DIALECT == ASM_ATT)
14601 putc ('$', file);
14602 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14603 if (code == 'q')
14604 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14605 else
14606 fprintf (file, "0x%08x", (unsigned int) l);
14607 }
14608
14609 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14610 {
14611 REAL_VALUE_TYPE r;
14612 long l[2];
14613
14614 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14615 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14616
14617 if (ASSEMBLER_DIALECT == ASM_ATT)
14618 putc ('$', file);
14619 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14620 }
14621
14622 /* These float cases don't actually occur as immediate operands. */
14623 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14624 {
14625 char dstr[30];
14626
14627 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14628 fputs (dstr, file);
14629 }
14630
14631 else
14632 {
14633 /* We have patterns that allow zero sets of memory, for instance.
14634 In 64-bit mode, we should probably support all 8-byte vectors,
14635 since we can in fact encode that into an immediate. */
14636 if (GET_CODE (x) == CONST_VECTOR)
14637 {
14638 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14639 x = const0_rtx;
14640 }
14641
14642 if (code != 'P' && code != 'p')
14643 {
14644 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14645 {
14646 if (ASSEMBLER_DIALECT == ASM_ATT)
14647 putc ('$', file);
14648 }
14649 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14650 || GET_CODE (x) == LABEL_REF)
14651 {
14652 if (ASSEMBLER_DIALECT == ASM_ATT)
14653 putc ('$', file);
14654 else
14655 fputs ("OFFSET FLAT:", file);
14656 }
14657 }
14658 if (CONST_INT_P (x))
14659 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14660 else if (flag_pic || MACHOPIC_INDIRECT)
14661 output_pic_addr_const (file, x, code);
14662 else
14663 output_addr_const (file, x);
14664 }
14665 }
14666
14667 static bool
14668 ix86_print_operand_punct_valid_p (unsigned char code)
14669 {
14670 return (code == '@' || code == '*' || code == '+' || code == '&'
14671 || code == ';' || code == '~' || code == '^');
14672 }
14673 \f
14674 /* Print a memory operand whose address is ADDR. */
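/* Illustrative examples (not in the original source): a base+index*scale
   +displacement address prints as "16(%rax,%rbx,4)" in the AT&T dialect
   and as "[rax+16+rbx*4]" in the Intel dialect, following the two output
   branches below.  */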
14675
14676 static void
14677 ix86_print_operand_address (FILE *file, rtx addr)
14678 {
14679 struct ix86_address parts;
14680 rtx base, index, disp;
14681 int scale;
14682 int ok;
14683 bool vsib = false;
14684 int code = 0;
14685
14686 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14687 {
14688 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14689 gcc_assert (parts.index == NULL_RTX);
14690 parts.index = XVECEXP (addr, 0, 1);
14691 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14692 addr = XVECEXP (addr, 0, 0);
14693 vsib = true;
14694 }
14695 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14696 {
14697 gcc_assert (TARGET_64BIT);
14698 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14699 code = 'q';
14700 }
14701 else
14702 ok = ix86_decompose_address (addr, &parts);
14703
14704 gcc_assert (ok);
14705
14706 base = parts.base;
14707 index = parts.index;
14708 disp = parts.disp;
14709 scale = parts.scale;
14710
14711 switch (parts.seg)
14712 {
14713 case SEG_DEFAULT:
14714 break;
14715 case SEG_FS:
14716 case SEG_GS:
14717 if (ASSEMBLER_DIALECT == ASM_ATT)
14718 putc ('%', file);
14719 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14720 break;
14721 default:
14722 gcc_unreachable ();
14723 }
14724
14725 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14726 if (TARGET_64BIT && !base && !index)
14727 {
14728 rtx symbol = disp;
14729
14730 if (GET_CODE (disp) == CONST
14731 && GET_CODE (XEXP (disp, 0)) == PLUS
14732 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14733 symbol = XEXP (XEXP (disp, 0), 0);
14734
14735 if (GET_CODE (symbol) == LABEL_REF
14736 || (GET_CODE (symbol) == SYMBOL_REF
14737 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14738 base = pc_rtx;
14739 }
14740 if (!base && !index)
14741 {
14742 /* A displacement-only address requires special attention. */
14743
14744 if (CONST_INT_P (disp))
14745 {
14746 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14747 fputs ("ds:", file);
14748 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14749 }
14750 else if (flag_pic)
14751 output_pic_addr_const (file, disp, 0);
14752 else
14753 output_addr_const (file, disp);
14754 }
14755 else
14756 {
14757 /* Print SImode register names to force addr32 prefix. */
14758 if (SImode_address_operand (addr, VOIDmode))
14759 {
14760 #ifdef ENABLE_CHECKING
14761 gcc_assert (TARGET_64BIT);
14762 switch (GET_CODE (addr))
14763 {
14764 case SUBREG:
14765 gcc_assert (GET_MODE (addr) == SImode);
14766 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14767 break;
14768 case ZERO_EXTEND:
14769 case AND:
14770 gcc_assert (GET_MODE (addr) == DImode);
14771 break;
14772 default:
14773 gcc_unreachable ();
14774 }
14775 #endif
14776 gcc_assert (!code);
14777 code = 'k';
14778 }
14779 else if (code == 0
14780 && TARGET_X32
14781 && disp
14782 && CONST_INT_P (disp)
14783 && INTVAL (disp) < -16*1024*1024)
14784 {
14785 /* X32 runs in 64-bit mode, where displacement, DISP, in
14786 address DISP(%r64), is encoded as 32-bit immediate sign-
14787 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14788 address is %r64 + 0xffffffffbffffd00. When %r64 <
14789 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14790 which is invalid for x32. The correct address is %r64
14791 - 0x40000300 == 0xf7ffdd64. To properly encode
14792 -0x40000300(%r64) for x32, we zero-extend negative
14793 displacement by forcing addr32 prefix which truncates
14794 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14795 zero-extend all negative displacements, including -1(%rsp).
14796 However, for small negative displacements, sign-extension
14797 won't cause overflow. We only zero-extend negative
14798 displacements if they are < -16*1024*1024, which is also used
14799 to check legitimate address displacements for PIC. */
14800 code = 'k';
14801 }
14802
14803 if (ASSEMBLER_DIALECT == ASM_ATT)
14804 {
14805 if (disp)
14806 {
14807 if (flag_pic)
14808 output_pic_addr_const (file, disp, 0);
14809 else if (GET_CODE (disp) == LABEL_REF)
14810 output_asm_label (disp);
14811 else
14812 output_addr_const (file, disp);
14813 }
14814
14815 putc ('(', file);
14816 if (base)
14817 print_reg (base, code, file);
14818 if (index)
14819 {
14820 putc (',', file);
14821 print_reg (index, vsib ? 0 : code, file);
14822 if (scale != 1 || vsib)
14823 fprintf (file, ",%d", scale);
14824 }
14825 putc (')', file);
14826 }
14827 else
14828 {
14829 rtx offset = NULL_RTX;
14830
14831 if (disp)
14832 {
14833 /* Pull out the offset of a symbol; print any symbol itself. */
14834 if (GET_CODE (disp) == CONST
14835 && GET_CODE (XEXP (disp, 0)) == PLUS
14836 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14837 {
14838 offset = XEXP (XEXP (disp, 0), 1);
14839 disp = gen_rtx_CONST (VOIDmode,
14840 XEXP (XEXP (disp, 0), 0));
14841 }
14842
14843 if (flag_pic)
14844 output_pic_addr_const (file, disp, 0);
14845 else if (GET_CODE (disp) == LABEL_REF)
14846 output_asm_label (disp);
14847 else if (CONST_INT_P (disp))
14848 offset = disp;
14849 else
14850 output_addr_const (file, disp);
14851 }
14852
14853 putc ('[', file);
14854 if (base)
14855 {
14856 print_reg (base, code, file);
14857 if (offset)
14858 {
14859 if (INTVAL (offset) >= 0)
14860 putc ('+', file);
14861 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14862 }
14863 }
14864 else if (offset)
14865 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14866 else
14867 putc ('0', file);
14868
14869 if (index)
14870 {
14871 putc ('+', file);
14872 print_reg (index, vsib ? 0 : code, file);
14873 if (scale != 1 || vsib)
14874 fprintf (file, "*%d", scale);
14875 }
14876 putc (']', file);
14877 }
14878 }
14879 }
14880
14881 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14882
14883 static bool
14884 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14885 {
14886 rtx op;
14887
14888 if (GET_CODE (x) != UNSPEC)
14889 return false;
14890
14891 op = XVECEXP (x, 0, 0);
14892 switch (XINT (x, 1))
14893 {
14894 case UNSPEC_GOTTPOFF:
14895 output_addr_const (file, op);
14896 /* FIXME: This might be @TPOFF in Sun ld. */
14897 fputs ("@gottpoff", file);
14898 break;
14899 case UNSPEC_TPOFF:
14900 output_addr_const (file, op);
14901 fputs ("@tpoff", file);
14902 break;
14903 case UNSPEC_NTPOFF:
14904 output_addr_const (file, op);
14905 if (TARGET_64BIT)
14906 fputs ("@tpoff", file);
14907 else
14908 fputs ("@ntpoff", file);
14909 break;
14910 case UNSPEC_DTPOFF:
14911 output_addr_const (file, op);
14912 fputs ("@dtpoff", file);
14913 break;
14914 case UNSPEC_GOTNTPOFF:
14915 output_addr_const (file, op);
14916 if (TARGET_64BIT)
14917 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14918 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14919 else
14920 fputs ("@gotntpoff", file);
14921 break;
14922 case UNSPEC_INDNTPOFF:
14923 output_addr_const (file, op);
14924 fputs ("@indntpoff", file);
14925 break;
14926 #if TARGET_MACHO
14927 case UNSPEC_MACHOPIC_OFFSET:
14928 output_addr_const (file, op);
14929 putc ('-', file);
14930 machopic_output_function_base_name (file);
14931 break;
14932 #endif
14933
14934 case UNSPEC_STACK_CHECK:
14935 {
14936 int offset;
14937
14938 gcc_assert (flag_split_stack);
14939
14940 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14941 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14942 #else
14943 gcc_unreachable ();
14944 #endif
14945
14946 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14947 }
14948 break;
14949
14950 default:
14951 return false;
14952 }
14953
14954 return true;
14955 }
14956 \f
14957 /* Split one or more double-mode RTL references into pairs of half-mode
14958 references. The RTL can be REG, offsettable MEM, integer constant, or
14959 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14960 split and "num" is its length. lo_half and hi_half are output arrays
14961 that parallel "operands". */
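/* Illustrative example (not in the original source): splitting a DImode
   memory operand yields lo_half[0] as the SImode MEM at offset 0 and
   hi_half[0] as the SImode MEM at offset 4; for TImode the halves are
   DImode and the high half lies at offset 8.  */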
14962
14963 void
14964 split_double_mode (enum machine_mode mode, rtx operands[],
14965 int num, rtx lo_half[], rtx hi_half[])
14966 {
14967 enum machine_mode half_mode;
14968 unsigned int byte;
14969
14970 switch (mode)
14971 {
14972 case TImode:
14973 half_mode = DImode;
14974 break;
14975 case DImode:
14976 half_mode = SImode;
14977 break;
14978 default:
14979 gcc_unreachable ();
14980 }
14981
14982 byte = GET_MODE_SIZE (half_mode);
14983
14984 while (num--)
14985 {
14986 rtx op = operands[num];
14987
14988 /* simplify_subreg refuses to split volatile memory addresses,
14989 but we still have to handle them. */
14990 if (MEM_P (op))
14991 {
14992 lo_half[num] = adjust_address (op, half_mode, 0);
14993 hi_half[num] = adjust_address (op, half_mode, byte);
14994 }
14995 else
14996 {
14997 lo_half[num] = simplify_gen_subreg (half_mode, op,
14998 GET_MODE (op) == VOIDmode
14999 ? mode : GET_MODE (op), 0);
15000 hi_half[num] = simplify_gen_subreg (half_mode, op,
15001 GET_MODE (op) == VOIDmode
15002 ? mode : GET_MODE (op), byte);
15003 }
15004 }
15005 }
15006 \f
15007 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15008 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15009 is the expression of the binary operation. The output may either be
15010 emitted here, or returned to the caller, like all output_* functions.
15011
15012 There is no guarantee that the operands are the same mode, as they
15013 might be within FLOAT or FLOAT_EXTEND expressions. */
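/* Illustrative examples (not in the original source): for an AVX DFmode
   add this returns "vaddsd\t{%2, %1, %0|%0, %1, %2}"; for a plain x87 add
   with the destination on top of the stack and a live register source it
   returns "fadd\t{%y2, %0|%0, %y2}".  */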
15014
15015 #ifndef SYSV386_COMPAT
15016 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15017 wants to fix the assemblers because that causes incompatibility
15018 with gcc. No-one wants to fix gcc because that causes
15019 incompatibility with assemblers... You can use the option of
15020 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15021 #define SYSV386_COMPAT 1
15022 #endif
15023
15024 const char *
15025 output_387_binary_op (rtx insn, rtx *operands)
15026 {
15027 static char buf[40];
15028 const char *p;
15029 const char *ssep;
15030 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15031
15032 #ifdef ENABLE_CHECKING
15033 /* Even if we do not want to check the inputs, this documents the input
15034 constraints, which helps in understanding the following code. */
15035 if (STACK_REG_P (operands[0])
15036 && ((REG_P (operands[1])
15037 && REGNO (operands[0]) == REGNO (operands[1])
15038 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15039 || (REG_P (operands[2])
15040 && REGNO (operands[0]) == REGNO (operands[2])
15041 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15042 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15043 ; /* ok */
15044 else
15045 gcc_assert (is_sse);
15046 #endif
15047
15048 switch (GET_CODE (operands[3]))
15049 {
15050 case PLUS:
15051 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15052 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15053 p = "fiadd";
15054 else
15055 p = "fadd";
15056 ssep = "vadd";
15057 break;
15058
15059 case MINUS:
15060 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15061 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15062 p = "fisub";
15063 else
15064 p = "fsub";
15065 ssep = "vsub";
15066 break;
15067
15068 case MULT:
15069 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15070 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15071 p = "fimul";
15072 else
15073 p = "fmul";
15074 ssep = "vmul";
15075 break;
15076
15077 case DIV:
15078 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15079 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15080 p = "fidiv";
15081 else
15082 p = "fdiv";
15083 ssep = "vdiv";
15084 break;
15085
15086 default:
15087 gcc_unreachable ();
15088 }
15089
15090 if (is_sse)
15091 {
15092 if (TARGET_AVX)
15093 {
15094 strcpy (buf, ssep);
15095 if (GET_MODE (operands[0]) == SFmode)
15096 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15097 else
15098 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15099 }
15100 else
15101 {
15102 strcpy (buf, ssep + 1);
15103 if (GET_MODE (operands[0]) == SFmode)
15104 strcat (buf, "ss\t{%2, %0|%0, %2}");
15105 else
15106 strcat (buf, "sd\t{%2, %0|%0, %2}");
15107 }
15108 return buf;
15109 }
15110 strcpy (buf, p);
15111
15112 switch (GET_CODE (operands[3]))
15113 {
15114 case MULT:
15115 case PLUS:
15116 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15117 {
15118 rtx temp = operands[2];
15119 operands[2] = operands[1];
15120 operands[1] = temp;
15121 }
15122
15123 /* We now know that operands[0] == operands[1]. */
15124
15125 if (MEM_P (operands[2]))
15126 {
15127 p = "%Z2\t%2";
15128 break;
15129 }
15130
15131 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15132 {
15133 if (STACK_TOP_P (operands[0]))
15134 /* How is it that we are storing to a dead operand[2]?
15135 Well, presumably operands[1] is dead too. We can't
15136 store the result to st(0) as st(0) gets popped on this
15137 instruction. Instead store to operands[2] (which I
15138 think has to be st(1)). st(1) will be popped later.
15139 gcc <= 2.8.1 didn't have this check and generated
15140 assembly code that the Unixware assembler rejected. */
15141 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15142 else
15143 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15144 break;
15145 }
15146
15147 if (STACK_TOP_P (operands[0]))
15148 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15149 else
15150 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15151 break;
15152
15153 case MINUS:
15154 case DIV:
15155 if (MEM_P (operands[1]))
15156 {
15157 p = "r%Z1\t%1";
15158 break;
15159 }
15160
15161 if (MEM_P (operands[2]))
15162 {
15163 p = "%Z2\t%2";
15164 break;
15165 }
15166
15167 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15168 {
15169 #if SYSV386_COMPAT
15170 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15171 derived assemblers, confusingly reverse the direction of
15172 the operation for fsub{r} and fdiv{r} when the
15173 destination register is not st(0). The Intel assembler
15174 doesn't have this brain damage. Read !SYSV386_COMPAT to
15175 figure out what the hardware really does. */
15176 if (STACK_TOP_P (operands[0]))
15177 p = "{p\t%0, %2|rp\t%2, %0}";
15178 else
15179 p = "{rp\t%2, %0|p\t%0, %2}";
15180 #else
15181 if (STACK_TOP_P (operands[0]))
15182 /* As above for fmul/fadd, we can't store to st(0). */
15183 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15184 else
15185 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15186 #endif
15187 break;
15188 }
15189
15190 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15191 {
15192 #if SYSV386_COMPAT
15193 if (STACK_TOP_P (operands[0]))
15194 p = "{rp\t%0, %1|p\t%1, %0}";
15195 else
15196 p = "{p\t%1, %0|rp\t%0, %1}";
15197 #else
15198 if (STACK_TOP_P (operands[0]))
15199 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15200 else
15201 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15202 #endif
15203 break;
15204 }
15205
15206 if (STACK_TOP_P (operands[0]))
15207 {
15208 if (STACK_TOP_P (operands[1]))
15209 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15210 else
15211 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15212 break;
15213 }
15214 else if (STACK_TOP_P (operands[1]))
15215 {
15216 #if SYSV386_COMPAT
15217 p = "{\t%1, %0|r\t%0, %1}";
15218 #else
15219 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15220 #endif
15221 }
15222 else
15223 {
15224 #if SYSV386_COMPAT
15225 p = "{r\t%2, %0|\t%0, %2}";
15226 #else
15227 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15228 #endif
15229 }
15230 break;
15231
15232 default:
15233 gcc_unreachable ();
15234 }
15235
15236 strcat (buf, p);
15237 return buf;
15238 }
15239
15240 /* Check if a 256bit AVX register is referenced inside of EXP. */
15241
15242 static int
15243 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15244 {
15245 rtx exp = *pexp;
15246
15247 if (GET_CODE (exp) == SUBREG)
15248 exp = SUBREG_REG (exp);
15249
15250 if (REG_P (exp)
15251 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15252 return 1;
15253
15254 return 0;
15255 }
15256
15257 /* Return needed mode for entity in optimize_mode_switching pass. */
15258
15259 static int
15260 ix86_avx_u128_mode_needed (rtx insn)
15261 {
15262 if (CALL_P (insn))
15263 {
15264 rtx link;
15265
15266 /* The needed mode is set to AVX_U128_CLEAN if no 256bit modes
15267 are used in the function arguments. */
15268 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15269 link;
15270 link = XEXP (link, 1))
15271 {
15272 if (GET_CODE (XEXP (link, 0)) == USE)
15273 {
15274 rtx arg = XEXP (XEXP (link, 0), 0);
15275
15276 if (ix86_check_avx256_register (&arg, NULL))
15277 return AVX_U128_ANY;
15278 }
15279 }
15280
15281 return AVX_U128_CLEAN;
15282 }
15283
15284 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
15285 changes state only when a 256bit register is written to, but we need
15286 to prevent the compiler from moving the optimal insertion point above
15287 an eventual read from a 256bit register. */
15288 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15289 return AVX_U128_DIRTY;
15290
15291 return AVX_U128_ANY;
15292 }
15293
15294 /* Return mode that i387 must be switched into
15295 prior to the execution of insn. */
15296
15297 static int
15298 ix86_i387_mode_needed (int entity, rtx insn)
15299 {
15300 enum attr_i387_cw mode;
15301
15302 /* The mode UNINITIALIZED is used to store the control word after a
15303 function call or ASM pattern. The mode ANY specifies that the function
15304 has no requirements on the control word and makes no changes in the
15305 bits we are interested in. */
15306
15307 if (CALL_P (insn)
15308 || (NONJUMP_INSN_P (insn)
15309 && (asm_noperands (PATTERN (insn)) >= 0
15310 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15311 return I387_CW_UNINITIALIZED;
15312
15313 if (recog_memoized (insn) < 0)
15314 return I387_CW_ANY;
15315
15316 mode = get_attr_i387_cw (insn);
15317
15318 switch (entity)
15319 {
15320 case I387_TRUNC:
15321 if (mode == I387_CW_TRUNC)
15322 return mode;
15323 break;
15324
15325 case I387_FLOOR:
15326 if (mode == I387_CW_FLOOR)
15327 return mode;
15328 break;
15329
15330 case I387_CEIL:
15331 if (mode == I387_CW_CEIL)
15332 return mode;
15333 break;
15334
15335 case I387_MASK_PM:
15336 if (mode == I387_CW_MASK_PM)
15337 return mode;
15338 break;
15339
15340 default:
15341 gcc_unreachable ();
15342 }
15343
15344 return I387_CW_ANY;
15345 }
15346
15347 /* Return mode that entity must be switched into
15348 prior to the execution of insn. */
15349
15350 int
15351 ix86_mode_needed (int entity, rtx insn)
15352 {
15353 switch (entity)
15354 {
15355 case AVX_U128:
15356 return ix86_avx_u128_mode_needed (insn);
15357 case I387_TRUNC:
15358 case I387_FLOOR:
15359 case I387_CEIL:
15360 case I387_MASK_PM:
15361 return ix86_i387_mode_needed (entity, insn);
15362 default:
15363 gcc_unreachable ();
15364 }
15365 return 0;
15366 }
15367
15368 /* Check if a 256bit AVX register is referenced in stores. */
15369
15370 static void
15371 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15372 {
15373 if (ix86_check_avx256_register (&dest, NULL))
15374 {
15375 bool *used = (bool *) data;
15376 *used = true;
15377 }
15378 }
15379
15380 /* Calculate mode of upper 128bit AVX registers after the insn. */
15381
15382 static int
15383 ix86_avx_u128_mode_after (int mode, rtx insn)
15384 {
15385 rtx pat = PATTERN (insn);
15386
15387 if (vzeroupper_operation (pat, VOIDmode)
15388 || vzeroall_operation (pat, VOIDmode))
15389 return AVX_U128_CLEAN;
15390
15391 /* We know that the state is clean after a CALL insn if no 256bit
15392 register is used as the function return register. */
15393 if (CALL_P (insn))
15394 {
15395 bool avx_reg256_found = false;
15396 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15397 if (!avx_reg256_found)
15398 return AVX_U128_CLEAN;
15399 }
15400
15401 /* Otherwise, return current mode. Remember that if insn
15402 references AVX 256bit registers, the mode was already changed
15403 to DIRTY from MODE_NEEDED. */
15404 return mode;
15405 }
15406
15407 /* Return the mode that an insn results in. */
15408
15409 int
15410 ix86_mode_after (int entity, int mode, rtx insn)
15411 {
15412 switch (entity)
15413 {
15414 case AVX_U128:
15415 return ix86_avx_u128_mode_after (mode, insn);
15416 case I387_TRUNC:
15417 case I387_FLOOR:
15418 case I387_CEIL:
15419 case I387_MASK_PM:
15420 return mode;
15421 default:
15422 gcc_unreachable ();
15423 }
15424 }
15425
15426 static int
15427 ix86_avx_u128_mode_entry (void)
15428 {
15429 tree arg;
15430
15431 /* The entry mode is set to AVX_U128_DIRTY if any 256bit modes
15432 are used in the function arguments. */
15433 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15434 arg = TREE_CHAIN (arg))
15435 {
15436 rtx incoming = DECL_INCOMING_RTL (arg);
15437
15438 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15439 return AVX_U128_DIRTY;
15440 }
15441
15442 return AVX_U128_CLEAN;
15443 }
15444
15445 /* Return a mode that ENTITY is assumed to be
15446 switched to at function entry. */
15447
15448 int
15449 ix86_mode_entry (int entity)
15450 {
15451 switch (entity)
15452 {
15453 case AVX_U128:
15454 return ix86_avx_u128_mode_entry ();
15455 case I387_TRUNC:
15456 case I387_FLOOR:
15457 case I387_CEIL:
15458 case I387_MASK_PM:
15459 return I387_CW_ANY;
15460 default:
15461 gcc_unreachable ();
15462 }
15463 }
15464
15465 static int
15466 ix86_avx_u128_mode_exit (void)
15467 {
15468 rtx reg = crtl->return_rtx;
15469
15470 /* The exit mode is set to AVX_U128_DIRTY if a 256bit mode is
15471 used for the function return register. */
15472 if (reg && ix86_check_avx256_register (&reg, NULL))
15473 return AVX_U128_DIRTY;
15474
15475 return AVX_U128_CLEAN;
15476 }
15477
15478 /* Return a mode that ENTITY is assumed to be
15479 switched to at function exit. */
15480
15481 int
15482 ix86_mode_exit (int entity)
15483 {
15484 switch (entity)
15485 {
15486 case AVX_U128:
15487 return ix86_avx_u128_mode_exit ();
15488 case I387_TRUNC:
15489 case I387_FLOOR:
15490 case I387_CEIL:
15491 case I387_MASK_PM:
15492 return I387_CW_ANY;
15493 default:
15494 gcc_unreachable ();
15495 }
15496 }
15497
15498 /* Output code to initialize the control word copies used by the trunc?f?i
15499 and rounding patterns. The current control word is saved to a stack
15500 slot, modified according to MODE, and the result stored to another slot. */
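/* For reference (not in the original source, but standard x87 behaviour):
   bits 10-11 of the control word (mask 0x0c00) select the rounding mode --
   00 nearest, 0x0400 down, 0x0800 up, 0x0c00 truncate -- and bit 5
   (0x0020) masks the precision exception, which is what the constants
   used below encode.  */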
15501
15502 static void
15503 emit_i387_cw_initialization (int mode)
15504 {
15505 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15506 rtx new_mode;
15507
15508 enum ix86_stack_slot slot;
15509
15510 rtx reg = gen_reg_rtx (HImode);
15511
15512 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15513 emit_move_insn (reg, copy_rtx (stored_mode));
15514
15515 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15516 || optimize_function_for_size_p (cfun))
15517 {
15518 switch (mode)
15519 {
15520 case I387_CW_TRUNC:
15521 /* round toward zero (truncate) */
15522 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15523 slot = SLOT_CW_TRUNC;
15524 break;
15525
15526 case I387_CW_FLOOR:
15527 /* round down toward -oo */
15528 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15529 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15530 slot = SLOT_CW_FLOOR;
15531 break;
15532
15533 case I387_CW_CEIL:
15534 /* round up toward +oo */
15535 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15536 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15537 slot = SLOT_CW_CEIL;
15538 break;
15539
15540 case I387_CW_MASK_PM:
15541 /* mask precision exception for nearbyint() */
15542 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15543 slot = SLOT_CW_MASK_PM;
15544 break;
15545
15546 default:
15547 gcc_unreachable ();
15548 }
15549 }
15550 else
15551 {
15552 switch (mode)
15553 {
15554 case I387_CW_TRUNC:
15555 /* round toward zero (truncate) */
15556 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15557 slot = SLOT_CW_TRUNC;
15558 break;
15559
15560 case I387_CW_FLOOR:
15561 /* round down toward -oo */
15562 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15563 slot = SLOT_CW_FLOOR;
15564 break;
15565
15566 case I387_CW_CEIL:
15567 /* round up toward +oo */
15568 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15569 slot = SLOT_CW_CEIL;
15570 break;
15571
15572 case I387_CW_MASK_PM:
15573 /* mask precision exception for nearbyint() */
15574 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15575 slot = SLOT_CW_MASK_PM;
15576 break;
15577
15578 default:
15579 gcc_unreachable ();
15580 }
15581 }
15582
15583 gcc_assert (slot < MAX_386_STACK_LOCALS);
15584
15585 new_mode = assign_386_stack_local (HImode, slot);
15586 emit_move_insn (new_mode, reg);
15587 }
15588
15589 /* Emit vzeroupper. */
15590
15591 void
15592 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15593 {
15594 int i;
15595
15596 /* Cancel automatic vzeroupper insertion if there are
15597 live call-saved SSE registers at the insertion point. */
15598
15599 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15600 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15601 return;
15602
15603 if (TARGET_64BIT)
15604 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15605 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15606 return;
15607
15608 emit_insn (gen_avx_vzeroupper ());
15609 }
15610
15611 /* Generate one or more insns to set ENTITY to MODE. */
15612
15613 void
15614 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15615 {
15616 switch (entity)
15617 {
15618 case AVX_U128:
15619 if (mode == AVX_U128_CLEAN)
15620 ix86_avx_emit_vzeroupper (regs_live);
15621 break;
15622 case I387_TRUNC:
15623 case I387_FLOOR:
15624 case I387_CEIL:
15625 case I387_MASK_PM:
15626 if (mode != I387_CW_ANY
15627 && mode != I387_CW_UNINITIALIZED)
15628 emit_i387_cw_initialization (mode);
15629 break;
15630 default:
15631 gcc_unreachable ();
15632 }
15633 }
15634
15635 /* Output code for INSN to convert a float to a signed int. OPERANDS
15636 are the insn operands. The output may be [HSD]Imode and the input
15637 operand may be [SDX]Fmode. */
15638
15639 const char *
15640 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15641 {
15642 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15643 int dimode_p = GET_MODE (operands[0]) == DImode;
15644 int round_mode = get_attr_i387_cw (insn);
15645
15646 /* Jump through a hoop or two for DImode, since the hardware has no
15647 non-popping instruction. We used to do this a different way, but
15648 that was somewhat fragile and broke with post-reload splitters. */
15649 if ((dimode_p || fisttp) && !stack_top_dies)
15650 output_asm_insn ("fld\t%y1", operands);
15651
15652 gcc_assert (STACK_TOP_P (operands[1]));
15653 gcc_assert (MEM_P (operands[0]));
15654 gcc_assert (GET_MODE (operands[1]) != TFmode);
15655
15656 if (fisttp)
15657 output_asm_insn ("fisttp%Z0\t%0", operands);
15658 else
15659 {
15660 if (round_mode != I387_CW_ANY)
15661 output_asm_insn ("fldcw\t%3", operands);
15662 if (stack_top_dies || dimode_p)
15663 output_asm_insn ("fistp%Z0\t%0", operands);
15664 else
15665 output_asm_insn ("fist%Z0\t%0", operands);
15666 if (round_mode != I387_CW_ANY)
15667 output_asm_insn ("fldcw\t%2", operands);
15668 }
15669
15670 return "";
15671 }
15672
15673 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15674 have the values zero or one, indicates the ffreep insn's operand
15675 from the OPERANDS array. */
15676
15677 static const char *
15678 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15679 {
15680 if (TARGET_USE_FFREEP)
15681 #ifdef HAVE_AS_IX86_FFREEP
15682 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15683 #else
15684 {
15685 static char retval[32];
15686 int regno = REGNO (operands[opno]);
15687
15688 gcc_assert (STACK_REGNO_P (regno));
15689
15690 regno -= FIRST_STACK_REG;
15691
15692 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15693 return retval;
15694 }
15695 #endif
15696
15697 return opno ? "fstp\t%y1" : "fstp\t%y0";
15698 }
15699
15700
15701 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15702 should be used. UNORDERED_P is true when fucom should be used. */
15703
15704 const char *
15705 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15706 {
15707 int stack_top_dies;
15708 rtx cmp_op0, cmp_op1;
15709 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15710
15711 if (eflags_p)
15712 {
15713 cmp_op0 = operands[0];
15714 cmp_op1 = operands[1];
15715 }
15716 else
15717 {
15718 cmp_op0 = operands[1];
15719 cmp_op1 = operands[2];
15720 }
15721
15722 if (is_sse)
15723 {
15724 if (GET_MODE (operands[0]) == SFmode)
15725 if (unordered_p)
15726 return "%vucomiss\t{%1, %0|%0, %1}";
15727 else
15728 return "%vcomiss\t{%1, %0|%0, %1}";
15729 else
15730 if (unordered_p)
15731 return "%vucomisd\t{%1, %0|%0, %1}";
15732 else
15733 return "%vcomisd\t{%1, %0|%0, %1}";
15734 }
15735
15736 gcc_assert (STACK_TOP_P (cmp_op0));
15737
15738 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15739
15740 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15741 {
15742 if (stack_top_dies)
15743 {
15744 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15745 return output_387_ffreep (operands, 1);
15746 }
15747 else
15748 return "ftst\n\tfnstsw\t%0";
15749 }
15750
15751 if (STACK_REG_P (cmp_op1)
15752 && stack_top_dies
15753 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15754 && REGNO (cmp_op1) != FIRST_STACK_REG)
15755 {
15756 /* If both the top of the 387 stack and the other operand (also
15757 a stack register) die, then this must be a `fcompp' float
15758 compare. */
15759
15760 if (eflags_p)
15761 {
15762 /* There is no double popping fcomi variant. Fortunately,
15763 eflags is immune to the fstp's cc clobbering. */
15764 if (unordered_p)
15765 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15766 else
15767 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15768 return output_387_ffreep (operands, 0);
15769 }
15770 else
15771 {
15772 if (unordered_p)
15773 return "fucompp\n\tfnstsw\t%0";
15774 else
15775 return "fcompp\n\tfnstsw\t%0";
15776 }
15777 }
15778 else
15779 {
15780 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15781
15782 static const char * const alt[16] =
15783 {
15784 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15785 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15786 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15787 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15788
15789 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15790 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15791 NULL,
15792 NULL,
15793
15794 "fcomi\t{%y1, %0|%0, %y1}",
15795 "fcomip\t{%y1, %0|%0, %y1}",
15796 "fucomi\t{%y1, %0|%0, %y1}",
15797 "fucomip\t{%y1, %0|%0, %y1}",
15798
15799 NULL,
15800 NULL,
15801 NULL,
15802 NULL
15803 };
15804
15805 int mask;
15806 const char *ret;
15807
15808 mask = eflags_p << 3;
15809 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15810 mask |= unordered_p << 1;
15811 mask |= stack_top_dies;
15812
15813 gcc_assert (mask < 16);
15814 ret = alt[mask];
15815 gcc_assert (ret);
15816
15817 return ret;
15818 }
15819 }
15820
15821 void
15822 ix86_output_addr_vec_elt (FILE *file, int value)
15823 {
15824 const char *directive = ASM_LONG;
15825
15826 #ifdef ASM_QUAD
15827 if (TARGET_LP64)
15828 directive = ASM_QUAD;
15829 #else
15830 gcc_assert (!TARGET_64BIT);
15831 #endif
15832
15833 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15834 }
15835
15836 void
15837 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15838 {
15839 const char *directive = ASM_LONG;
15840
15841 #ifdef ASM_QUAD
15842 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15843 directive = ASM_QUAD;
15844 #else
15845 gcc_assert (!TARGET_64BIT);
15846 #endif
15847 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15848 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15849 fprintf (file, "%s%s%d-%s%d\n",
15850 directive, LPREFIX, value, LPREFIX, rel);
15851 else if (HAVE_AS_GOTOFF_IN_DATA)
15852 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15853 #if TARGET_MACHO
15854 else if (TARGET_MACHO)
15855 {
15856 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15857 machopic_output_function_base_name (file);
15858 putc ('\n', file);
15859 }
15860 #endif
15861 else
15862 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15863 GOT_SYMBOL_NAME, LPREFIX, value);
15864 }
15865 \f
15866 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15867 for the target. */
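/* Note (not in the original source): "xor %eax, %eax" is the shorter
   encoding but clobbers the flags, which is why the xor variant below is
   wrapped in a PARALLEL with a CLOBBER of FLAGS_REG; "mov $0, %eax"
   leaves the flags untouched.  */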
15868
15869 void
15870 ix86_expand_clear (rtx dest)
15871 {
15872 rtx tmp;
15873
15874 /* We play register width games, which are only valid after reload. */
15875 gcc_assert (reload_completed);
15876
15877 /* Avoid HImode and its attendant prefix byte. */
15878 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15879 dest = gen_rtx_REG (SImode, REGNO (dest));
15880 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15881
15882 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15883 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15884 {
15885 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15886 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15887 }
15888
15889 emit_insn (tmp);
15890 }
15891
15892 /* X is an unchanging MEM. If it is a constant pool reference, return
15893 the constant pool rtx, else NULL. */
15894
15895 rtx
15896 maybe_get_pool_constant (rtx x)
15897 {
15898 x = ix86_delegitimize_address (XEXP (x, 0));
15899
15900 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15901 return get_pool_constant (x);
15902
15903 return NULL_RTX;
15904 }
15905
15906 void
15907 ix86_expand_move (enum machine_mode mode, rtx operands[])
15908 {
15909 rtx op0, op1;
15910 enum tls_model model;
15911
15912 op0 = operands[0];
15913 op1 = operands[1];
15914
15915 if (GET_CODE (op1) == SYMBOL_REF)
15916 {
15917 model = SYMBOL_REF_TLS_MODEL (op1);
15918 if (model)
15919 {
15920 op1 = legitimize_tls_address (op1, model, true);
15921 op1 = force_operand (op1, op0);
15922 if (op1 == op0)
15923 return;
15924 op1 = convert_to_mode (mode, op1, 1);
15925 }
15926 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15927 && SYMBOL_REF_DLLIMPORT_P (op1))
15928 op1 = legitimize_dllimport_symbol (op1, false);
15929 }
15930 else if (GET_CODE (op1) == CONST
15931 && GET_CODE (XEXP (op1, 0)) == PLUS
15932 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15933 {
15934 rtx addend = XEXP (XEXP (op1, 0), 1);
15935 rtx symbol = XEXP (XEXP (op1, 0), 0);
15936 rtx tmp = NULL;
15937
15938 model = SYMBOL_REF_TLS_MODEL (symbol);
15939 if (model)
15940 tmp = legitimize_tls_address (symbol, model, true);
15941 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15942 && SYMBOL_REF_DLLIMPORT_P (symbol))
15943 tmp = legitimize_dllimport_symbol (symbol, true);
15944
15945 if (tmp)
15946 {
15947 tmp = force_operand (tmp, NULL);
15948 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15949 op0, 1, OPTAB_DIRECT);
15950 if (tmp == op0)
15951 return;
15952 op1 = convert_to_mode (mode, tmp, 1);
15953 }
15954 }
15955
15956 if ((flag_pic || MACHOPIC_INDIRECT)
15957 && symbolic_operand (op1, mode))
15958 {
15959 if (TARGET_MACHO && !TARGET_64BIT)
15960 {
15961 #if TARGET_MACHO
15962 /* dynamic-no-pic */
15963 if (MACHOPIC_INDIRECT)
15964 {
15965 rtx temp = ((reload_in_progress
15966 || ((op0 && REG_P (op0))
15967 && mode == Pmode))
15968 ? op0 : gen_reg_rtx (Pmode));
15969 op1 = machopic_indirect_data_reference (op1, temp);
15970 if (MACHOPIC_PURE)
15971 op1 = machopic_legitimize_pic_address (op1, mode,
15972 temp == op1 ? 0 : temp);
15973 }
15974 if (op0 != op1 && GET_CODE (op0) != MEM)
15975 {
15976 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15977 emit_insn (insn);
15978 return;
15979 }
15980 if (GET_CODE (op0) == MEM)
15981 op1 = force_reg (Pmode, op1);
15982 else
15983 {
15984 rtx temp = op0;
15985 if (GET_CODE (temp) != REG)
15986 temp = gen_reg_rtx (Pmode);
15987 temp = legitimize_pic_address (op1, temp);
15988 if (temp == op0)
15989 return;
15990 op1 = temp;
15991 }
15992 /* dynamic-no-pic */
15993 #endif
15994 }
15995 else
15996 {
15997 if (MEM_P (op0))
15998 op1 = force_reg (mode, op1);
15999 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16000 {
16001 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16002 op1 = legitimize_pic_address (op1, reg);
16003 if (op0 == op1)
16004 return;
16005 op1 = convert_to_mode (mode, op1, 1);
16006 }
16007 }
16008 }
16009 else
16010 {
16011 if (MEM_P (op0)
16012 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16013 || !push_operand (op0, mode))
16014 && MEM_P (op1))
16015 op1 = force_reg (mode, op1);
16016
16017 if (push_operand (op0, mode)
16018 && ! general_no_elim_operand (op1, mode))
16019 op1 = copy_to_mode_reg (mode, op1);
16020
16021 /* Force large constants in 64bit compilation into a register
16022 to get them CSEed. */
16023 if (can_create_pseudo_p ()
16024 && (mode == DImode) && TARGET_64BIT
16025 && immediate_operand (op1, mode)
16026 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16027 && !register_operand (op0, mode)
16028 && optimize)
16029 op1 = copy_to_mode_reg (mode, op1);
16030
16031 if (can_create_pseudo_p ()
16032 && FLOAT_MODE_P (mode)
16033 && GET_CODE (op1) == CONST_DOUBLE)
16034 {
16035 /* If we are loading a floating point constant to a register,
16036 force the value to memory now, since we'll get better code
16037 out of the back end. */
16038
16039 op1 = validize_mem (force_const_mem (mode, op1));
16040 if (!register_operand (op0, mode))
16041 {
16042 rtx temp = gen_reg_rtx (mode);
16043 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16044 emit_move_insn (op0, temp);
16045 return;
16046 }
16047 }
16048 }
16049
16050 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16051 }
16052
16053 void
16054 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16055 {
16056 rtx op0 = operands[0], op1 = operands[1];
16057 unsigned int align = GET_MODE_ALIGNMENT (mode);
16058
16059 /* Force constants other than zero into memory. We do not know how
16060 the instructions used to build constants modify the upper 64 bits
16061 of the register; once we have that information we may be able
16062 to handle some of them more efficiently. */
16063 if (can_create_pseudo_p ()
16064 && register_operand (op0, mode)
16065 && (CONSTANT_P (op1)
16066 || (GET_CODE (op1) == SUBREG
16067 && CONSTANT_P (SUBREG_REG (op1))))
16068 && !standard_sse_constant_p (op1))
16069 op1 = validize_mem (force_const_mem (mode, op1));
16070
16071 /* We need to check memory alignment for SSE modes since attributes
16072 can make operands unaligned. */
16073 if (can_create_pseudo_p ()
16074 && SSE_REG_MODE_P (mode)
16075 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16076 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16077 {
16078 rtx tmp[2];
16079
16080 /* ix86_expand_vector_move_misalign() does not like constants ... */
16081 if (CONSTANT_P (op1)
16082 || (GET_CODE (op1) == SUBREG
16083 && CONSTANT_P (SUBREG_REG (op1))))
16084 op1 = validize_mem (force_const_mem (mode, op1));
16085
16086 /* ... nor both arguments in memory. */
16087 if (!register_operand (op0, mode)
16088 && !register_operand (op1, mode))
16089 op1 = force_reg (mode, op1);
16090
16091 tmp[0] = op0; tmp[1] = op1;
16092 ix86_expand_vector_move_misalign (mode, tmp);
16093 return;
16094 }
16095
16096 /* If neither operand is a register, force operand1 into a register. */
16097 if (can_create_pseudo_p ()
16098 && !register_operand (op0, mode)
16099 && !register_operand (op1, mode))
16100 {
16101 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16102 return;
16103 }
16104
16105 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16106 }
16107
16108 /* Split 32-byte AVX unaligned load and store if needed. */
16109
16110 static void
16111 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16112 {
16113 rtx m;
16114 rtx (*extract) (rtx, rtx, rtx);
16115 rtx (*load_unaligned) (rtx, rtx);
16116 rtx (*store_unaligned) (rtx, rtx);
16117 enum machine_mode mode;
16118
16119 switch (GET_MODE (op0))
16120 {
16121 default:
16122 gcc_unreachable ();
16123 case V32QImode:
16124 extract = gen_avx_vextractf128v32qi;
16125 load_unaligned = gen_avx_loaddqu256;
16126 store_unaligned = gen_avx_storedqu256;
16127 mode = V16QImode;
16128 break;
16129 case V8SFmode:
16130 extract = gen_avx_vextractf128v8sf;
16131 load_unaligned = gen_avx_loadups256;
16132 store_unaligned = gen_avx_storeups256;
16133 mode = V4SFmode;
16134 break;
16135 case V4DFmode:
16136 extract = gen_avx_vextractf128v4df;
16137 load_unaligned = gen_avx_loadupd256;
16138 store_unaligned = gen_avx_storeupd256;
16139 mode = V2DFmode;
16140 break;
16141 }
16142
16143 if (MEM_P (op1))
16144 {
16145 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16146 {
16147 rtx r = gen_reg_rtx (mode);
16148 m = adjust_address (op1, mode, 0);
16149 emit_move_insn (r, m);
16150 m = adjust_address (op1, mode, 16);
16151 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16152 emit_move_insn (op0, r);
16153 }
16154 else
16155 emit_insn (load_unaligned (op0, op1));
16156 }
16157 else if (MEM_P (op0))
16158 {
16159 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16160 {
16161 m = adjust_address (op0, mode, 0);
16162 emit_insn (extract (m, op1, const0_rtx));
16163 m = adjust_address (op0, mode, 16);
16164 emit_insn (extract (m, op1, const1_rtx));
16165 }
16166 else
16167 emit_insn (store_unaligned (op0, op1));
16168 }
16169 else
16170 gcc_unreachable ();
16171 }
16172
16173 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16174 straight to ix86_expand_vector_move. */
16175 /* Code generation for scalar reg-reg moves of single and double precision data:
16176 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
16177 movaps reg, reg
16178 else
16179 movss reg, reg
16180 if (x86_sse_partial_reg_dependency == true)
16181 movapd reg, reg
16182 else
16183 movsd reg, reg
16184
16185 Code generation for scalar loads of double precision data:
16186 if (x86_sse_split_regs == true)
16187 movlpd mem, reg (gas syntax)
16188 else
16189 movsd mem, reg
16190
16191 Code generation for unaligned packed loads of single precision data
16192 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16193 if (x86_sse_unaligned_move_optimal)
16194 movups mem, reg
16195
16196 if (x86_sse_partial_reg_dependency == true)
16197 {
16198 xorps reg, reg
16199 movlps mem, reg
16200 movhps mem+8, reg
16201 }
16202 else
16203 {
16204 movlps mem, reg
16205 movhps mem+8, reg
16206 }
16207
16208 Code generation for unaligned packed loads of double precision data
16209 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16210 if (x86_sse_unaligned_move_optimal)
16211 movupd mem, reg
16212
16213 if (x86_sse_split_regs == true)
16214 {
16215 movlpd mem, reg
16216 movhpd mem+8, reg
16217 }
16218 else
16219 {
16220 movsd mem, reg
16221 movhpd mem+8, reg
16222 }
16223 */
16224
16225 void
16226 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16227 {
16228 rtx op0, op1, m;
16229
16230 op0 = operands[0];
16231 op1 = operands[1];
16232
16233 if (TARGET_AVX
16234 && GET_MODE_SIZE (mode) == 32)
16235 {
16236 switch (GET_MODE_CLASS (mode))
16237 {
16238 case MODE_VECTOR_INT:
16239 case MODE_INT:
16240 op0 = gen_lowpart (V32QImode, op0);
16241 op1 = gen_lowpart (V32QImode, op1);
16242 /* FALLTHRU */
16243
16244 case MODE_VECTOR_FLOAT:
16245 ix86_avx256_split_vector_move_misalign (op0, op1);
16246 break;
16247
16248 default:
16249 gcc_unreachable ();
16250 }
16251
16252 return;
16253 }
16254
16255 if (MEM_P (op1))
16256 {
16257 /* ??? If we have typed data, then it would appear that using
16258 movdqu is the only way to get unaligned data loaded with
16259 integer type. */
16260 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16261 {
16262 op0 = gen_lowpart (V16QImode, op0);
16263 op1 = gen_lowpart (V16QImode, op1);
16264 /* We will eventually emit movups based on insn attributes. */
16265 emit_insn (gen_sse2_loaddqu (op0, op1));
16266 }
16267 else if (TARGET_SSE2 && mode == V2DFmode)
16268 {
16269 rtx zero;
16270
16271 if (TARGET_AVX
16272 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16273 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16274 || optimize_function_for_size_p (cfun))
16275 {
16276 /* We will eventually emit movups based on insn attributes. */
16277 emit_insn (gen_sse2_loadupd (op0, op1));
16278 return;
16279 }
16280
16281 /* When SSE registers are split into halves, we can avoid
16282 writing to the top half twice. */
16283 if (TARGET_SSE_SPLIT_REGS)
16284 {
16285 emit_clobber (op0);
16286 zero = op0;
16287 }
16288 else
16289 {
16290 /* ??? Not sure about the best option for the Intel chips.
16291 The following would seem to satisfy; the register is
16292 entirely cleared, breaking the dependency chain. We
16293 then store to the upper half, with a dependency depth
16294 of one. A rumor has it that Intel recommends two movsd
16295 followed by an unpacklpd, but this is unconfirmed. And
16296 given that the dependency depth of the unpacklpd would
16297 still be one, I'm not sure why this would be better. */
16298 zero = CONST0_RTX (V2DFmode);
16299 }
16300
16301 m = adjust_address (op1, DFmode, 0);
16302 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16303 m = adjust_address (op1, DFmode, 8);
16304 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16305 }
16306 else
16307 {
16308 if (TARGET_AVX
16309 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16310 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16311 || optimize_function_for_size_p (cfun))
16312 {
16313 op0 = gen_lowpart (V4SFmode, op0);
16314 op1 = gen_lowpart (V4SFmode, op1);
16315 emit_insn (gen_sse_loadups (op0, op1));
16316 return;
16317 }
16318
16319 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16320 emit_move_insn (op0, CONST0_RTX (mode));
16321 else
16322 emit_clobber (op0);
16323
16324 if (mode != V4SFmode)
16325 op0 = gen_lowpart (V4SFmode, op0);
16326
16327 m = adjust_address (op1, V2SFmode, 0);
16328 emit_insn (gen_sse_loadlps (op0, op0, m));
16329 m = adjust_address (op1, V2SFmode, 8);
16330 emit_insn (gen_sse_loadhps (op0, op0, m));
16331 }
16332 }
16333 else if (MEM_P (op0))
16334 {
16335 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16336 {
16337 op0 = gen_lowpart (V16QImode, op0);
16338 op1 = gen_lowpart (V16QImode, op1);
16339 /* We will eventually emit movups based on insn attributes. */
16340 emit_insn (gen_sse2_storedqu (op0, op1));
16341 }
16342 else if (TARGET_SSE2 && mode == V2DFmode)
16343 {
16344 if (TARGET_AVX
16345 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16346 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16347 || optimize_function_for_size_p (cfun))
16348 /* We will eventually emit movups based on insn attributes. */
16349 emit_insn (gen_sse2_storeupd (op0, op1));
16350 else
16351 {
16352 m = adjust_address (op0, DFmode, 0);
16353 emit_insn (gen_sse2_storelpd (m, op1));
16354 m = adjust_address (op0, DFmode, 8);
16355 emit_insn (gen_sse2_storehpd (m, op1));
16356 }
16357 }
16358 else
16359 {
16360 if (mode != V4SFmode)
16361 op1 = gen_lowpart (V4SFmode, op1);
16362
16363 if (TARGET_AVX
16364 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16365 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16366 || optimize_function_for_size_p (cfun))
16367 {
16368 op0 = gen_lowpart (V4SFmode, op0);
16369 emit_insn (gen_sse_storeups (op0, op1));
16370 }
16371 else
16372 {
16373 m = adjust_address (op0, V2SFmode, 0);
16374 emit_insn (gen_sse_storelps (m, op1));
16375 m = adjust_address (op0, V2SFmode, 8);
16376 emit_insn (gen_sse_storehps (m, op1));
16377 }
16378 }
16379 }
16380 else
16381 gcc_unreachable ();
16382 }
16383
16384 /* Expand a push in MODE. This is some mode for which we do not support
16385 proper push instructions, at least from the registers that we expect
16386 the value to live in. */
16387
16388 void
16389 ix86_expand_push (enum machine_mode mode, rtx x)
16390 {
16391 rtx tmp;
16392
16393 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16394 GEN_INT (-GET_MODE_SIZE (mode)),
16395 stack_pointer_rtx, 1, OPTAB_DIRECT);
16396 if (tmp != stack_pointer_rtx)
16397 emit_move_insn (stack_pointer_rtx, tmp);
16398
16399 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16400
16401 /* When we push an operand onto the stack, it has to be aligned at least
16402 at the function argument boundary. However, since we don't have
16403 the argument type, we can't determine the actual argument
16404 boundary. */
16405 emit_move_insn (tmp, x);
16406 }
16407
16408 /* Helper function of ix86_fixup_binary_operands to canonicalize
16409 operand order. Returns true if the operands should be swapped. */
16410
16411 static bool
16412 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16413 rtx operands[])
16414 {
16415 rtx dst = operands[0];
16416 rtx src1 = operands[1];
16417 rtx src2 = operands[2];
16418
16419 /* If the operation is not commutative, we can't do anything. */
16420 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16421 return false;
16422
16423 /* Highest priority is that src1 should match dst. */
16424 if (rtx_equal_p (dst, src1))
16425 return false;
16426 if (rtx_equal_p (dst, src2))
16427 return true;
16428
16429 /* Next highest priority is that immediate constants come second. */
16430 if (immediate_operand (src2, mode))
16431 return false;
16432 if (immediate_operand (src1, mode))
16433 return true;
16434
16435 /* Lowest priority is that memory references should come second. */
16436 if (MEM_P (src2))
16437 return false;
16438 if (MEM_P (src1))
16439 return true;
16440
16441 return false;
16442 }
16443
16444
16445 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16446 destination to use for the operation. If different from the true
16447 destination in operands[0], a copy operation will be required. */
16448
16449 rtx
16450 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16451 rtx operands[])
16452 {
16453 rtx dst = operands[0];
16454 rtx src1 = operands[1];
16455 rtx src2 = operands[2];
16456
16457 /* Canonicalize operand order. */
16458 if (ix86_swap_binary_operands_p (code, mode, operands))
16459 {
16460 rtx temp;
16461
16462 /* It is invalid to swap operands of different modes. */
16463 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16464
16465 temp = src1;
16466 src1 = src2;
16467 src2 = temp;
16468 }
16469
16470 /* Both source operands cannot be in memory. */
16471 if (MEM_P (src1) && MEM_P (src2))
16472 {
16473 /* Optimization: Only read from memory once. */
16474 if (rtx_equal_p (src1, src2))
16475 {
16476 src2 = force_reg (mode, src2);
16477 src1 = src2;
16478 }
16479 else
16480 src2 = force_reg (mode, src2);
16481 }
16482
16483 /* If the destination is memory, and we do not have matching source
16484 operands, do things in registers. */
16485 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16486 dst = gen_reg_rtx (mode);
16487
16488 /* Source 1 cannot be a constant. */
16489 if (CONSTANT_P (src1))
16490 src1 = force_reg (mode, src1);
16491
16492 /* Source 1 cannot be a non-matching memory. */
16493 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16494 src1 = force_reg (mode, src1);
16495
16496 /* Improve address combine. */
16497 if (code == PLUS
16498 && GET_MODE_CLASS (mode) == MODE_INT
16499 && MEM_P (src2))
16500 src2 = force_reg (mode, src2);
16501
16502 operands[1] = src1;
16503 operands[2] = src2;
16504 return dst;
16505 }
16506
16507 /* Similarly, but assume that the destination has already been
16508 set up properly. */
16509
16510 void
16511 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16512 enum machine_mode mode, rtx operands[])
16513 {
16514 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16515 gcc_assert (dst == operands[0]);
16516 }
16517
16518 /* Attempt to expand a binary operator. Make the expansion closer to the
16519 actual machine than just general_operand, which would allow 3 separate
16520 memory references (one output, two input) in a single insn. */
16521
16522 void
16523 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16524 rtx operands[])
16525 {
16526 rtx src1, src2, dst, op, clob;
16527
16528 dst = ix86_fixup_binary_operands (code, mode, operands);
16529 src1 = operands[1];
16530 src2 = operands[2];
16531
16532 /* Emit the instruction. */
16533
16534 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16535 if (reload_in_progress)
16536 {
16537 /* Reload doesn't know about the flags register, and doesn't know that
16538 it doesn't want to clobber it. We can only do this with PLUS. */
16539 gcc_assert (code == PLUS);
16540 emit_insn (op);
16541 }
16542 else if (reload_completed
16543 && code == PLUS
16544 && !rtx_equal_p (dst, src1))
16545 {
16546 /* This is going to be an LEA; avoid splitting it later. */
16547 emit_insn (op);
16548 }
16549 else
16550 {
16551 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16552 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16553 }
16554
16555 /* Fix up the destination if needed. */
16556 if (dst != operands[0])
16557 emit_move_insn (operands[0], dst);
16558 }
16559
16560 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16561 the given OPERANDS. */
16562
16563 void
16564 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16565 rtx operands[])
16566 {
16567 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16568 if (GET_CODE (operands[1]) == SUBREG)
16569 {
16570 op1 = operands[1];
16571 op2 = operands[2];
16572 }
16573 else if (GET_CODE (operands[2]) == SUBREG)
16574 {
16575 op1 = operands[2];
16576 op2 = operands[1];
16577 }
16578 /* Optimize (__m128i) d | (__m128i) e and similar code
16579 when d and e are float vectors into float vector logical
16580 insn. In C/C++ without using intrinsics there is no other way
16581 to express vector logical operation on float vectors than
16582 to cast them temporarily to integer vectors. */
16583 if (op1
16584 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16585 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16586 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16587 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16588 && SUBREG_BYTE (op1) == 0
16589 && (GET_CODE (op2) == CONST_VECTOR
16590 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16591 && SUBREG_BYTE (op2) == 0))
16592 && can_create_pseudo_p ())
16593 {
16594 rtx dst;
16595 switch (GET_MODE (SUBREG_REG (op1)))
16596 {
16597 case V4SFmode:
16598 case V8SFmode:
16599 case V2DFmode:
16600 case V4DFmode:
16601 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16602 if (GET_CODE (op2) == CONST_VECTOR)
16603 {
16604 op2 = gen_lowpart (GET_MODE (dst), op2);
16605 op2 = force_reg (GET_MODE (dst), op2);
16606 }
16607 else
16608 {
16609 op1 = operands[1];
16610 op2 = SUBREG_REG (operands[2]);
16611 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16612 op2 = force_reg (GET_MODE (dst), op2);
16613 }
16614 op1 = SUBREG_REG (op1);
16615 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16616 op1 = force_reg (GET_MODE (dst), op1);
16617 emit_insn (gen_rtx_SET (VOIDmode, dst,
16618 gen_rtx_fmt_ee (code, GET_MODE (dst),
16619 op1, op2)));
16620 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16621 return;
16622 default:
16623 break;
16624 }
16625 }
16626 if (!nonimmediate_operand (operands[1], mode))
16627 operands[1] = force_reg (mode, operands[1]);
16628 if (!nonimmediate_operand (operands[2], mode))
16629 operands[2] = force_reg (mode, operands[2]);
16630 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16631 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16632 gen_rtx_fmt_ee (code, mode, operands[1],
16633 operands[2])));
16634 }
16635
16636 /* Return TRUE or FALSE depending on whether the binary operator meets the
16637 appropriate constraints. */
16638
16639 bool
16640 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16641 rtx operands[3])
16642 {
16643 rtx dst = operands[0];
16644 rtx src1 = operands[1];
16645 rtx src2 = operands[2];
16646
16647 /* Both source operands cannot be in memory. */
16648 if (MEM_P (src1) && MEM_P (src2))
16649 return false;
16650
16651 /* Canonicalize operand order for commutative operators. */
16652 if (ix86_swap_binary_operands_p (code, mode, operands))
16653 {
16654 rtx temp = src1;
16655 src1 = src2;
16656 src2 = temp;
16657 }
16658
16659 /* If the destination is memory, we must have a matching source operand. */
16660 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16661 return false;
16662
16663 /* Source 1 cannot be a constant. */
16664 if (CONSTANT_P (src1))
16665 return false;
16666
16667 /* Source 1 cannot be a non-matching memory. */
16668 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16669 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16670 return (code == AND
16671 && (mode == HImode
16672 || mode == SImode
16673 || (TARGET_64BIT && mode == DImode))
16674 && satisfies_constraint_L (src2));
16675
16676 return true;
16677 }
16678
16679 /* Attempt to expand a unary operator. Make the expansion closer to the
16680 actual machine than just general_operand, which would allow 2 separate
16681 memory references (one output, one input) in a single insn. */
16682
16683 void
16684 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16685 rtx operands[])
16686 {
16687 int matching_memory;
16688 rtx src, dst, op, clob;
16689
16690 dst = operands[0];
16691 src = operands[1];
16692
16693 /* If the destination is memory, and we do not have matching source
16694 operands, do things in registers. */
16695 matching_memory = 0;
16696 if (MEM_P (dst))
16697 {
16698 if (rtx_equal_p (dst, src))
16699 matching_memory = 1;
16700 else
16701 dst = gen_reg_rtx (mode);
16702 }
16703
16704 /* When source operand is memory, destination must match. */
16705 if (MEM_P (src) && !matching_memory)
16706 src = force_reg (mode, src);
16707
16708 /* Emit the instruction. */
16709
16710 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16711 if (reload_in_progress || code == NOT)
16712 {
16713 /* Reload doesn't know about the flags register, and doesn't know that
16714 it doesn't want to clobber it. */
16715 gcc_assert (code == NOT);
16716 emit_insn (op);
16717 }
16718 else
16719 {
16720 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16721 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16722 }
16723
16724 /* Fix up the destination if needed. */
16725 if (dst != operands[0])
16726 emit_move_insn (operands[0], dst);
16727 }
16728
16729 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16730 divisor are within the range [0-255]. */
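/* Roughly: emit a test of (dividend | divisor) against the byte range; if
   both values fit in 8 bits, branch to a QImode unsigned divide (quotient
   in AL, remainder in AH), otherwise fall through to the full-width
   divide.  */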
16731
16732 void
16733 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16734 bool signed_p)
16735 {
16736 rtx end_label, qimode_label;
16737 rtx insn, div, mod;
16738 rtx scratch, tmp0, tmp1, tmp2;
16739 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16740 rtx (*gen_zero_extend) (rtx, rtx);
16741 rtx (*gen_test_ccno_1) (rtx, rtx);
16742
16743 switch (mode)
16744 {
16745 case SImode:
16746 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16747 gen_test_ccno_1 = gen_testsi_ccno_1;
16748 gen_zero_extend = gen_zero_extendqisi2;
16749 break;
16750 case DImode:
16751 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16752 gen_test_ccno_1 = gen_testdi_ccno_1;
16753 gen_zero_extend = gen_zero_extendqidi2;
16754 break;
16755 default:
16756 gcc_unreachable ();
16757 }
16758
16759 end_label = gen_label_rtx ();
16760 qimode_label = gen_label_rtx ();
16761
16762 scratch = gen_reg_rtx (mode);
16763
16764 /* Use 8bit unsigned divmod if dividend and divisor are within
16765 the range [0-255]. */
16766 emit_move_insn (scratch, operands[2]);
16767 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16768 scratch, 1, OPTAB_DIRECT);
16769 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16770 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16771 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16772 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16773 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16774 pc_rtx);
16775 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16776 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16777 JUMP_LABEL (insn) = qimode_label;
16778
16779 /* Generate the original signed/unsigned divmod. */
16780 div = gen_divmod4_1 (operands[0], operands[1],
16781 operands[2], operands[3]);
16782 emit_insn (div);
16783
16784 /* Branch to the end. */
16785 emit_jump_insn (gen_jump (end_label));
16786 emit_barrier ();
16787
16788 /* Generate 8bit unsigned divide. */
16789 emit_label (qimode_label);
16790 /* Don't use operands[0] for result of 8bit divide since not all
16791 registers support QImode ZERO_EXTRACT. */
16792 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16793 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16794 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16795 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16796
16797 if (signed_p)
16798 {
16799 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16800 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16801 }
16802 else
16803 {
16804 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16805 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16806 }
16807
16808 /* Extract remainder from AH. */
16809 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16810 if (REG_P (operands[1]))
16811 insn = emit_move_insn (operands[1], tmp1);
16812 else
16813 {
16814 /* Need a new scratch register since the old one has result
16815 of 8bit divide. */
16816 scratch = gen_reg_rtx (mode);
16817 emit_move_insn (scratch, tmp1);
16818 insn = emit_move_insn (operands[1], scratch);
16819 }
16820 set_unique_reg_note (insn, REG_EQUAL, mod);
16821
16822 /* Zero extend quotient from AL. */
16823 tmp1 = gen_lowpart (QImode, tmp0);
16824 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16825 set_unique_reg_note (insn, REG_EQUAL, div);
16826
16827 emit_label (end_label);
16828 }
16829
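/* LEA_MAX_STALL is the def-to-lea distance (in cycles) at or beyond which
   an AGU stall is no longer assumed; LEA_SEARCH_THRESHOLD limits, in
   half-cycles, how far the code below scans for defs and uses when
   estimating that distance.  */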
16830 #define LEA_MAX_STALL (3)
16831 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16832
16833 /* Increase given DISTANCE in half-cycles according to
16834 dependencies between PREV and NEXT instructions.
16835 Add 1 half-cycle if there is no dependency and
16836 go to the next cycle if there is some dependency. */
16837
16838 static unsigned int
16839 increase_distance (rtx prev, rtx next, unsigned int distance)
16840 {
16841 df_ref *use_rec;
16842 df_ref *def_rec;
16843
16844 if (!prev || !next)
16845 return distance + (distance & 1) + 2;
16846
16847 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16848 return distance + 1;
16849
16850 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16851 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16852 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16853 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16854 return distance + (distance & 1) + 2;
16855
16856 return distance + 1;
16857 }
16858
16859 /* Function checks if instruction INSN defines register number
16860 REGNO1 or REGNO2. */
16861
16862 static bool
16863 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16864 rtx insn)
16865 {
16866 df_ref *def_rec;
16867
16868 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16869 if (DF_REF_REG_DEF_P (*def_rec)
16870 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16871 && (regno1 == DF_REF_REGNO (*def_rec)
16872 || regno2 == DF_REF_REGNO (*def_rec)))
16873 {
16874 return true;
16875 }
16876
16877 return false;
16878 }
16879
16880 /* Function checks if instruction INSN uses register number
16881 REGNO as part of an address expression. */
16882
16883 static bool
16884 insn_uses_reg_mem (unsigned int regno, rtx insn)
16885 {
16886 df_ref *use_rec;
16887
16888 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16889 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16890 return true;
16891
16892 return false;
16893 }
16894
16895 /* Search backward for non-agu definition of register number REGNO1
16896 or register number REGNO2 in basic block starting from instruction
16897 START up to head of basic block or instruction INSN.
16898
16899 The function puts true into *FOUND if a definition was found
16900 and false otherwise.
16901 
16902 The distance in half-cycles between START and the found instruction or
16903 the head of the BB is added to DISTANCE and returned. */
16904
16905 static int
16906 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16907 rtx insn, int distance,
16908 rtx start, bool *found)
16909 {
16910 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16911 rtx prev = start;
16912 rtx next = NULL;
16913
16914 *found = false;
16915
16916 while (prev
16917 && prev != insn
16918 && distance < LEA_SEARCH_THRESHOLD)
16919 {
16920 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16921 {
16922 distance = increase_distance (prev, next, distance);
16923 if (insn_defines_reg (regno1, regno2, prev))
16924 {
16925 if (recog_memoized (prev) < 0
16926 || get_attr_type (prev) != TYPE_LEA)
16927 {
16928 *found = true;
16929 return distance;
16930 }
16931 }
16932
16933 next = prev;
16934 }
16935 if (prev == BB_HEAD (bb))
16936 break;
16937
16938 prev = PREV_INSN (prev);
16939 }
16940
16941 return distance;
16942 }
16943
16944 /* Search backward for non-agu definition of register number REGNO1
16945 or register number REGNO2 in INSN's basic block until we
16946 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16947 2. Reach a neighbouring BB boundary, or
16948 3. Reach an agu definition.
16949 Returns the distance between the non-agu definition point and INSN.
16950 If no definition point, returns -1. */
16951
16952 static int
16953 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16954 rtx insn)
16955 {
16956 basic_block bb = BLOCK_FOR_INSN (insn);
16957 int distance = 0;
16958 bool found = false;
16959
16960 if (insn != BB_HEAD (bb))
16961 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16962 distance, PREV_INSN (insn),
16963 &found);
16964
16965 if (!found && distance < LEA_SEARCH_THRESHOLD)
16966 {
16967 edge e;
16968 edge_iterator ei;
16969 bool simple_loop = false;
16970
16971 FOR_EACH_EDGE (e, ei, bb->preds)
16972 if (e->src == bb)
16973 {
16974 simple_loop = true;
16975 break;
16976 }
16977
16978 if (simple_loop)
16979 distance = distance_non_agu_define_in_bb (regno1, regno2,
16980 insn, distance,
16981 BB_END (bb), &found);
16982 else
16983 {
16984 int shortest_dist = -1;
16985 bool found_in_bb = false;
16986
16987 FOR_EACH_EDGE (e, ei, bb->preds)
16988 {
16989 int bb_dist
16990 = distance_non_agu_define_in_bb (regno1, regno2,
16991 insn, distance,
16992 BB_END (e->src),
16993 &found_in_bb);
16994 if (found_in_bb)
16995 {
16996 if (shortest_dist < 0)
16997 shortest_dist = bb_dist;
16998 else if (bb_dist > 0)
16999 shortest_dist = MIN (bb_dist, shortest_dist);
17000
17001 found = true;
17002 }
17003 }
17004
17005 distance = shortest_dist;
17006 }
17007 }
17008
17009 /* get_attr_type may modify recog data. We want to make sure
17010 that recog data is valid for instruction INSN, on which
17011 distance_non_agu_define is called. INSN is unchanged here. */
17012 extract_insn_cached (insn);
17013
17014 if (!found)
17015 return -1;
17016
17017 return distance >> 1;
17018 }
17019
17020 /* Return the distance in half-cycles between INSN and the next
17021 insn that uses register number REGNO in a memory address, added
17022 to DISTANCE. Return -1 if REGNO is set.
17023
17024 Put true value into *FOUND if register usage was found and
17025 false otherwise.
17026 Put true value into *REDEFINED if register redefinition was
17027 found and false otherwise. */
17028
17029 static int
17030 distance_agu_use_in_bb (unsigned int regno,
17031 rtx insn, int distance, rtx start,
17032 bool *found, bool *redefined)
17033 {
17034 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17035 rtx next = start;
17036 rtx prev = NULL;
17037
17038 *found = false;
17039 *redefined = false;
17040
17041 while (next
17042 && next != insn
17043 && distance < LEA_SEARCH_THRESHOLD)
17044 {
17045 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17046 {
17047 distance = increase_distance (prev, next, distance);
17048 if (insn_uses_reg_mem (regno, next))
17049 {
17050 /* Return DISTANCE if OP0 is used in memory
17051 address in NEXT. */
17052 *found = true;
17053 return distance;
17054 }
17055
17056 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17057 {
17058 /* Return -1 if OP0 is set in NEXT. */
17059 *redefined = true;
17060 return -1;
17061 }
17062
17063 prev = next;
17064 }
17065
17066 if (next == BB_END (bb))
17067 break;
17068
17069 next = NEXT_INSN (next);
17070 }
17071
17072 return distance;
17073 }
17074
17075 /* Return the distance between INSN and the next insn that uses
17076 register number REGNO0 in a memory address. Return -1 if no such
17077 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17078
17079 static int
17080 distance_agu_use (unsigned int regno0, rtx insn)
17081 {
17082 basic_block bb = BLOCK_FOR_INSN (insn);
17083 int distance = 0;
17084 bool found = false;
17085 bool redefined = false;
17086
17087 if (insn != BB_END (bb))
17088 distance = distance_agu_use_in_bb (regno0, insn, distance,
17089 NEXT_INSN (insn),
17090 &found, &redefined);
17091
17092 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17093 {
17094 edge e;
17095 edge_iterator ei;
17096 bool simple_loop = false;
17097
17098 FOR_EACH_EDGE (e, ei, bb->succs)
17099 if (e->dest == bb)
17100 {
17101 simple_loop = true;
17102 break;
17103 }
17104
17105 if (simple_loop)
17106 distance = distance_agu_use_in_bb (regno0, insn,
17107 distance, BB_HEAD (bb),
17108 &found, &redefined);
17109 else
17110 {
17111 int shortest_dist = -1;
17112 bool found_in_bb = false;
17113 bool redefined_in_bb = false;
17114
17115 FOR_EACH_EDGE (e, ei, bb->succs)
17116 {
17117 int bb_dist
17118 = distance_agu_use_in_bb (regno0, insn,
17119 distance, BB_HEAD (e->dest),
17120 &found_in_bb, &redefined_in_bb);
17121 if (found_in_bb)
17122 {
17123 if (shortest_dist < 0)
17124 shortest_dist = bb_dist;
17125 else if (bb_dist > 0)
17126 shortest_dist = MIN (bb_dist, shortest_dist);
17127
17128 found = true;
17129 }
17130 }
17131
17132 distance = shortest_dist;
17133 }
17134 }
17135
17136 if (!found || redefined)
17137 return -1;
17138
17139 return distance >> 1;
17140 }
17141
17142 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
17143 there is a choice between LEA and ADD:
17144 Negative value: ADD is preferred over LEA
17145 Zero: Neutral
17146 Positive value: LEA is preferred over ADD */
17147 #define IX86_LEA_PRIORITY 0
17148
17149 /* Return true if using the lea INSN has a performance advantage
17150 over the equivalent sequence of instructions, which has
17151 SPLIT_COST cycles higher latency than the lea. */
17152
17153 static bool
17154 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17155 unsigned int regno2, int split_cost)
17156 {
17157 int dist_define, dist_use;
17158
17159 dist_define = distance_non_agu_define (regno1, regno2, insn);
17160 dist_use = distance_agu_use (regno0, insn);
17161
17162 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17163 {
17164 /* If there is no non-AGU operand definition, no AGU
17165 operand usage and the split cost is 0, then both the lea
17166 and non-lea variants have the same priority. Currently
17167 we prefer the lea for 64-bit code and the non-lea on 32-bit
17168 code. */
17169 if (dist_use < 0 && split_cost == 0)
17170 return TARGET_64BIT || IX86_LEA_PRIORITY;
17171 else
17172 return true;
17173 }
17174
17175 /* The longer the distance to the definition, the more preferable the lea.
17176 Here we adjust that distance to take into account the splitting cost and
17177 the lea priority. */
17178 dist_define += split_cost + IX86_LEA_PRIORITY;
17179
17180 /* If there is no use in a memory address then we just check
17181 that the split cost exceeds the AGU stall. */
17182 if (dist_use < 0)
17183 return dist_define > LEA_MAX_STALL;
17184
17185 /* If this insn has both backward non-agu dependence and forward
17186 agu dependence, the one with short distance takes effect. */
17187 return dist_define >= dist_use;
17188 }
17189
17190 /* Return true if it is legal to clobber flags by INSN and
17191 false otherwise. */
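/* Scan forward from INSN to the end of its basic block: if FLAGS_REG is
   used before being redefined, it cannot be clobbered; if the block ends
   first, it can be clobbered only when FLAGS_REG is not live on exit.  */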
17192
17193 static bool
17194 ix86_ok_to_clobber_flags (rtx insn)
17195 {
17196 basic_block bb = BLOCK_FOR_INSN (insn);
17197 df_ref *use;
17198 bitmap live;
17199
17200 while (insn)
17201 {
17202 if (NONDEBUG_INSN_P (insn))
17203 {
17204 for (use = DF_INSN_USES (insn); *use; use++)
17205 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17206 return false;
17207
17208 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17209 return true;
17210 }
17211
17212 if (insn == BB_END (bb))
17213 break;
17214
17215 insn = NEXT_INSN (insn);
17216 }
17217
17218 live = df_get_live_out (bb);
17219 return !REGNO_REG_SET_P (live, FLAGS_REG);
17220 }
17221
17222 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17223 move and add to avoid AGU stalls. */
17224
17225 bool
17226 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17227 {
17228 unsigned int regno0, regno1, regno2;
17229
17230 /* Check if we need to optimize. */
17231 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17232 return false;
17233
17234 /* Check it is correct to split here. */
17235 if (!ix86_ok_to_clobber_flags (insn))
17236 return false;
17237
17238 regno0 = true_regnum (operands[0]);
17239 regno1 = true_regnum (operands[1]);
17240 regno2 = true_regnum (operands[2]);
17241
17242 /* We only need to split adds with a non-destructive
17243 destination operand. */
17244 if (regno0 == regno1 || regno0 == regno2)
17245 return false;
17246 else
17247 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17248 }
17249
17250 /* Return true if we should emit lea instruction instead of mov
17251 instruction. */
17252
17253 bool
17254 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17255 {
17256 unsigned int regno0, regno1;
17257
17258 /* Check if we need to optimize. */
17259 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17260 return false;
17261
17262 /* Use lea for reg to reg moves only. */
17263 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17264 return false;
17265
17266 regno0 = true_regnum (operands[0]);
17267 regno1 = true_regnum (operands[1]);
17268
17269 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17270 }
17271
17272 /* Return true if we need to split lea into a sequence of
17273 instructions to avoid AGU stalls. */
17274
17275 bool
17276 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17277 {
17278 unsigned int regno0, regno1, regno2;
17279 int split_cost;
17280 struct ix86_address parts;
17281 int ok;
17282
17283 /* Check if we need to optimize. */
17284 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17285 return false;
17286
17287 /* Check it is correct to split here. */
17288 if (!ix86_ok_to_clobber_flags (insn))
17289 return false;
17290
17291 ok = ix86_decompose_address (operands[1], &parts);
17292 gcc_assert (ok);
17293
17294 /* There should be at least two components in the address. */
17295 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17296 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17297 return false;
17298
17299 /* We should not split into adds if a non-legitimate PIC
17300 operand is used as the displacement. */
17301 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17302 return false;
17303
17304 regno0 = true_regnum (operands[0]);
17305 regno1 = INVALID_REGNUM;
17306 regno2 = INVALID_REGNUM;
17307
17308 if (parts.base)
17309 regno1 = true_regnum (parts.base);
17310 if (parts.index)
17311 regno2 = true_regnum (parts.index);
17312
17313 split_cost = 0;
17314
17315 /* Compute how many cycles we will add to the execution time
17316 if we split the lea into a sequence of instructions. */
17317 if (parts.base || parts.index)
17318 {
17319 /* Have to use a mov instruction if the non-destructive
17320 destination form is used. */
17321 if (regno1 != regno0 && regno2 != regno0)
17322 split_cost += 1;
17323
17324 /* Have to add index to base if both exist. */
17325 if (parts.base && parts.index)
17326 split_cost += 1;
17327
17328 /* Have to use shift and adds if scale is 2 or greater. */
17329 if (parts.scale > 1)
17330 {
17331 if (regno0 != regno1)
17332 split_cost += 1;
17333 else if (regno2 == regno0)
17334 split_cost += 4;
17335 else
17336 split_cost += parts.scale;
17337 }
17338
17339 /* Have to use an add instruction with an immediate if
17340 disp is nonzero. */
17341 if (parts.disp && parts.disp != const0_rtx)
17342 split_cost += 1;
17343
17344 /* Subtract the price of lea. */
17345 split_cost -= 1;
17346 }
17347
17348 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17349 }
17350
17351 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17352 matches destination. RTX includes clobber of FLAGS_REG. */
17353
17354 static void
17355 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17356 rtx dst, rtx src)
17357 {
17358 rtx op, clob;
17359
17360 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17361 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17362
17363 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17364 }
17365
17366 /* Return true if regno1 def is nearest to the insn. */
17367
17368 static bool
17369 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17370 {
17371 rtx prev = insn;
17372 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17373
17374 if (insn == start)
17375 return false;
17376 while (prev && prev != start)
17377 {
17378 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17379 {
17380 prev = PREV_INSN (prev);
17381 continue;
17382 }
17383 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17384 return true;
17385 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17386 return false;
17387 prev = PREV_INSN (prev);
17388 }
17389
17390 /* None of the regs is defined in the bb. */
17391 return false;
17392 }
17393
17394 /* Split lea instructions into a sequence of instructions
17395 which are executed on the ALU to avoid AGU stalls.
17396 It is assumed that the flags register may be clobbered
17397 at the lea position. */
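/* For example, with a scale of 2 and all registers distinct,
   "lea 0x4(%ebx,%ecx,2), %eax" becomes roughly:
   mov %ecx, %eax; shl $1, %eax; add %ebx, %eax; add $0x4, %eax.  */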
17398
17399 void
17400 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17401 {
17402 unsigned int regno0, regno1, regno2;
17403 struct ix86_address parts;
17404 rtx target, tmp;
17405 int ok, adds;
17406
17407 ok = ix86_decompose_address (operands[1], &parts);
17408 gcc_assert (ok);
17409
17410 target = gen_lowpart (mode, operands[0]);
17411
17412 regno0 = true_regnum (target);
17413 regno1 = INVALID_REGNUM;
17414 regno2 = INVALID_REGNUM;
17415
17416 if (parts.base)
17417 {
17418 parts.base = gen_lowpart (mode, parts.base);
17419 regno1 = true_regnum (parts.base);
17420 }
17421
17422 if (parts.index)
17423 {
17424 parts.index = gen_lowpart (mode, parts.index);
17425 regno2 = true_regnum (parts.index);
17426 }
17427
17428 if (parts.disp)
17429 parts.disp = gen_lowpart (mode, parts.disp);
17430
17431 if (parts.scale > 1)
17432 {
17433 /* Case r1 = r1 + ... */
17434 if (regno1 == regno0)
17435 {
17436 /* A case like r1 = r1 + C * r1 would require a
17437 multiplication, which is very expensive. Assume
17438 the cost model is wrong if we encounter such a
17439 case here. */
17440 gcc_assert (regno2 != regno0);
17441
17442 for (adds = parts.scale; adds > 0; adds--)
17443 ix86_emit_binop (PLUS, mode, target, parts.index);
17444 }
17445 else
17446 {
17447 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17448 if (regno0 != regno2)
17449 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17450
17451 /* Use shift for scaling. */
17452 ix86_emit_binop (ASHIFT, mode, target,
17453 GEN_INT (exact_log2 (parts.scale)));
17454
17455 if (parts.base)
17456 ix86_emit_binop (PLUS, mode, target, parts.base);
17457
17458 if (parts.disp && parts.disp != const0_rtx)
17459 ix86_emit_binop (PLUS, mode, target, parts.disp);
17460 }
17461 }
17462 else if (!parts.base && !parts.index)
17463 {
17464 gcc_assert (parts.disp);
17465 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17466 }
17467 else
17468 {
17469 if (!parts.base)
17470 {
17471 if (regno0 != regno2)
17472 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17473 }
17474 else if (!parts.index)
17475 {
17476 if (regno0 != regno1)
17477 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17478 }
17479 else
17480 {
17481 if (regno0 == regno1)
17482 tmp = parts.index;
17483 else if (regno0 == regno2)
17484 tmp = parts.base;
17485 else
17486 {
17487 rtx tmp1;
17488
17489 /* Find better operand for SET instruction, depending
17490 on which definition is farther from the insn. */
17491 if (find_nearest_reg_def (insn, regno1, regno2))
17492 tmp = parts.index, tmp1 = parts.base;
17493 else
17494 tmp = parts.base, tmp1 = parts.index;
17495
17496 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17497
17498 if (parts.disp && parts.disp != const0_rtx)
17499 ix86_emit_binop (PLUS, mode, target, parts.disp);
17500
17501 ix86_emit_binop (PLUS, mode, target, tmp1);
17502 return;
17503 }
17504
17505 ix86_emit_binop (PLUS, mode, target, tmp);
17506 }
17507
17508 if (parts.disp && parts.disp != const0_rtx)
17509 ix86_emit_binop (PLUS, mode, target, parts.disp);
17510 }
17511 }
17512
17513 /* Return true if it is ok to optimize an ADD operation to an LEA
17514 operation in order to avoid clobbering the flags register. For most
17515 processors, ADD is faster than LEA. For processors like Atom, if the
17516 destination register of the LEA holds an actual address which will be
17517 used soon, LEA is better; otherwise ADD is better. */
17518
17519 bool
17520 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17521 {
17522 unsigned int regno0 = true_regnum (operands[0]);
17523 unsigned int regno1 = true_regnum (operands[1]);
17524 unsigned int regno2 = true_regnum (operands[2]);
17525
17526 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17527 if (regno0 != regno1 && regno0 != regno2)
17528 return true;
17529
17530 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17531 return false;
17532
17533 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17534 }
17535
17536 /* Return true if destination reg of SET_BODY is shift count of
17537 USE_BODY. */
17538
17539 static bool
17540 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17541 {
17542 rtx set_dest;
17543 rtx shift_rtx;
17544 int i;
17545
17546 /* Retrieve destination of SET_BODY. */
17547 switch (GET_CODE (set_body))
17548 {
17549 case SET:
17550 set_dest = SET_DEST (set_body);
17551 if (!set_dest || !REG_P (set_dest))
17552 return false;
17553 break;
17554 case PARALLEL:
17555 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17556 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17557 use_body))
17558 return true;
17559 default:
17560 return false;
17561 break;
17562 }
17563
17564 /* Retrieve shift count of USE_BODY. */
17565 switch (GET_CODE (use_body))
17566 {
17567 case SET:
17568 shift_rtx = XEXP (use_body, 1);
17569 break;
17570 case PARALLEL:
17571 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17572 if (ix86_dep_by_shift_count_body (set_body,
17573 XVECEXP (use_body, 0, i)))
17574 return true;
17575 default:
17576 return false;
17577 break;
17578 }
17579
17580 if (shift_rtx
17581 && (GET_CODE (shift_rtx) == ASHIFT
17582 || GET_CODE (shift_rtx) == LSHIFTRT
17583 || GET_CODE (shift_rtx) == ASHIFTRT
17584 || GET_CODE (shift_rtx) == ROTATE
17585 || GET_CODE (shift_rtx) == ROTATERT))
17586 {
17587 rtx shift_count = XEXP (shift_rtx, 1);
17588
17589 /* Return true if shift count is dest of SET_BODY. */
17590 if (REG_P (shift_count))
17591 {
17592 /* Check reload_completed since this can be invoked before
17593 register allocation by the pre-reload scheduler. */
17594 if (reload_completed
17595 && true_regnum (set_dest) == true_regnum (shift_count))
17596 return true;
17597 else if (REGNO (set_dest) == REGNO (shift_count))
17598 return true;
17599 }
17600 }
17601
17602 return false;
17603 }
17604
17605 /* Return true if destination reg of SET_INSN is shift count of
17606 USE_INSN. */
17607
17608 bool
17609 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17610 {
17611 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17612 PATTERN (use_insn));
17613 }
17614
17615 /* Return TRUE or FALSE depending on whether the unary operator meets the
17616 appropriate constraints. */
17617
17618 bool
17619 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17620 enum machine_mode mode ATTRIBUTE_UNUSED,
17621 rtx operands[2] ATTRIBUTE_UNUSED)
17622 {
17623 /* If one of operands is memory, source and destination must match. */
17624 if ((MEM_P (operands[0])
17625 || MEM_P (operands[1]))
17626 && ! rtx_equal_p (operands[0], operands[1]))
17627 return false;
17628 return true;
17629 }
17630
17631 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17632 are ok, keeping in mind the possible movddup alternative. */
17633
17634 bool
17635 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17636 {
17637 if (MEM_P (operands[0]))
17638 return rtx_equal_p (operands[0], operands[1 + high]);
17639 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17640 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17641 return true;
17642 }
17643
17644 /* Post-reload splitter for converting an SF or DFmode value in an
17645 SSE register into an unsigned SImode. */
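/* Values below 2^31 are converted directly; larger values have 2^31
   subtracted first (selected via the LE mask) and the sign bit xored
   back in after the signed conversion.  */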
17646
17647 void
17648 ix86_split_convert_uns_si_sse (rtx operands[])
17649 {
17650 enum machine_mode vecmode;
17651 rtx value, large, zero_or_two31, input, two31, x;
17652
17653 large = operands[1];
17654 zero_or_two31 = operands[2];
17655 input = operands[3];
17656 two31 = operands[4];
17657 vecmode = GET_MODE (large);
17658 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17659
17660 /* Load up the value into the low element. We must ensure that the other
17661 elements are valid floats -- zero is the easiest such value. */
17662 if (MEM_P (input))
17663 {
17664 if (vecmode == V4SFmode)
17665 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17666 else
17667 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17668 }
17669 else
17670 {
17671 input = gen_rtx_REG (vecmode, REGNO (input));
17672 emit_move_insn (value, CONST0_RTX (vecmode));
17673 if (vecmode == V4SFmode)
17674 emit_insn (gen_sse_movss (value, value, input));
17675 else
17676 emit_insn (gen_sse2_movsd (value, value, input));
17677 }
17678
17679 emit_move_insn (large, two31);
17680 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17681
17682 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17683 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17684
17685 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17686 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17687
17688 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17689 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17690
17691 large = gen_rtx_REG (V4SImode, REGNO (large));
17692 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17693
17694 x = gen_rtx_REG (V4SImode, REGNO (value));
17695 if (vecmode == V4SFmode)
17696 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17697 else
17698 emit_insn (gen_sse2_cvttpd2dq (x, value));
17699 value = x;
17700
17701 emit_insn (gen_xorv4si3 (value, value, large));
17702 }
17703
17704 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17705 Expects the 64-bit DImode to be supplied in a pair of integral
17706 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17707 -mfpmath=sse, !optimize_size only. */
17708
17709 void
17710 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17711 {
17712 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17713 rtx int_xmm, fp_xmm;
17714 rtx biases, exponents;
17715 rtx x;
17716
17717 int_xmm = gen_reg_rtx (V4SImode);
17718 if (TARGET_INTER_UNIT_MOVES)
17719 emit_insn (gen_movdi_to_sse (int_xmm, input));
17720 else if (TARGET_SSE_SPLIT_REGS)
17721 {
17722 emit_clobber (int_xmm);
17723 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17724 }
17725 else
17726 {
17727 x = gen_reg_rtx (V2DImode);
17728 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17729 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17730 }
17731
17732 x = gen_rtx_CONST_VECTOR (V4SImode,
17733 gen_rtvec (4, GEN_INT (0x43300000UL),
17734 GEN_INT (0x45300000UL),
17735 const0_rtx, const0_rtx));
17736 exponents = validize_mem (force_const_mem (V4SImode, x));
17737
17738 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17739 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17740
17741 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17742 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17743 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17744 (0x1.0p84 + double(fp_value_hi_xmm)).
17745 Note these exponents differ by 32. */
17746
17747 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17748
17749 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17750 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17751 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17752 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17753 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17754 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17755 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17756 biases = validize_mem (force_const_mem (V2DFmode, biases));
17757 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17758
17759 /* Add the upper and lower DFmode values together. */
17760 if (TARGET_SSE3)
17761 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17762 else
17763 {
17764 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17765 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17766 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17767 }
17768
17769 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17770 }
17771
17772 /* Not used, but eases macroization of patterns. */
17773 void
17774 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17775 rtx input ATTRIBUTE_UNUSED)
17776 {
17777 gcc_unreachable ();
17778 }
17779
17780 /* Convert an unsigned SImode value into a DFmode. Only currently used
17781 for SSE, but applicable anywhere. */
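/* Bias the input by -2^31 so it fits in signed SImode, convert it, then
   add 2^31.0 back in DFmode, which is exact.  */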
17782
17783 void
17784 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17785 {
17786 REAL_VALUE_TYPE TWO31r;
17787 rtx x, fp;
17788
17789 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17790 NULL, 1, OPTAB_DIRECT);
17791
17792 fp = gen_reg_rtx (DFmode);
17793 emit_insn (gen_floatsidf2 (fp, x));
17794
17795 real_ldexp (&TWO31r, &dconst1, 31);
17796 x = const_double_from_real_value (TWO31r, DFmode);
17797
17798 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17799 if (x != target)
17800 emit_move_insn (target, x);
17801 }
17802
17803 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17804 32-bit mode; otherwise we have a direct convert instruction. */
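/* Computed as (double) (signed) high * 2^32 + (double) (unsigned) low.  */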
17805
17806 void
17807 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17808 {
17809 REAL_VALUE_TYPE TWO32r;
17810 rtx fp_lo, fp_hi, x;
17811
17812 fp_lo = gen_reg_rtx (DFmode);
17813 fp_hi = gen_reg_rtx (DFmode);
17814
17815 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17816
17817 real_ldexp (&TWO32r, &dconst1, 32);
17818 x = const_double_from_real_value (TWO32r, DFmode);
17819 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17820
17821 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17822
17823 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17824 0, OPTAB_DIRECT);
17825 if (x != target)
17826 emit_move_insn (target, x);
17827 }
17828
17829 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17830 For x86_32, -mfpmath=sse, !optimize_size only. */
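/* Split the value into 16-bit halves; each half converts to SFmode
   exactly, and the result is recombined as hi * 2^16 + lo.  */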
17831 void
17832 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17833 {
17834 REAL_VALUE_TYPE ONE16r;
17835 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17836
17837 real_ldexp (&ONE16r, &dconst1, 16);
17838 x = const_double_from_real_value (ONE16r, SFmode);
17839 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17840 NULL, 0, OPTAB_DIRECT);
17841 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17842 NULL, 0, OPTAB_DIRECT);
17843 fp_hi = gen_reg_rtx (SFmode);
17844 fp_lo = gen_reg_rtx (SFmode);
17845 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17846 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17847 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17848 0, OPTAB_DIRECT);
17849 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17850 0, OPTAB_DIRECT);
17851 if (!rtx_equal_p (target, fp_hi))
17852 emit_move_insn (target, fp_hi);
17853 }
17854
17855 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17856 a vector of unsigned ints VAL to vector of floats TARGET. */
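/* Uses the same 16-bit split as the scalar SImode->SFmode conversion
   above: convert the low and high halves separately and recombine as
   hi * 2^16 + lo.  */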
17857
17858 void
17859 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17860 {
17861 rtx tmp[8];
17862 REAL_VALUE_TYPE TWO16r;
17863 enum machine_mode intmode = GET_MODE (val);
17864 enum machine_mode fltmode = GET_MODE (target);
17865 rtx (*cvt) (rtx, rtx);
17866
17867 if (intmode == V4SImode)
17868 cvt = gen_floatv4siv4sf2;
17869 else
17870 cvt = gen_floatv8siv8sf2;
17871 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17872 tmp[0] = force_reg (intmode, tmp[0]);
17873 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17874 OPTAB_DIRECT);
17875 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17876 NULL_RTX, 1, OPTAB_DIRECT);
17877 tmp[3] = gen_reg_rtx (fltmode);
17878 emit_insn (cvt (tmp[3], tmp[1]));
17879 tmp[4] = gen_reg_rtx (fltmode);
17880 emit_insn (cvt (tmp[4], tmp[2]));
17881 real_ldexp (&TWO16r, &dconst1, 16);
17882 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17883 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17884 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17885 OPTAB_DIRECT);
17886 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17887 OPTAB_DIRECT);
17888 if (tmp[7] != target)
17889 emit_move_insn (target, tmp[7]);
17890 }
17891
17892 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17893 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17894 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17895 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
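/* For VAL >= 0x1p31 the signed truncation of (VAL - 0x1p31) differs from
the unsigned truncation of VAL only in bit 31, which is why the
correction can be applied afterwards with a single XOR. */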
17896
17897 rtx
17898 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17899 {
17900 REAL_VALUE_TYPE TWO31r;
17901 rtx two31r, tmp[4];
17902 enum machine_mode mode = GET_MODE (val);
17903 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17904 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17905 rtx (*cmp) (rtx, rtx, rtx, rtx);
17906 int i;
17907
17908 for (i = 0; i < 3; i++)
17909 tmp[i] = gen_reg_rtx (mode);
17910 real_ldexp (&TWO31r, &dconst1, 31);
17911 two31r = const_double_from_real_value (TWO31r, scalarmode);
17912 two31r = ix86_build_const_vector (mode, 1, two31r);
17913 two31r = force_reg (mode, two31r);
17914 switch (mode)
17915 {
17916 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17917 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17918 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17919 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17920 default: gcc_unreachable ();
17921 }
17922 tmp[3] = gen_rtx_LE (mode, two31r, val);
17923 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17924 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17925 0, OPTAB_DIRECT);
17926 if (intmode == V4SImode || TARGET_AVX2)
17927 *xorp = expand_simple_binop (intmode, ASHIFT,
17928 gen_lowpart (intmode, tmp[0]),
17929 GEN_INT (31), NULL_RTX, 0,
17930 OPTAB_DIRECT);
17931 else
17932 {
17933 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17934 two31 = ix86_build_const_vector (intmode, 1, two31);
17935 *xorp = expand_simple_binop (intmode, AND,
17936 gen_lowpart (intmode, tmp[0]),
17937 two31, NULL_RTX, 0,
17938 OPTAB_DIRECT);
17939 }
17940 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17941 0, OPTAB_DIRECT);
17942 }
17943
17944 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17945 then replicate the value for all elements of the vector
17946 register. */
17947
17948 rtx
17949 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17950 {
17951 int i, n_elt;
17952 rtvec v;
17953 enum machine_mode scalar_mode;
17954
17955 switch (mode)
17956 {
17957 case V32QImode:
17958 case V16QImode:
17959 case V16HImode:
17960 case V8HImode:
17961 case V8SImode:
17962 case V4SImode:
17963 case V4DImode:
17964 case V2DImode:
17965 gcc_assert (vect);
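/* FALLTHRU */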
17966 case V8SFmode:
17967 case V4SFmode:
17968 case V4DFmode:
17969 case V2DFmode:
17970 n_elt = GET_MODE_NUNITS (mode);
17971 v = rtvec_alloc (n_elt);
17972 scalar_mode = GET_MODE_INNER (mode);
17973
17974 RTVEC_ELT (v, 0) = value;
17975
17976 for (i = 1; i < n_elt; ++i)
17977 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17978
17979 return gen_rtx_CONST_VECTOR (mode, v);
17980
17981 default:
17982 gcc_unreachable ();
17983 }
17984 }
17985
17986 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17987 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17988 for an SSE register. If VECT is true, then replicate the mask for
17989 all elements of the vector register. If INVERT is true, then create
17990 a mask excluding the sign bit. */
17991
17992 rtx
17993 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17994 {
17995 enum machine_mode vec_mode, imode;
17996 HOST_WIDE_INT hi, lo;
17997 int shift = 63;
17998 rtx v;
17999 rtx mask;
18000
18001 /* Find the sign bit, sign extended to 2*HWI. */
18002 switch (mode)
18003 {
18004 case V8SImode:
18005 case V4SImode:
18006 case V8SFmode:
18007 case V4SFmode:
18008 vec_mode = mode;
18009 mode = GET_MODE_INNER (mode);
18010 imode = SImode;
18011 lo = 0x80000000, hi = lo < 0;
18012 break;
18013
18014 case V4DImode:
18015 case V2DImode:
18016 case V4DFmode:
18017 case V2DFmode:
18018 vec_mode = mode;
18019 mode = GET_MODE_INNER (mode);
18020 imode = DImode;
18021 if (HOST_BITS_PER_WIDE_INT >= 64)
18022 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18023 else
18024 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18025 break;
18026
18027 case TImode:
18028 case TFmode:
18029 vec_mode = VOIDmode;
18030 if (HOST_BITS_PER_WIDE_INT >= 64)
18031 {
18032 imode = TImode;
18033 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18034 }
18035 else
18036 {
18037 rtvec vec;
18038
18039 imode = DImode;
18040 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18041
18042 if (invert)
18043 {
18044 lo = ~lo, hi = ~hi;
18045 v = constm1_rtx;
18046 }
18047 else
18048 v = const0_rtx;
18049
18050 mask = immed_double_const (lo, hi, imode);
18051
18052 vec = gen_rtvec (2, v, mask);
18053 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18054 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18055
18056 return v;
18057 }
18058 break;
18059
18060 default:
18061 gcc_unreachable ();
18062 }
18063
18064 if (invert)
18065 lo = ~lo, hi = ~hi;
18066
18067 /* Force this value into the low part of a fp vector constant. */
18068 mask = immed_double_const (lo, hi, imode);
18069 mask = gen_lowpart (mode, mask);
18070
18071 if (vec_mode == VOIDmode)
18072 return force_reg (mode, mask);
18073
18074 v = ix86_build_const_vector (vec_mode, vect, mask);
18075 return force_reg (vec_mode, v);
18076 }
18077
18078 /* Generate code for floating point ABS or NEG. */
18079
18080 void
18081 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18082 rtx operands[])
18083 {
18084 rtx mask, set, dst, src;
18085 bool use_sse = false;
18086 bool vector_mode = VECTOR_MODE_P (mode);
18087 enum machine_mode vmode = mode;
18088
18089 if (vector_mode)
18090 use_sse = true;
18091 else if (mode == TFmode)
18092 use_sse = true;
18093 else if (TARGET_SSE_MATH)
18094 {
18095 use_sse = SSE_FLOAT_MODE_P (mode);
18096 if (mode == SFmode)
18097 vmode = V4SFmode;
18098 else if (mode == DFmode)
18099 vmode = V2DFmode;
18100 }
18101
18102 /* NEG and ABS performed with SSE use bitwise mask operations.
18103 Create the appropriate mask now. */
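/* The mask is the bare sign bit for NEG and its complement for ABS,
hence the INVERT argument below is (code == ABS); the actual xor/and
is emitted when the resulting insn is split. */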
18104 if (use_sse)
18105 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18106 else
18107 mask = NULL_RTX;
18108
18109 dst = operands[0];
18110 src = operands[1];
18111
18112 set = gen_rtx_fmt_e (code, mode, src);
18113 set = gen_rtx_SET (VOIDmode, dst, set);
18114
18115 if (mask)
18116 {
18117 rtx use, clob;
18118 rtvec par;
18119
18120 use = gen_rtx_USE (VOIDmode, mask);
18121 if (vector_mode)
18122 par = gen_rtvec (2, set, use);
18123 else
18124 {
18125 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18126 par = gen_rtvec (3, set, use, clob);
18127 }
18128 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18129 }
18130 else
18131 emit_insn (set);
18132 }
18133
18134 /* Expand a copysign operation. Special case operand 0 being a constant. */
18135
18136 void
18137 ix86_expand_copysign (rtx operands[])
18138 {
18139 enum machine_mode mode, vmode;
18140 rtx dest, op0, op1, mask, nmask;
18141
18142 dest = operands[0];
18143 op0 = operands[1];
18144 op1 = operands[2];
18145
18146 mode = GET_MODE (dest);
18147
18148 if (mode == SFmode)
18149 vmode = V4SFmode;
18150 else if (mode == DFmode)
18151 vmode = V2DFmode;
18152 else
18153 vmode = mode;
18154
18155 if (GET_CODE (op0) == CONST_DOUBLE)
18156 {
18157 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18158
18159 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18160 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18161
18162 if (mode == SFmode || mode == DFmode)
18163 {
18164 if (op0 == CONST0_RTX (mode))
18165 op0 = CONST0_RTX (vmode);
18166 else
18167 {
18168 rtx v = ix86_build_const_vector (vmode, false, op0);
18169
18170 op0 = force_reg (vmode, v);
18171 }
18172 }
18173 else if (op0 != CONST0_RTX (mode))
18174 op0 = force_reg (mode, op0);
18175
18176 mask = ix86_build_signbit_mask (vmode, 0, 0);
18177
18178 if (mode == SFmode)
18179 copysign_insn = gen_copysignsf3_const;
18180 else if (mode == DFmode)
18181 copysign_insn = gen_copysigndf3_const;
18182 else
18183 copysign_insn = gen_copysigntf3_const;
18184
18185 emit_insn (copysign_insn (dest, op0, op1, mask));
18186 }
18187 else
18188 {
18189 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18190
18191 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18192 mask = ix86_build_signbit_mask (vmode, 0, 0);
18193
18194 if (mode == SFmode)
18195 copysign_insn = gen_copysignsf3_var;
18196 else if (mode == DFmode)
18197 copysign_insn = gen_copysigndf3_var;
18198 else
18199 copysign_insn = gen_copysigntf3_var;
18200
18201 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18202 }
18203 }
18204
18205 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18206 be a constant, and so has already been expanded into a vector constant. */
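/* DEST is expected to already hold the sign-providing operand here
(the copysign*_const pattern ties them together), so the result below is
(DEST & sign-bit mask) | |constant|. */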
18207
18208 void
18209 ix86_split_copysign_const (rtx operands[])
18210 {
18211 enum machine_mode mode, vmode;
18212 rtx dest, op0, mask, x;
18213
18214 dest = operands[0];
18215 op0 = operands[1];
18216 mask = operands[3];
18217
18218 mode = GET_MODE (dest);
18219 vmode = GET_MODE (mask);
18220
18221 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18222 x = gen_rtx_AND (vmode, dest, mask);
18223 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18224
18225 if (op0 != CONST0_RTX (vmode))
18226 {
18227 x = gen_rtx_IOR (vmode, dest, op0);
18228 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18229 }
18230 }
18231
18232 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18233 so we have to do two masks. */
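/* The result is (op1 & mask) | (op0 & nmask), i.e. the sign bit of OP1
combined with the magnitude bits of OP0; the alternatives below differ
only in which input shares a register with DEST or SCRATCH. */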
18234
18235 void
18236 ix86_split_copysign_var (rtx operands[])
18237 {
18238 enum machine_mode mode, vmode;
18239 rtx dest, scratch, op0, op1, mask, nmask, x;
18240
18241 dest = operands[0];
18242 scratch = operands[1];
18243 op0 = operands[2];
18244 op1 = operands[3];
18245 nmask = operands[4];
18246 mask = operands[5];
18247
18248 mode = GET_MODE (dest);
18249 vmode = GET_MODE (mask);
18250
18251 if (rtx_equal_p (op0, op1))
18252 {
18253 /* Shouldn't happen often (it's useless, obviously), but when it does
18254 we'd generate incorrect code if we continue below. */
18255 emit_move_insn (dest, op0);
18256 return;
18257 }
18258
18259 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18260 {
18261 gcc_assert (REGNO (op1) == REGNO (scratch));
18262
18263 x = gen_rtx_AND (vmode, scratch, mask);
18264 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18265
18266 dest = mask;
18267 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18268 x = gen_rtx_NOT (vmode, dest);
18269 x = gen_rtx_AND (vmode, x, op0);
18270 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18271 }
18272 else
18273 {
18274 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18275 {
18276 x = gen_rtx_AND (vmode, scratch, mask);
18277 }
18278 else /* alternative 2,4 */
18279 {
18280 gcc_assert (REGNO (mask) == REGNO (scratch));
18281 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18282 x = gen_rtx_AND (vmode, scratch, op1);
18283 }
18284 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18285
18286 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18287 {
18288 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18289 x = gen_rtx_AND (vmode, dest, nmask);
18290 }
18291 else /* alternative 3,4 */
18292 {
18293 gcc_assert (REGNO (nmask) == REGNO (dest));
18294 dest = nmask;
18295 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18296 x = gen_rtx_AND (vmode, dest, op0);
18297 }
18298 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18299 }
18300
18301 x = gen_rtx_IOR (vmode, dest, scratch);
18302 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18303 }
18304
18305 /* Return TRUE or FALSE depending on whether the first SET in INSN
18306 has source and destination with matching CC modes and whether the
18307 CC mode is at least as constrained as REQ_MODE. */
18308
18309 bool
18310 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18311 {
18312 rtx set;
18313 enum machine_mode set_mode;
18314
18315 set = PATTERN (insn);
18316 if (GET_CODE (set) == PARALLEL)
18317 set = XVECEXP (set, 0, 0);
18318 gcc_assert (GET_CODE (set) == SET);
18319 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18320
18321 set_mode = GET_MODE (SET_DEST (set));
18322 switch (set_mode)
18323 {
18324 case CCNOmode:
18325 if (req_mode != CCNOmode
18326 && (req_mode != CCmode
18327 || XEXP (SET_SRC (set), 1) != const0_rtx))
18328 return false;
18329 break;
18330 case CCmode:
18331 if (req_mode == CCGCmode)
18332 return false;
18333 /* FALLTHRU */
18334 case CCGCmode:
18335 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18336 return false;
18337 /* FALLTHRU */
18338 case CCGOCmode:
18339 if (req_mode == CCZmode)
18340 return false;
18341 /* FALLTHRU */
18342 case CCZmode:
18343 break;
18344
18345 case CCAmode:
18346 case CCCmode:
18347 case CCOmode:
18348 case CCSmode:
18349 if (set_mode != req_mode)
18350 return false;
18351 break;
18352
18353 default:
18354 gcc_unreachable ();
18355 }
18356
18357 return GET_MODE (SET_SRC (set)) == set_mode;
18358 }
18359
18360 /* Generate insn patterns to do an integer compare of OPERANDS. */
18361
18362 static rtx
18363 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18364 {
18365 enum machine_mode cmpmode;
18366 rtx tmp, flags;
18367
18368 cmpmode = SELECT_CC_MODE (code, op0, op1);
18369 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18370
18371 /* This is very simple, but making the interface the same as in the
18372 FP case makes the rest of the code easier. */
18373 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18374 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18375
18376 /* Return the test that should be put into the flags user, i.e.
18377 the bcc, scc, or cmov instruction. */
18378 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18379 }
18380
18381 /* Figure out whether to use ordered or unordered fp comparisons.
18382 Return the appropriate mode to use. */
18383
18384 enum machine_mode
18385 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18386 {
18387 /* ??? In order to make all comparisons reversible, we do all comparisons
18388 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18389 between trapping and nontrapping forms of comparisons, we can make inequality
18390 comparisons trapping again, since it results in better code when using
18391 FCOM based compares. */
18392 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18393 }
18394
18395 enum machine_mode
18396 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18397 {
18398 enum machine_mode mode = GET_MODE (op0);
18399
18400 if (SCALAR_FLOAT_MODE_P (mode))
18401 {
18402 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18403 return ix86_fp_compare_mode (code);
18404 }
18405
18406 switch (code)
18407 {
18408 /* Only zero flag is needed. */
18409 case EQ: /* ZF=0 */
18410 case NE: /* ZF!=0 */
18411 return CCZmode;
18412 /* Codes needing carry flag. */
18413 case GEU: /* CF=0 */
18414 case LTU: /* CF=1 */
18415 /* Detect overflow checks. They need just the carry flag. */
18416 if (GET_CODE (op0) == PLUS
18417 && rtx_equal_p (op1, XEXP (op0, 0)))
18418 return CCCmode;
18419 else
18420 return CCmode;
18421 case GTU: /* CF=0 & ZF=0 */
18422 case LEU: /* CF=1 | ZF=1 */
18423 /* Detect overflow checks. They need just the carry flag. */
18424 if (GET_CODE (op0) == MINUS
18425 && rtx_equal_p (op1, XEXP (op0, 0)))
18426 return CCCmode;
18427 else
18428 return CCmode;
18429 /* Codes possibly doable only with sign flag when
18430 comparing against zero. */
18431 case GE: /* SF=OF or SF=0 */
18432 case LT: /* SF<>OF or SF=1 */
18433 if (op1 == const0_rtx)
18434 return CCGOCmode;
18435 else
18436 /* For other cases Carry flag is not required. */
18437 return CCGCmode;
18438 /* Codes doable only with the sign flag when comparing
18439 against zero, but for which we lack a jump instruction,
18440 so we need to use relational tests against overflow,
18441 which thus needs to be zero. */
18442 case GT: /* ZF=0 & SF=OF */
18443 case LE: /* ZF=1 | SF<>OF */
18444 if (op1 == const0_rtx)
18445 return CCNOmode;
18446 else
18447 return CCGCmode;
18448 /* The strcmp pattern does (use flags), and combine may ask us for the
18449 proper mode. */
18450 case USE:
18451 return CCmode;
18452 default:
18453 gcc_unreachable ();
18454 }
18455 }
18456
18457 /* Return the fixed registers used for condition codes. */
18458
18459 static bool
18460 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18461 {
18462 *p1 = FLAGS_REG;
18463 *p2 = FPSR_REG;
18464 return true;
18465 }
18466
18467 /* If two condition code modes are compatible, return a condition code
18468 mode which is compatible with both. Otherwise, return
18469 VOIDmode. */
18470
18471 static enum machine_mode
18472 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18473 {
18474 if (m1 == m2)
18475 return m1;
18476
18477 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18478 return VOIDmode;
18479
18480 if ((m1 == CCGCmode && m2 == CCGOCmode)
18481 || (m1 == CCGOCmode && m2 == CCGCmode))
18482 return CCGCmode;
18483
18484 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18485 return m2;
18486 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18487 return m1;
18488
18489 switch (m1)
18490 {
18491 default:
18492 gcc_unreachable ();
18493
18494 case CCmode:
18495 case CCGCmode:
18496 case CCGOCmode:
18497 case CCNOmode:
18498 case CCAmode:
18499 case CCCmode:
18500 case CCOmode:
18501 case CCSmode:
18502 case CCZmode:
18503 switch (m2)
18504 {
18505 default:
18506 return VOIDmode;
18507
18508 case CCmode:
18509 case CCGCmode:
18510 case CCGOCmode:
18511 case CCNOmode:
18512 case CCAmode:
18513 case CCCmode:
18514 case CCOmode:
18515 case CCSmode:
18516 case CCZmode:
18517 return CCmode;
18518 }
18519
18520 case CCFPmode:
18521 case CCFPUmode:
18522 /* These are only compatible with themselves, which we already
18523 checked above. */
18524 return VOIDmode;
18525 }
18526 }
18527
18528
18529 /* Return a comparison we can do that is equivalent to
18530 swap_condition (code), except possibly for orderedness.
18531 Never change orderedness if TARGET_IEEE_FP; return
18532 UNKNOWN in that case if necessary. */
18533
18534 static enum rtx_code
18535 ix86_fp_swap_condition (enum rtx_code code)
18536 {
18537 switch (code)
18538 {
18539 case GT: /* GTU - CF=0 & ZF=0 */
18540 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18541 case GE: /* GEU - CF=0 */
18542 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18543 case UNLT: /* LTU - CF=1 */
18544 return TARGET_IEEE_FP ? UNKNOWN : GT;
18545 case UNLE: /* LEU - CF=1 | ZF=1 */
18546 return TARGET_IEEE_FP ? UNKNOWN : GE;
18547 default:
18548 return swap_condition (code);
18549 }
18550 }
18551
18552 /* Return the cost of comparison CODE using the best strategy for performance.
18553 All the following functions use the number of instructions as the cost metric.
18554 In the future this should be tweaked to compute bytes for optimize_size and
18555 to take into account the performance of various instructions on various CPUs. */
18556
18557 static int
18558 ix86_fp_comparison_cost (enum rtx_code code)
18559 {
18560 int arith_cost;
18561
18562 /* The cost of code using bit-twiddling on %ah. */
18563 switch (code)
18564 {
18565 case UNLE:
18566 case UNLT:
18567 case LTGT:
18568 case GT:
18569 case GE:
18570 case UNORDERED:
18571 case ORDERED:
18572 case UNEQ:
18573 arith_cost = 4;
18574 break;
18575 case LT:
18576 case NE:
18577 case EQ:
18578 case UNGE:
18579 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18580 break;
18581 case LE:
18582 case UNGT:
18583 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18584 break;
18585 default:
18586 gcc_unreachable ();
18587 }
18588
18589 switch (ix86_fp_comparison_strategy (code))
18590 {
18591 case IX86_FPCMP_COMI:
18592 return arith_cost > 4 ? 3 : 2;
18593 case IX86_FPCMP_SAHF:
18594 return arith_cost > 4 ? 4 : 3;
18595 default:
18596 return arith_cost;
18597 }
18598 }
18599
18600 /* Return the strategy to use for a floating-point comparison. We assume that
18601 fcomi is always preferable where available, since that is also true when
18602 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18603
18604 enum ix86_fpcmp_strategy
18605 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18606 {
18607 /* Do fcomi/sahf based test when profitable. */
18608
18609 if (TARGET_CMOVE)
18610 return IX86_FPCMP_COMI;
18611
18612 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18613 return IX86_FPCMP_SAHF;
18614
18615 return IX86_FPCMP_ARITH;
18616 }
18617
18618 /* Swap, force into registers, or otherwise massage the two operands
18619 to a fp comparison. The operands are updated in place; the new
18620 comparison code is returned. */
18621
18622 static enum rtx_code
18623 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18624 {
18625 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18626 rtx op0 = *pop0, op1 = *pop1;
18627 enum machine_mode op_mode = GET_MODE (op0);
18628 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18629
18630 /* All of the unordered compare instructions only work on registers.
18631 The same is true of the fcomi compare instructions. The XFmode
18632 compare instructions require registers except when comparing
18633 against zero or when converting operand 1 from fixed point to
18634 floating point. */
18635
18636 if (!is_sse
18637 && (fpcmp_mode == CCFPUmode
18638 || (op_mode == XFmode
18639 && ! (standard_80387_constant_p (op0) == 1
18640 || standard_80387_constant_p (op1) == 1)
18641 && GET_CODE (op1) != FLOAT)
18642 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18643 {
18644 op0 = force_reg (op_mode, op0);
18645 op1 = force_reg (op_mode, op1);
18646 }
18647 else
18648 {
18649 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18650 things around if they appear profitable, otherwise force op0
18651 into a register. */
18652
18653 if (standard_80387_constant_p (op0) == 0
18654 || (MEM_P (op0)
18655 && ! (standard_80387_constant_p (op1) == 0
18656 || MEM_P (op1))))
18657 {
18658 enum rtx_code new_code = ix86_fp_swap_condition (code);
18659 if (new_code != UNKNOWN)
18660 {
18661 rtx tmp;
18662 tmp = op0, op0 = op1, op1 = tmp;
18663 code = new_code;
18664 }
18665 }
18666
18667 if (!REG_P (op0))
18668 op0 = force_reg (op_mode, op0);
18669
18670 if (CONSTANT_P (op1))
18671 {
18672 int tmp = standard_80387_constant_p (op1);
18673 if (tmp == 0)
18674 op1 = validize_mem (force_const_mem (op_mode, op1));
18675 else if (tmp == 1)
18676 {
18677 if (TARGET_CMOVE)
18678 op1 = force_reg (op_mode, op1);
18679 }
18680 else
18681 op1 = force_reg (op_mode, op1);
18682 }
18683 }
18684
18685 /* Try to rearrange the comparison to make it cheaper. */
18686 if (ix86_fp_comparison_cost (code)
18687 > ix86_fp_comparison_cost (swap_condition (code))
18688 && (REG_P (op1) || can_create_pseudo_p ()))
18689 {
18690 rtx tmp;
18691 tmp = op0, op0 = op1, op1 = tmp;
18692 code = swap_condition (code);
18693 if (!REG_P (op0))
18694 op0 = force_reg (op_mode, op0);
18695 }
18696
18697 *pop0 = op0;
18698 *pop1 = op1;
18699 return code;
18700 }
18701
18702 /* Convert comparison codes we use to represent FP comparison to integer
18703 code that will result in proper branch. Return UNKNOWN if no such code
18704 is available. */
18705
18706 enum rtx_code
18707 ix86_fp_compare_code_to_integer (enum rtx_code code)
18708 {
18709 switch (code)
18710 {
18711 case GT:
18712 return GTU;
18713 case GE:
18714 return GEU;
18715 case ORDERED:
18716 case UNORDERED:
18717 return code;
18718 break;
18719 case UNEQ:
18720 return EQ;
18721 break;
18722 case UNLT:
18723 return LTU;
18724 break;
18725 case UNLE:
18726 return LEU;
18727 break;
18728 case LTGT:
18729 return NE;
18730 break;
18731 default:
18732 return UNKNOWN;
18733 }
18734 }
18735
18736 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18737
18738 static rtx
18739 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18740 {
18741 enum machine_mode fpcmp_mode, intcmp_mode;
18742 rtx tmp, tmp2;
18743
18744 fpcmp_mode = ix86_fp_compare_mode (code);
18745 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18746
18747 /* Do fcomi/sahf based test when profitable. */
18748 switch (ix86_fp_comparison_strategy (code))
18749 {
18750 case IX86_FPCMP_COMI:
18751 intcmp_mode = fpcmp_mode;
18752 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18753 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18754 tmp);
18755 emit_insn (tmp);
18756 break;
18757
18758 case IX86_FPCMP_SAHF:
18759 intcmp_mode = fpcmp_mode;
18760 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18761 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18762 tmp);
18763
18764 if (!scratch)
18765 scratch = gen_reg_rtx (HImode);
18766 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18767 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18768 break;
18769
18770 case IX86_FPCMP_ARITH:
18771 /* Reg-stack pops kill the fpsr, so we have to get fnstsw first. */
18772 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18773 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18774 if (!scratch)
18775 scratch = gen_reg_rtx (HImode);
18776 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18777
18778 /* In the unordered case, we have to check C2 for NaNs, which
18779 doesn't happen to work out to anything nice combination-wise.
18780 So do some bit twiddling on the value we've got in AH to come
18781 up with an appropriate set of condition codes. */
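/* After fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3
in bit 6 (0x40); the 0x45/0x44/0x40/0x05/0x04/0x01 masks below select
combinations of these bits. */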
18782
18783 intcmp_mode = CCNOmode;
18784 switch (code)
18785 {
18786 case GT:
18787 case UNGT:
18788 if (code == GT || !TARGET_IEEE_FP)
18789 {
18790 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18791 code = EQ;
18792 }
18793 else
18794 {
18795 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18796 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18797 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18798 intcmp_mode = CCmode;
18799 code = GEU;
18800 }
18801 break;
18802 case LT:
18803 case UNLT:
18804 if (code == LT && TARGET_IEEE_FP)
18805 {
18806 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18807 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18808 intcmp_mode = CCmode;
18809 code = EQ;
18810 }
18811 else
18812 {
18813 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18814 code = NE;
18815 }
18816 break;
18817 case GE:
18818 case UNGE:
18819 if (code == GE || !TARGET_IEEE_FP)
18820 {
18821 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18822 code = EQ;
18823 }
18824 else
18825 {
18826 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18827 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18828 code = NE;
18829 }
18830 break;
18831 case LE:
18832 case UNLE:
18833 if (code == LE && TARGET_IEEE_FP)
18834 {
18835 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18836 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18837 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18838 intcmp_mode = CCmode;
18839 code = LTU;
18840 }
18841 else
18842 {
18843 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18844 code = NE;
18845 }
18846 break;
18847 case EQ:
18848 case UNEQ:
18849 if (code == EQ && TARGET_IEEE_FP)
18850 {
18851 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18852 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18853 intcmp_mode = CCmode;
18854 code = EQ;
18855 }
18856 else
18857 {
18858 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18859 code = NE;
18860 }
18861 break;
18862 case NE:
18863 case LTGT:
18864 if (code == NE && TARGET_IEEE_FP)
18865 {
18866 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18867 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18868 GEN_INT (0x40)));
18869 code = NE;
18870 }
18871 else
18872 {
18873 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18874 code = EQ;
18875 }
18876 break;
18877
18878 case UNORDERED:
18879 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18880 code = NE;
18881 break;
18882 case ORDERED:
18883 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18884 code = EQ;
18885 break;
18886
18887 default:
18888 gcc_unreachable ();
18889 }
18890 break;
18891
18892 default:
18893 gcc_unreachable ();
18894 }
18895
18896 /* Return the test that should be put into the flags user, i.e.
18897 the bcc, scc, or cmov instruction. */
18898 return gen_rtx_fmt_ee (code, VOIDmode,
18899 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18900 const0_rtx);
18901 }
18902
18903 static rtx
18904 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18905 {
18906 rtx ret;
18907
18908 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18909 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18910
18911 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18912 {
18913 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18914 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18915 }
18916 else
18917 ret = ix86_expand_int_compare (code, op0, op1);
18918
18919 return ret;
18920 }
18921
18922 void
18923 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18924 {
18925 enum machine_mode mode = GET_MODE (op0);
18926 rtx tmp;
18927
18928 switch (mode)
18929 {
18930 case SFmode:
18931 case DFmode:
18932 case XFmode:
18933 case QImode:
18934 case HImode:
18935 case SImode:
18936 simple:
18937 tmp = ix86_expand_compare (code, op0, op1);
18938 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18939 gen_rtx_LABEL_REF (VOIDmode, label),
18940 pc_rtx);
18941 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18942 return;
18943
18944 case DImode:
18945 if (TARGET_64BIT)
18946 goto simple;
18947 case TImode:
18948 /* Expand DImode branch into multiple compare+branch. */
18949 {
18950 rtx lo[2], hi[2], label2;
18951 enum rtx_code code1, code2, code3;
18952 enum machine_mode submode;
18953
18954 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18955 {
18956 tmp = op0, op0 = op1, op1 = tmp;
18957 code = swap_condition (code);
18958 }
18959
18960 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18961 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18962
18963 submode = mode == DImode ? SImode : DImode;
18964
18965 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18966 avoid two branches. This costs one extra insn, so disable when
18967 optimizing for size. */
18968
18969 if ((code == EQ || code == NE)
18970 && (!optimize_insn_for_size_p ()
18971 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18972 {
18973 rtx xor0, xor1;
18974
18975 xor1 = hi[0];
18976 if (hi[1] != const0_rtx)
18977 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18978 NULL_RTX, 0, OPTAB_WIDEN);
18979
18980 xor0 = lo[0];
18981 if (lo[1] != const0_rtx)
18982 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18983 NULL_RTX, 0, OPTAB_WIDEN);
18984
18985 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18986 NULL_RTX, 0, OPTAB_WIDEN);
18987
18988 ix86_expand_branch (code, tmp, const0_rtx, label);
18989 return;
18990 }
18991
18992 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18993 op1 is a constant and its low word is zero, then we can just
18994 examine the high word. Similarly for a low word of -1 and
18995 less-or-equal-than or greater-than. */
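/* For example, with DImode split into SImode words, x <u 0x500000000
reduces to hi(x) <u 5. */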
18996
18997 if (CONST_INT_P (hi[1]))
18998 switch (code)
18999 {
19000 case LT: case LTU: case GE: case GEU:
19001 if (lo[1] == const0_rtx)
19002 {
19003 ix86_expand_branch (code, hi[0], hi[1], label);
19004 return;
19005 }
19006 break;
19007 case LE: case LEU: case GT: case GTU:
19008 if (lo[1] == constm1_rtx)
19009 {
19010 ix86_expand_branch (code, hi[0], hi[1], label);
19011 return;
19012 }
19013 break;
19014 default:
19015 break;
19016 }
19017
19018 /* Otherwise, we need two or three jumps. */
19019
19020 label2 = gen_label_rtx ();
19021
19022 code1 = code;
19023 code2 = swap_condition (code);
19024 code3 = unsigned_condition (code);
19025
19026 switch (code)
19027 {
19028 case LT: case GT: case LTU: case GTU:
19029 break;
19030
19031 case LE: code1 = LT; code2 = GT; break;
19032 case GE: code1 = GT; code2 = LT; break;
19033 case LEU: code1 = LTU; code2 = GTU; break;
19034 case GEU: code1 = GTU; code2 = LTU; break;
19035
19036 case EQ: code1 = UNKNOWN; code2 = NE; break;
19037 case NE: code2 = UNKNOWN; break;
19038
19039 default:
19040 gcc_unreachable ();
19041 }
19042
19043 /*
19044 * a < b =>
19045 * if (hi(a) < hi(b)) goto true;
19046 * if (hi(a) > hi(b)) goto false;
19047 * if (lo(a) < lo(b)) goto true;
19048 * false:
19049 */
19050
19051 if (code1 != UNKNOWN)
19052 ix86_expand_branch (code1, hi[0], hi[1], label);
19053 if (code2 != UNKNOWN)
19054 ix86_expand_branch (code2, hi[0], hi[1], label2);
19055
19056 ix86_expand_branch (code3, lo[0], lo[1], label);
19057
19058 if (code2 != UNKNOWN)
19059 emit_label (label2);
19060 return;
19061 }
19062
19063 default:
19064 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19065 goto simple;
19066 }
19067 }
19068
19069 /* Split branch based on floating point condition. */
19070 void
19071 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19072 rtx target1, rtx target2, rtx tmp, rtx pushed)
19073 {
19074 rtx condition;
19075 rtx i;
19076
19077 if (target2 != pc_rtx)
19078 {
19079 rtx tmp = target2;
19080 code = reverse_condition_maybe_unordered (code);
19081 target2 = target1;
19082 target1 = tmp;
19083 }
19084
19085 condition = ix86_expand_fp_compare (code, op1, op2,
19086 tmp);
19087
19088 /* Remove pushed operand from stack. */
19089 if (pushed)
19090 ix86_free_from_memory (GET_MODE (pushed));
19091
19092 i = emit_jump_insn (gen_rtx_SET
19093 (VOIDmode, pc_rtx,
19094 gen_rtx_IF_THEN_ELSE (VOIDmode,
19095 condition, target1, target2)));
19096 if (split_branch_probability >= 0)
19097 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19098 }
19099
19100 void
19101 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19102 {
19103 rtx ret;
19104
19105 gcc_assert (GET_MODE (dest) == QImode);
19106
19107 ret = ix86_expand_compare (code, op0, op1);
19108 PUT_MODE (ret, QImode);
19109 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19110 }
19111
19112 /* Expand a comparison setting or clearing the carry flag. Return true when
19113 successful, and set *POP to the comparison for the operation. */
19114 static bool
19115 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19116 {
19117 enum machine_mode mode =
19118 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19119
19120 /* Do not handle double-mode compares that go through special path. */
19121 if (mode == (TARGET_64BIT ? TImode : DImode))
19122 return false;
19123
19124 if (SCALAR_FLOAT_MODE_P (mode))
19125 {
19126 rtx compare_op, compare_seq;
19127
19128 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19129
19130 /* Shortcut: the following common codes never translate
19131 into carry flag compares. */
19132 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19133 || code == ORDERED || code == UNORDERED)
19134 return false;
19135
19136 /* These comparisons require the zero flag; swap operands so they don't. */
19137 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19138 && !TARGET_IEEE_FP)
19139 {
19140 rtx tmp = op0;
19141 op0 = op1;
19142 op1 = tmp;
19143 code = swap_condition (code);
19144 }
19145
19146 /* Try to expand the comparison and verify that we end up with
19147 a carry flag based comparison. This fails to be true only when
19148 we decide to expand the comparison using arithmetic, which is
19149 not a common scenario. */
19150 start_sequence ();
19151 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19152 compare_seq = get_insns ();
19153 end_sequence ();
19154
19155 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19156 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19157 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19158 else
19159 code = GET_CODE (compare_op);
19160
19161 if (code != LTU && code != GEU)
19162 return false;
19163
19164 emit_insn (compare_seq);
19165 *pop = compare_op;
19166 return true;
19167 }
19168
19169 if (!INTEGRAL_MODE_P (mode))
19170 return false;
19171
19172 switch (code)
19173 {
19174 case LTU:
19175 case GEU:
19176 break;
19177
19178 /* Convert a==0 into (unsigned)a<1. */
19179 case EQ:
19180 case NE:
19181 if (op1 != const0_rtx)
19182 return false;
19183 op1 = const1_rtx;
19184 code = (code == EQ ? LTU : GEU);
19185 break;
19186
19187 /* Convert a>b into b<a or a>=b+1. */
19188 case GTU:
19189 case LEU:
19190 if (CONST_INT_P (op1))
19191 {
19192 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19193 /* Bail out on overflow. We could still swap the operands, but that
19194 would force loading of the constant into a register. */
19195 if (op1 == const0_rtx
19196 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19197 return false;
19198 code = (code == GTU ? GEU : LTU);
19199 }
19200 else
19201 {
19202 rtx tmp = op1;
19203 op1 = op0;
19204 op0 = tmp;
19205 code = (code == GTU ? LTU : GEU);
19206 }
19207 break;
19208
19209 /* Convert a>=0 into (unsigned)a<0x80000000. */
19210 case LT:
19211 case GE:
19212 if (mode == DImode || op1 != const0_rtx)
19213 return false;
19214 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19215 code = (code == LT ? GEU : LTU);
19216 break;
19217 case LE:
19218 case GT:
19219 if (mode == DImode || op1 != constm1_rtx)
19220 return false;
19221 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19222 code = (code == LE ? GEU : LTU);
19223 break;
19224
19225 default:
19226 return false;
19227 }
19228 /* Swapping operands may cause the constant to appear as the first operand. */
19229 if (!nonimmediate_operand (op0, VOIDmode))
19230 {
19231 if (!can_create_pseudo_p ())
19232 return false;
19233 op0 = force_reg (mode, op0);
19234 }
19235 *pop = ix86_expand_compare (code, op0, op1);
19236 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19237 return true;
19238 }
19239
19240 bool
19241 ix86_expand_int_movcc (rtx operands[])
19242 {
19243 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19244 rtx compare_seq, compare_op;
19245 enum machine_mode mode = GET_MODE (operands[0]);
19246 bool sign_bit_compare_p = false;
19247 rtx op0 = XEXP (operands[1], 0);
19248 rtx op1 = XEXP (operands[1], 1);
19249
19250 if (GET_MODE (op0) == TImode
19251 || (GET_MODE (op0) == DImode
19252 && !TARGET_64BIT))
19253 return false;
19254
19255 start_sequence ();
19256 compare_op = ix86_expand_compare (code, op0, op1);
19257 compare_seq = get_insns ();
19258 end_sequence ();
19259
19260 compare_code = GET_CODE (compare_op);
19261
19262 if ((op1 == const0_rtx && (code == GE || code == LT))
19263 || (op1 == constm1_rtx && (code == GT || code == LE)))
19264 sign_bit_compare_p = true;
19265
19266 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19267 HImode insns, we'd be swallowed in word prefix ops. */
19268
19269 if ((mode != HImode || TARGET_FAST_PREFIX)
19270 && (mode != (TARGET_64BIT ? TImode : DImode))
19271 && CONST_INT_P (operands[2])
19272 && CONST_INT_P (operands[3]))
19273 {
19274 rtx out = operands[0];
19275 HOST_WIDE_INT ct = INTVAL (operands[2]);
19276 HOST_WIDE_INT cf = INTVAL (operands[3]);
19277 HOST_WIDE_INT diff;
19278
19279 diff = ct - cf;
19280 /* Sign bit compares are better done using shifts than by
19281 using sbb. */
19282 if (sign_bit_compare_p
19283 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19284 {
19285 /* Detect overlap between destination and compare sources. */
19286 rtx tmp = out;
19287
19288 if (!sign_bit_compare_p)
19289 {
19290 rtx flags;
19291 bool fpcmp = false;
19292
19293 compare_code = GET_CODE (compare_op);
19294
19295 flags = XEXP (compare_op, 0);
19296
19297 if (GET_MODE (flags) == CCFPmode
19298 || GET_MODE (flags) == CCFPUmode)
19299 {
19300 fpcmp = true;
19301 compare_code
19302 = ix86_fp_compare_code_to_integer (compare_code);
19303 }
19304
19305 /* To simplify the rest of the code, restrict to the GEU case. */
19306 if (compare_code == LTU)
19307 {
19308 HOST_WIDE_INT tmp = ct;
19309 ct = cf;
19310 cf = tmp;
19311 compare_code = reverse_condition (compare_code);
19312 code = reverse_condition (code);
19313 }
19314 else
19315 {
19316 if (fpcmp)
19317 PUT_CODE (compare_op,
19318 reverse_condition_maybe_unordered
19319 (GET_CODE (compare_op)));
19320 else
19321 PUT_CODE (compare_op,
19322 reverse_condition (GET_CODE (compare_op)));
19323 }
19324 diff = ct - cf;
19325
19326 if (reg_overlap_mentioned_p (out, op0)
19327 || reg_overlap_mentioned_p (out, op1))
19328 tmp = gen_reg_rtx (mode);
19329
19330 if (mode == DImode)
19331 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19332 else
19333 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19334 flags, compare_op));
19335 }
19336 else
19337 {
19338 if (code == GT || code == GE)
19339 code = reverse_condition (code);
19340 else
19341 {
19342 HOST_WIDE_INT tmp = ct;
19343 ct = cf;
19344 cf = tmp;
19345 diff = ct - cf;
19346 }
19347 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19348 }
19349
19350 if (diff == 1)
19351 {
19352 /*
19353 * cmpl op0,op1
19354 * sbbl dest,dest
19355 * [addl dest, ct]
19356 *
19357 * Size 5 - 8.
19358 */
19359 if (ct)
19360 tmp = expand_simple_binop (mode, PLUS,
19361 tmp, GEN_INT (ct),
19362 copy_rtx (tmp), 1, OPTAB_DIRECT);
19363 }
19364 else if (cf == -1)
19365 {
19366 /*
19367 * cmpl op0,op1
19368 * sbbl dest,dest
19369 * orl $ct, dest
19370 *
19371 * Size 8.
19372 */
19373 tmp = expand_simple_binop (mode, IOR,
19374 tmp, GEN_INT (ct),
19375 copy_rtx (tmp), 1, OPTAB_DIRECT);
19376 }
19377 else if (diff == -1 && ct)
19378 {
19379 /*
19380 * cmpl op0,op1
19381 * sbbl dest,dest
19382 * notl dest
19383 * [addl dest, cf]
19384 *
19385 * Size 8 - 11.
19386 */
19387 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19388 if (cf)
19389 tmp = expand_simple_binop (mode, PLUS,
19390 copy_rtx (tmp), GEN_INT (cf),
19391 copy_rtx (tmp), 1, OPTAB_DIRECT);
19392 }
19393 else
19394 {
19395 /*
19396 * cmpl op0,op1
19397 * sbbl dest,dest
19398 * [notl dest]
19399 * andl cf - ct, dest
19400 * [addl dest, ct]
19401 *
19402 * Size 8 - 11.
19403 */
19404
19405 if (cf == 0)
19406 {
19407 cf = ct;
19408 ct = 0;
19409 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19410 }
19411
19412 tmp = expand_simple_binop (mode, AND,
19413 copy_rtx (tmp),
19414 gen_int_mode (cf - ct, mode),
19415 copy_rtx (tmp), 1, OPTAB_DIRECT);
19416 if (ct)
19417 tmp = expand_simple_binop (mode, PLUS,
19418 copy_rtx (tmp), GEN_INT (ct),
19419 copy_rtx (tmp), 1, OPTAB_DIRECT);
19420 }
19421
19422 if (!rtx_equal_p (tmp, out))
19423 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19424
19425 return true;
19426 }
19427
19428 if (diff < 0)
19429 {
19430 enum machine_mode cmp_mode = GET_MODE (op0);
19431
19432 HOST_WIDE_INT tmp;
19433 tmp = ct, ct = cf, cf = tmp;
19434 diff = -diff;
19435
19436 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19437 {
19438 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19439
19440 /* We may be reversing an unordered compare to a normal compare, which
19441 is not valid in general (we may convert a non-trapping condition
19442 into a trapping one); however, on i386 we currently emit all
19443 comparisons unordered. */
19444 compare_code = reverse_condition_maybe_unordered (compare_code);
19445 code = reverse_condition_maybe_unordered (code);
19446 }
19447 else
19448 {
19449 compare_code = reverse_condition (compare_code);
19450 code = reverse_condition (code);
19451 }
19452 }
19453
19454 compare_code = UNKNOWN;
19455 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19456 && CONST_INT_P (op1))
19457 {
19458 if (op1 == const0_rtx
19459 && (code == LT || code == GE))
19460 compare_code = code;
19461 else if (op1 == constm1_rtx)
19462 {
19463 if (code == LE)
19464 compare_code = LT;
19465 else if (code == GT)
19466 compare_code = GE;
19467 }
19468 }
19469
19470 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19471 if (compare_code != UNKNOWN
19472 && GET_MODE (op0) == GET_MODE (out)
19473 && (cf == -1 || ct == -1))
19474 {
19475 /* If lea code below could be used, only optimize
19476 if it results in a 2 insn sequence. */
19477
19478 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19479 || diff == 3 || diff == 5 || diff == 9)
19480 || (compare_code == LT && ct == -1)
19481 || (compare_code == GE && cf == -1))
19482 {
19483 /*
19484 * notl op1 (if necessary)
19485 * sarl $31, op1
19486 * orl cf, op1
19487 */
19488 if (ct != -1)
19489 {
19490 cf = ct;
19491 ct = -1;
19492 code = reverse_condition (code);
19493 }
19494
19495 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19496
19497 out = expand_simple_binop (mode, IOR,
19498 out, GEN_INT (cf),
19499 out, 1, OPTAB_DIRECT);
19500 if (out != operands[0])
19501 emit_move_insn (operands[0], out);
19502
19503 return true;
19504 }
19505 }
19506
19507
19508 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19509 || diff == 3 || diff == 5 || diff == 9)
19510 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19511 && (mode != DImode
19512 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19513 {
19514 /*
19515 * xorl dest,dest
19516 * cmpl op1,op2
19517 * setcc dest
19518 * lea cf(dest*(ct-cf)),dest
19519 *
19520 * Size 14.
19521 *
19522 * This also catches the degenerate setcc-only case.
19523 */
19524
19525 rtx tmp;
19526 int nops;
19527
19528 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19529
19530 nops = 0;
19531 /* On x86_64 the lea instruction operates on Pmode, so we need
19532 to get the arithmetic done in the proper mode to match. */
19533 if (diff == 1)
19534 tmp = copy_rtx (out);
19535 else
19536 {
19537 rtx out1;
19538 out1 = copy_rtx (out);
19539 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19540 nops++;
19541 if (diff & 1)
19542 {
19543 tmp = gen_rtx_PLUS (mode, tmp, out1);
19544 nops++;
19545 }
19546 }
19547 if (cf != 0)
19548 {
19549 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19550 nops++;
19551 }
19552 if (!rtx_equal_p (tmp, out))
19553 {
19554 if (nops == 1)
19555 out = force_operand (tmp, copy_rtx (out));
19556 else
19557 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19558 }
19559 if (!rtx_equal_p (out, operands[0]))
19560 emit_move_insn (operands[0], copy_rtx (out));
19561
19562 return true;
19563 }
19564
19565 /*
19566 * General case: Jumpful:
19567 * xorl dest,dest cmpl op1, op2
19568 * cmpl op1, op2 movl ct, dest
19569 * setcc dest jcc 1f
19570 * decl dest movl cf, dest
19571 * andl (cf-ct),dest 1:
19572 * addl ct,dest
19573 *
19574 * Size 20. Size 14.
19575 *
19576 * This is reasonably steep, but branch mispredict costs are
19577 * high on modern cpus, so consider failing only if optimizing
19578 * for space.
19579 */
19580
19581 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19582 && BRANCH_COST (optimize_insn_for_speed_p (),
19583 false) >= 2)
19584 {
19585 if (cf == 0)
19586 {
19587 enum machine_mode cmp_mode = GET_MODE (op0);
19588
19589 cf = ct;
19590 ct = 0;
19591
19592 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19593 {
19594 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19595
19596 /* We may be reversing an unordered compare to a normal compare,
19597 which is not valid in general (we may convert a non-trapping
19598 condition into a trapping one); however, on i386 we currently
19599 emit all comparisons unordered. */
19600 code = reverse_condition_maybe_unordered (code);
19601 }
19602 else
19603 {
19604 code = reverse_condition (code);
19605 if (compare_code != UNKNOWN)
19606 compare_code = reverse_condition (compare_code);
19607 }
19608 }
19609
19610 if (compare_code != UNKNOWN)
19611 {
19612 /* notl op1 (if needed)
19613 sarl $31, op1
19614 andl (cf-ct), op1
19615 addl ct, op1
19616
19617 For x < 0 (resp. x <= -1) there will be no notl,
19618 so if possible swap the constants to get rid of the
19619 complement.
19620 True/false will be -1/0 while code below (store flag
19621 followed by decrement) is 0/-1, so the constants need
19622 to be exchanged once more. */
19623
19624 if (compare_code == GE || !cf)
19625 {
19626 code = reverse_condition (code);
19627 compare_code = LT;
19628 }
19629 else
19630 {
19631 HOST_WIDE_INT tmp = cf;
19632 cf = ct;
19633 ct = tmp;
19634 }
19635
19636 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19637 }
19638 else
19639 {
19640 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19641
19642 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19643 constm1_rtx,
19644 copy_rtx (out), 1, OPTAB_DIRECT);
19645 }
19646
19647 out = expand_simple_binop (mode, AND, copy_rtx (out),
19648 gen_int_mode (cf - ct, mode),
19649 copy_rtx (out), 1, OPTAB_DIRECT);
19650 if (ct)
19651 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19652 copy_rtx (out), 1, OPTAB_DIRECT);
19653 if (!rtx_equal_p (out, operands[0]))
19654 emit_move_insn (operands[0], copy_rtx (out));
19655
19656 return true;
19657 }
19658 }
19659
19660 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19661 {
19662 /* Try a few things more with specific constants and a variable. */
19663
19664 optab op;
19665 rtx var, orig_out, out, tmp;
19666
19667 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19668 return false;
19669
19670 /* If one of the two operands is an interesting constant, load 0 or -1
19671 via a recursive movcc and mask the variable in with a logical operation. */
19672
19673 if (CONST_INT_P (operands[2]))
19674 {
19675 var = operands[3];
19676 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19677 operands[3] = constm1_rtx, op = and_optab;
19678 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19679 operands[3] = const0_rtx, op = ior_optab;
19680 else
19681 return false;
19682 }
19683 else if (CONST_INT_P (operands[3]))
19684 {
19685 var = operands[2];
19686 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19687 operands[2] = constm1_rtx, op = and_optab;
19688 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19689 operands[2] = const0_rtx, op = ior_optab;
19690 else
19691 return false;
19692 }
19693 else
19694 return false;
19695
19696 orig_out = operands[0];
19697 tmp = gen_reg_rtx (mode);
19698 operands[0] = tmp;
19699
19700 /* Recurse to get the constant loaded. */
19701 if (ix86_expand_int_movcc (operands) == 0)
19702 return false;
19703
19704 /* Mask in the interesting variable. */
19705 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19706 OPTAB_WIDEN);
19707 if (!rtx_equal_p (out, orig_out))
19708 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19709
19710 return true;
19711 }
19712
19713 /*
19714 * For comparison with above,
19715 *
19716 * movl cf,dest
19717 * movl ct,tmp
19718 * cmpl op1,op2
19719 * cmovcc tmp,dest
19720 *
19721 * Size 15.
19722 */
19723
19724 if (! nonimmediate_operand (operands[2], mode))
19725 operands[2] = force_reg (mode, operands[2]);
19726 if (! nonimmediate_operand (operands[3], mode))
19727 operands[3] = force_reg (mode, operands[3]);
19728
19729 if (! register_operand (operands[2], VOIDmode)
19730 && (mode == QImode
19731 || ! register_operand (operands[3], VOIDmode)))
19732 operands[2] = force_reg (mode, operands[2]);
19733
19734 if (mode == QImode
19735 && ! register_operand (operands[3], VOIDmode))
19736 operands[3] = force_reg (mode, operands[3]);
19737
19738 emit_insn (compare_seq);
19739 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19740 gen_rtx_IF_THEN_ELSE (mode,
19741 compare_op, operands[2],
19742 operands[3])));
19743 return true;
19744 }
19745
19746 /* Swap, force into registers, or otherwise massage the two operands
19747 to an sse comparison with a mask result. Thus we differ a bit from
19748 ix86_prepare_fp_compare_args which expects to produce a flags result.
19749
19750 The DEST operand exists to help determine whether to commute commutative
19751 operators. The POP0/POP1 operands are updated in place. The new
19752 comparison code is returned, or UNKNOWN if not implementable. */
19753
19754 static enum rtx_code
19755 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19756 rtx *pop0, rtx *pop1)
19757 {
19758 rtx tmp;
19759
19760 switch (code)
19761 {
19762 case LTGT:
19763 case UNEQ:
19764 /* AVX supports all the needed comparisons. */
19765 if (TARGET_AVX)
19766 break;
19767 /* We have no LTGT as an operator. We could implement it with
19768 NE & ORDERED, but this requires an extra temporary. It's
19769 not clear that it's worth it. */
19770 return UNKNOWN;
19771
19772 case LT:
19773 case LE:
19774 case UNGT:
19775 case UNGE:
19776 /* These are supported directly. */
19777 break;
19778
19779 case EQ:
19780 case NE:
19781 case UNORDERED:
19782 case ORDERED:
19783 /* AVX has 3 operand comparisons, no need to swap anything. */
19784 if (TARGET_AVX)
19785 break;
19786 /* For commutative operators, try to canonicalize the destination
19787 operand to be first in the comparison - this helps reload to
19788 avoid extra moves. */
19789 if (!dest || !rtx_equal_p (dest, *pop1))
19790 break;
19791 /* FALLTHRU */
19792
19793 case GE:
19794 case GT:
19795 case UNLE:
19796 case UNLT:
19797 /* These are not supported directly before AVX, and furthermore
19798 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19799 comparison operands to transform into something that is
19800 supported. */
19801 tmp = *pop0;
19802 *pop0 = *pop1;
19803 *pop1 = tmp;
19804 code = swap_condition (code);
19805 break;
19806
19807 default:
19808 gcc_unreachable ();
19809 }
19810
19811 return code;
19812 }
19813
19814 /* Detect conditional moves that exactly match min/max operational
19815 semantics. Note that this is IEEE safe, as long as we don't
19816 interchange the operands.
19817
19818 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19819 and TRUE if the operation is successful and instructions are emitted. */
19820
19821 static bool
19822 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19823 rtx cmp_op1, rtx if_true, rtx if_false)
19824 {
19825 enum machine_mode mode;
19826 bool is_min;
19827 rtx tmp;
19828
19829 if (code == LT)
19830 ;
19831 else if (code == UNGE)
19832 {
19833 tmp = if_true;
19834 if_true = if_false;
19835 if_false = tmp;
19836 }
19837 else
19838 return false;
19839
19840 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19841 is_min = true;
19842 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19843 is_min = false;
19844 else
19845 return false;
19846
19847 mode = GET_MODE (dest);
19848
19849 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19850 but MODE may be a vector mode and thus not appropriate. */
19851 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19852 {
19853 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19854 rtvec v;
19855
19856 if_true = force_reg (mode, if_true);
19857 v = gen_rtvec (2, if_true, if_false);
19858 tmp = gen_rtx_UNSPEC (mode, v, u);
19859 }
19860 else
19861 {
19862 code = is_min ? SMIN : SMAX;
19863 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19864 }
19865
19866 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19867 return true;
19868 }
19869
19870 /* Expand an sse vector comparison. Return the register with the result. */
19871
19872 static rtx
19873 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19874 rtx op_true, rtx op_false)
19875 {
19876 enum machine_mode mode = GET_MODE (dest);
19877 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19878 rtx x;
19879
19880 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19881 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19882 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19883
19884 if (optimize
19885 || reg_overlap_mentioned_p (dest, op_true)
19886 || reg_overlap_mentioned_p (dest, op_false))
19887 dest = gen_reg_rtx (mode);
19888
19889 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19890 if (cmp_mode != mode)
19891 {
19892 x = force_reg (cmp_mode, x);
19893 convert_move (dest, x, false);
19894 }
19895 else
19896 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19897
19898 return dest;
19899 }
19900
19901 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19902 operations. This is used for both scalar and vector conditional moves. */
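/* Without a blend instruction the fallback below computes
dest = (cmp & op_true) | (~cmp & op_false); SSE4.1 blendv*, AVX/AVX2
256-bit blends and XOP's vector conditional move do it in one insn. */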
19903
19904 static void
19905 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19906 {
19907 enum machine_mode mode = GET_MODE (dest);
19908 rtx t2, t3, x;
19909
19910 if (vector_all_ones_operand (op_true, mode)
19911 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19912 {
19913 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19914 }
19915 else if (op_false == CONST0_RTX (mode))
19916 {
19917 op_true = force_reg (mode, op_true);
19918 x = gen_rtx_AND (mode, cmp, op_true);
19919 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19920 }
19921 else if (op_true == CONST0_RTX (mode))
19922 {
19923 op_false = force_reg (mode, op_false);
19924 x = gen_rtx_NOT (mode, cmp);
19925 x = gen_rtx_AND (mode, x, op_false);
19926 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19927 }
19928 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19929 {
19930 op_false = force_reg (mode, op_false);
19931 x = gen_rtx_IOR (mode, cmp, op_false);
19932 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19933 }
19934 else if (TARGET_XOP)
19935 {
19936 op_true = force_reg (mode, op_true);
19937
19938 if (!nonimmediate_operand (op_false, mode))
19939 op_false = force_reg (mode, op_false);
19940
19941 emit_insn (gen_rtx_SET (mode, dest,
19942 gen_rtx_IF_THEN_ELSE (mode, cmp,
19943 op_true,
19944 op_false)));
19945 }
19946 else
19947 {
19948 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19949
19950 if (!nonimmediate_operand (op_true, mode))
19951 op_true = force_reg (mode, op_true);
19952
19953 op_false = force_reg (mode, op_false);
19954
19955 switch (mode)
19956 {
19957 case V4SFmode:
19958 if (TARGET_SSE4_1)
19959 gen = gen_sse4_1_blendvps;
19960 break;
19961 case V2DFmode:
19962 if (TARGET_SSE4_1)
19963 gen = gen_sse4_1_blendvpd;
19964 break;
19965 case V16QImode:
19966 case V8HImode:
19967 case V4SImode:
19968 case V2DImode:
19969 if (TARGET_SSE4_1)
19970 {
19971 gen = gen_sse4_1_pblendvb;
19972 dest = gen_lowpart (V16QImode, dest);
19973 op_false = gen_lowpart (V16QImode, op_false);
19974 op_true = gen_lowpart (V16QImode, op_true);
19975 cmp = gen_lowpart (V16QImode, cmp);
19976 }
19977 break;
19978 case V8SFmode:
19979 if (TARGET_AVX)
19980 gen = gen_avx_blendvps256;
19981 break;
19982 case V4DFmode:
19983 if (TARGET_AVX)
19984 gen = gen_avx_blendvpd256;
19985 break;
19986 case V32QImode:
19987 case V16HImode:
19988 case V8SImode:
19989 case V4DImode:
19990 if (TARGET_AVX2)
19991 {
19992 gen = gen_avx2_pblendvb;
19993 dest = gen_lowpart (V32QImode, dest);
19994 op_false = gen_lowpart (V32QImode, op_false);
19995 op_true = gen_lowpart (V32QImode, op_true);
19996 cmp = gen_lowpart (V32QImode, cmp);
19997 }
19998 break;
19999 default:
20000 break;
20001 }
20002
20003 if (gen != NULL)
20004 emit_insn (gen (dest, op_false, op_true, cmp));
20005 else
20006 {
20007 op_true = force_reg (mode, op_true);
20008
20009 t2 = gen_reg_rtx (mode);
20010 if (optimize)
20011 t3 = gen_reg_rtx (mode);
20012 else
20013 t3 = dest;
20014
20015 x = gen_rtx_AND (mode, op_true, cmp);
20016 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20017
20018 x = gen_rtx_NOT (mode, cmp);
20019 x = gen_rtx_AND (mode, x, op_false);
20020 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20021
20022 x = gen_rtx_IOR (mode, t3, t2);
20023 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20024 }
20025 }
20026 }
20027
20028 /* Expand a floating-point conditional move. Return true if successful. */
20029
20030 bool
20031 ix86_expand_fp_movcc (rtx operands[])
20032 {
20033 enum machine_mode mode = GET_MODE (operands[0]);
20034 enum rtx_code code = GET_CODE (operands[1]);
20035 rtx tmp, compare_op;
20036 rtx op0 = XEXP (operands[1], 0);
20037 rtx op1 = XEXP (operands[1], 1);
20038
20039 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20040 {
20041 enum machine_mode cmode;
20042
20043 /* Since we've no cmove for sse registers, don't force bad register
20044 allocation just to gain access to it. Deny movcc when the
20045 comparison mode doesn't match the move mode. */
20046 cmode = GET_MODE (op0);
20047 if (cmode == VOIDmode)
20048 cmode = GET_MODE (op1);
20049 if (cmode != mode)
20050 return false;
20051
20052 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20053 if (code == UNKNOWN)
20054 return false;
20055
20056 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20057 operands[2], operands[3]))
20058 return true;
20059
20060 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20061 operands[2], operands[3]);
20062 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20063 return true;
20064 }
20065
20066 if (GET_MODE (op0) == TImode
20067 || (GET_MODE (op0) == DImode
20068 && !TARGET_64BIT))
20069 return false;
20070
20071 /* The floating point conditional move instructions don't directly
20072 support conditions resulting from a signed integer comparison. */
20073
20074 compare_op = ix86_expand_compare (code, op0, op1);
20075 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20076 {
20077 tmp = gen_reg_rtx (QImode);
20078 ix86_expand_setcc (tmp, code, op0, op1);
20079
20080 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20081 }
20082
20083 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20084 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20085 operands[2], operands[3])));
20086
20087 return true;
20088 }
20089
20090 /* Expand a floating-point vector conditional move; a vcond operation
20091 rather than a movcc operation. */
20092
20093 bool
20094 ix86_expand_fp_vcond (rtx operands[])
20095 {
20096 enum rtx_code code = GET_CODE (operands[3]);
20097 rtx cmp;
20098
20099 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20100 &operands[4], &operands[5]);
20101 if (code == UNKNOWN)
20102 {
20103 rtx temp;
20104 switch (GET_CODE (operands[3]))
20105 {
20106 case LTGT:
20107 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20108 operands[5], operands[0], operands[0]);
20109 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20110 operands[5], operands[1], operands[2]);
20111 code = AND;
20112 break;
20113 case UNEQ:
20114 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20115 operands[5], operands[0], operands[0]);
20116 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20117 operands[5], operands[1], operands[2]);
20118 code = IOR;
20119 break;
20120 default:
20121 gcc_unreachable ();
20122 }
20123 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20124 OPTAB_DIRECT);
20125 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20126 return true;
20127 }
20128
20129 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20130 operands[5], operands[1], operands[2]))
20131 return true;
20132
20133 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20134 operands[1], operands[2]);
20135 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20136 return true;
20137 }
20138
20139 /* Expand a signed/unsigned integral vector conditional move. */
20140
20141 bool
20142 ix86_expand_int_vcond (rtx operands[])
20143 {
20144 enum machine_mode data_mode = GET_MODE (operands[0]);
20145 enum machine_mode mode = GET_MODE (operands[4]);
20146 enum rtx_code code = GET_CODE (operands[3]);
20147 bool negate = false;
20148 rtx x, cop0, cop1;
20149
20150 cop0 = operands[4];
20151 cop1 = operands[5];
20152
20153 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20154 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20155 if ((code == LT || code == GE)
20156 && data_mode == mode
20157 && cop1 == CONST0_RTX (mode)
20158 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20159 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20160 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20161 && (GET_MODE_SIZE (data_mode) == 16
20162 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20163 {
20164 rtx negop = operands[2 - (code == LT)];
20165 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20166 if (negop == CONST1_RTX (data_mode))
20167 {
20168 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20169 operands[0], 1, OPTAB_DIRECT);
20170 if (res != operands[0])
20171 emit_move_insn (operands[0], res);
20172 return true;
20173 }
20174 else if (GET_MODE_INNER (data_mode) != DImode
20175 && vector_all_ones_operand (negop, data_mode))
20176 {
20177 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20178 operands[0], 0, OPTAB_DIRECT);
20179 if (res != operands[0])
20180 emit_move_insn (operands[0], res);
20181 return true;
20182 }
20183 }
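/* Worked example for the optimization above, for illustration only: with
   V4SImode elements the inner size is 32 bits, so shift == 31 and, per
   element,

       x < 0 ? -1 : 0   becomes   (int) x >> 31            (arithmetic shift)
       x < 0 ?  1 : 0   becomes   (unsigned int) x >> 31   (logical shift)

   so no vector comparison instruction is needed at all.  */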
20184
20185 if (!nonimmediate_operand (cop1, mode))
20186 cop1 = force_reg (mode, cop1);
20187 if (!general_operand (operands[1], data_mode))
20188 operands[1] = force_reg (data_mode, operands[1]);
20189 if (!general_operand (operands[2], data_mode))
20190 operands[2] = force_reg (data_mode, operands[2]);
20191
20192 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20193 if (TARGET_XOP
20194 && (mode == V16QImode || mode == V8HImode
20195 || mode == V4SImode || mode == V2DImode))
20196 ;
20197 else
20198 {
20199 /* Canonicalize the comparison to EQ, GT, GTU. */
20200 switch (code)
20201 {
20202 case EQ:
20203 case GT:
20204 case GTU:
20205 break;
20206
20207 case NE:
20208 case LE:
20209 case LEU:
20210 code = reverse_condition (code);
20211 negate = true;
20212 break;
20213
20214 case GE:
20215 case GEU:
20216 code = reverse_condition (code);
20217 negate = true;
20218 /* FALLTHRU */
20219
20220 case LT:
20221 case LTU:
20222 code = swap_condition (code);
20223 x = cop0, cop0 = cop1, cop1 = x;
20224 break;
20225
20226 default:
20227 gcc_unreachable ();
20228 }
20229
20230 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20231 if (mode == V2DImode)
20232 {
20233 switch (code)
20234 {
20235 case EQ:
20236 /* SSE4.1 supports EQ. */
20237 if (!TARGET_SSE4_1)
20238 return false;
20239 break;
20240
20241 case GT:
20242 case GTU:
20243 /* SSE4.2 supports GT/GTU. */
20244 if (!TARGET_SSE4_2)
20245 return false;
20246 break;
20247
20248 default:
20249 gcc_unreachable ();
20250 }
20251 }
20252
20253 /* Unsigned parallel compare is not supported by the hardware.
20254 Play some tricks to turn this into a signed comparison
20255 against 0. */
20256 if (code == GTU)
20257 {
20258 cop0 = force_reg (mode, cop0);
20259
20260 switch (mode)
20261 {
20262 case V8SImode:
20263 case V4DImode:
20264 case V4SImode:
20265 case V2DImode:
20266 {
20267 rtx t1, t2, mask;
20268 rtx (*gen_sub3) (rtx, rtx, rtx);
20269
20270 switch (mode)
20271 {
20272 case V8SImode: gen_sub3 = gen_subv8si3; break;
20273 case V4DImode: gen_sub3 = gen_subv4di3; break;
20274 case V4SImode: gen_sub3 = gen_subv4si3; break;
20275 case V2DImode: gen_sub3 = gen_subv2di3; break;
20276 default:
20277 gcc_unreachable ();
20278 }
20279 /* Subtract (-(INT MAX) - 1) from both operands to make
20280 them signed. */
20281 mask = ix86_build_signbit_mask (mode, true, false);
20282 t1 = gen_reg_rtx (mode);
20283 emit_insn (gen_sub3 (t1, cop0, mask));
20284
20285 t2 = gen_reg_rtx (mode);
20286 emit_insn (gen_sub3 (t2, cop1, mask));
20287
20288 cop0 = t1;
20289 cop1 = t2;
20290 code = GT;
20291 }
20292 break;
20293
20294 case V32QImode:
20295 case V16HImode:
20296 case V16QImode:
20297 case V8HImode:
20298 /* Perform a parallel unsigned saturating subtraction. */
20299 x = gen_reg_rtx (mode);
20300 emit_insn (gen_rtx_SET (VOIDmode, x,
20301 gen_rtx_US_MINUS (mode, cop0, cop1)));
20302
20303 cop0 = x;
20304 cop1 = CONST0_RTX (mode);
20305 code = EQ;
20306 negate = !negate;
20307 break;
20308
20309 default:
20310 gcc_unreachable ();
20311 }
20312 }
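/* Illustration of the two GTU tricks above, not part of the original file.
   For dword/qword elements, subtracting the sign-bit mask built by
   ix86_build_signbit_mask flips the top bit of each element, and

       a >u b   <==>   (int) (a ^ 0x80000000) > (int) (b ^ 0x80000000)

   so the unsigned compare becomes a signed GT on the biased values.  For
   byte/word elements, unsigned saturating subtraction satisfies

       a >u b   <==>   (a -sat b) != 0

   which is why the code compares the US_MINUS result against zero with EQ
   and then flips NEGATE.  */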
20313 }
20314
20315 /* Allow the comparison to be done in one mode, but the movcc to
20316 happen in another mode. */
20317 if (data_mode == mode)
20318 {
20319 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20320 operands[1+negate], operands[2-negate]);
20321 }
20322 else
20323 {
20324 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20325 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20326 code, cop0, cop1,
20327 operands[1+negate], operands[2-negate]);
20328 x = gen_lowpart (data_mode, x);
20329 }
20330
20331 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20332 operands[2-negate]);
20333 return true;
20334 }
20335
20336 /* Expand a variable vector permutation. */
20337
20338 void
20339 ix86_expand_vec_perm (rtx operands[])
20340 {
20341 rtx target = operands[0];
20342 rtx op0 = operands[1];
20343 rtx op1 = operands[2];
20344 rtx mask = operands[3];
20345 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20346 enum machine_mode mode = GET_MODE (op0);
20347 enum machine_mode maskmode = GET_MODE (mask);
20348 int w, e, i;
20349 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20350
20351 /* Number of elements in the vector. */
20352 w = GET_MODE_NUNITS (mode);
20353 e = GET_MODE_UNIT_SIZE (mode);
20354 gcc_assert (w <= 32);
20355
20356 if (TARGET_AVX2)
20357 {
20358 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20359 {
20360 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20361 a constant shuffle operand. With a tiny bit of effort we can
20362 use VPERMD instead. A re-interpretation stall for V4DFmode is
20363 unfortunate but there's no avoiding it.
20364 Similarly for V16HImode we don't have instructions for variable
20365 shuffling, while for V32QImode we can, after preparing suitable
20366 masks, use vpshufb; vpshufb; vpermq; vpor. */
20367
20368 if (mode == V16HImode)
20369 {
20370 maskmode = mode = V32QImode;
20371 w = 32;
20372 e = 1;
20373 }
20374 else
20375 {
20376 maskmode = mode = V8SImode;
20377 w = 8;
20378 e = 4;
20379 }
20380 t1 = gen_reg_rtx (maskmode);
20381
20382 /* Replicate the low bits of the V4DImode mask into V8SImode:
20383 mask = { A B C D }
20384 t1 = { A A B B C C D D }. */
20385 for (i = 0; i < w / 2; ++i)
20386 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20387 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20388 vt = force_reg (maskmode, vt);
20389 mask = gen_lowpart (maskmode, mask);
20390 if (maskmode == V8SImode)
20391 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20392 else
20393 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20394
20395 /* Multiply the shuffle indices by two. */
20396 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20397 OPTAB_DIRECT);
20398
20399 /* Add one to the odd shuffle indices:
20400 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20401 for (i = 0; i < w / 2; ++i)
20402 {
20403 vec[i * 2] = const0_rtx;
20404 vec[i * 2 + 1] = const1_rtx;
20405 }
20406 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20407 vt = force_const_mem (maskmode, vt);
20408 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20409 OPTAB_DIRECT);
20410
20411 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20412 operands[3] = mask = t1;
20413 target = gen_lowpart (mode, target);
20414 op0 = gen_lowpart (mode, op0);
20415 op1 = gen_lowpart (mode, op1);
20416 }
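/* Worked example of the mask conversion above, for illustration only.
   Suppose the original V4DImode selector is mask = { 3, 0, 2, 1 }.
   Replicating the low dwords gives t1 = { 3 3 0 0 2 2 1 1 }, doubling
   gives { 6 6 0 0 4 4 2 2 }, and adding { 0 1 0 1 ... } gives
   { 6 7 0 1 4 5 2 3 }, which selects the same qwords when used as a
   V8SImode control for VPERMD/VPERMPS.  */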
20417
20418 switch (mode)
20419 {
20420 case V8SImode:
20421 /* The VPERMD and VPERMPS instructions already properly ignore
20422 the high bits of the shuffle elements. No need for us to
20423 perform an AND ourselves. */
20424 if (one_operand_shuffle)
20425 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20426 else
20427 {
20428 t1 = gen_reg_rtx (V8SImode);
20429 t2 = gen_reg_rtx (V8SImode);
20430 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20431 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20432 goto merge_two;
20433 }
20434 return;
20435
20436 case V8SFmode:
20437 mask = gen_lowpart (V8SFmode, mask);
20438 if (one_operand_shuffle)
20439 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20440 else
20441 {
20442 t1 = gen_reg_rtx (V8SFmode);
20443 t2 = gen_reg_rtx (V8SFmode);
20444 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20445 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20446 goto merge_two;
20447 }
20448 return;
20449
20450 case V4SImode:
20451 /* By combining the two 128-bit input vectors into one 256-bit
20452 input vector, we can use VPERMD and VPERMPS for the full
20453 two-operand shuffle. */
20454 t1 = gen_reg_rtx (V8SImode);
20455 t2 = gen_reg_rtx (V8SImode);
20456 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20457 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20458 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20459 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20460 return;
20461
20462 case V4SFmode:
20463 t1 = gen_reg_rtx (V8SFmode);
20464 t2 = gen_reg_rtx (V8SImode);
20465 mask = gen_lowpart (V4SImode, mask);
20466 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20467 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20468 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20469 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20470 return;
20471
20472 case V32QImode:
20473 t1 = gen_reg_rtx (V32QImode);
20474 t2 = gen_reg_rtx (V32QImode);
20475 t3 = gen_reg_rtx (V32QImode);
20476 vt2 = GEN_INT (128);
20477 for (i = 0; i < 32; i++)
20478 vec[i] = vt2;
20479 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20480 vt = force_reg (V32QImode, vt);
20481 for (i = 0; i < 32; i++)
20482 vec[i] = i < 16 ? vt2 : const0_rtx;
20483 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20484 vt2 = force_reg (V32QImode, vt2);
20485 /* From mask create two adjusted masks, which contain the same
20486 bits as mask in the low 7 bits of each vector element.
20487 The first mask will have the most significant bit clear
20488 if it requests an element from the same 128-bit lane
20489 and the MSB set if it requests an element from the other 128-bit lane.
20490 The second mask will have the opposite values of the MSB,
20491 and additionally will have its 128-bit lanes swapped.
20492 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20493 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20494 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20495 stands for the other 12 bytes. */
20496 /* The bit that says whether an element is from the same lane or the
20497 other lane is bit 4, so shift it up by 3 to the MSB position. */
20498 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20499 gen_lowpart (V4DImode, mask),
20500 GEN_INT (3)));
20501 /* Clear MSB bits from the mask just in case it had them set. */
20502 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20503 /* After this t1 will have MSB set for elements from other lane. */
20504 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20505 /* Clear bits other than MSB. */
20506 emit_insn (gen_andv32qi3 (t1, t1, vt));
20507 /* Or in the lower bits from mask into t3. */
20508 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20509 /* And invert MSB bits in t1, so MSB is set for elements from the same
20510 lane. */
20511 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20512 /* Swap 128-bit lanes in t3. */
20513 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20514 gen_lowpart (V4DImode, t3),
20515 const2_rtx, GEN_INT (3),
20516 const0_rtx, const1_rtx));
20517 /* And or in the lower bits from mask into t1. */
20518 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20519 if (one_operand_shuffle)
20520 {
20521 /* Each of these shuffles will put 0s in places where an
20522 element from the other 128-bit lane is needed; otherwise
20523 it will shuffle in the requested value. */
20524 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20525 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20526 /* For t3 the 128-bit lanes are swapped again. */
20527 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20528 gen_lowpart (V4DImode, t3),
20529 const2_rtx, GEN_INT (3),
20530 const0_rtx, const1_rtx));
20531 /* And oring both together leads to the result. */
20532 emit_insn (gen_iorv32qi3 (target, t1, t3));
20533 return;
20534 }
20535
20536 t4 = gen_reg_rtx (V32QImode);
20537 /* Similar to the one_operand_shuffle code above,
20538 just repeated twice, once for each operand. The merge_two:
20539 code below will merge the two results together. */
20540 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20541 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20542 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20543 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20544 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20545 gen_lowpart (V4DImode, t4),
20546 const2_rtx, GEN_INT (3),
20547 const0_rtx, const1_rtx));
20548 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20549 gen_lowpart (V4DImode, t3),
20550 const2_rtx, GEN_INT (3),
20551 const0_rtx, const1_rtx));
20552 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20553 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20554 t1 = t4;
20555 t2 = t3;
20556 goto merge_two;
20557
20558 default:
20559 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20560 break;
20561 }
20562 }
20563
20564 if (TARGET_XOP)
20565 {
20566 /* The XOP VPPERM insn supports three inputs. By ignoring the
20567 one_operand_shuffle special case, we avoid creating another
20568 set of constant vectors in memory. */
20569 one_operand_shuffle = false;
20570
20571 /* mask = mask & {2*w-1, ...} */
20572 vt = GEN_INT (2*w - 1);
20573 }
20574 else
20575 {
20576 /* mask = mask & {w-1, ...} */
20577 vt = GEN_INT (w - 1);
20578 }
20579
20580 for (i = 0; i < w; i++)
20581 vec[i] = vt;
20582 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20583 mask = expand_simple_binop (maskmode, AND, mask, vt,
20584 NULL_RTX, 0, OPTAB_DIRECT);
20585
20586 /* For non-QImode operations, convert the word permutation control
20587 into a byte permutation control. */
20588 if (mode != V16QImode)
20589 {
20590 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20591 GEN_INT (exact_log2 (e)),
20592 NULL_RTX, 0, OPTAB_DIRECT);
20593
20594 /* Convert mask to vector of chars. */
20595 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20596
20597 /* Replicate each of the input bytes into byte positions:
20598 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20599 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20600 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20601 for (i = 0; i < 16; ++i)
20602 vec[i] = GEN_INT (i/e * e);
20603 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20604 vt = force_const_mem (V16QImode, vt);
20605 if (TARGET_XOP)
20606 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20607 else
20608 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20609
20610 /* Convert it into the byte positions by doing
20611 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20612 for (i = 0; i < 16; ++i)
20613 vec[i] = GEN_INT (i % e);
20614 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20615 vt = force_const_mem (V16QImode, vt);
20616 emit_insn (gen_addv16qi3 (mask, mask, vt));
20617 }
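/* Worked example of the control conversion above, for illustration only.
   For a V4SImode shuffle e == 4, so each masked word index k is shifted
   left by 2 (giving 4*k), replicated into all four byte slots of its
   element by the pshufb with { 0,0,0,0, 4,4,4,4, ... }, and finally
   { 0,1,2,3, 0,1,2,3, ... } is added.  A selector of { 2, 0, 3, 1 } thus
   becomes the byte control { 8,9,10,11, 0,1,2,3, 12,13,14,15, 4,5,6,7 }.  */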
20618
20619 /* The actual shuffle operations all operate on V16QImode. */
20620 op0 = gen_lowpart (V16QImode, op0);
20621 op1 = gen_lowpart (V16QImode, op1);
20622 target = gen_lowpart (V16QImode, target);
20623
20624 if (TARGET_XOP)
20625 {
20626 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20627 }
20628 else if (one_operand_shuffle)
20629 {
20630 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20631 }
20632 else
20633 {
20634 rtx xops[6];
20635 bool ok;
20636
20637 /* Shuffle the two input vectors independently. */
20638 t1 = gen_reg_rtx (V16QImode);
20639 t2 = gen_reg_rtx (V16QImode);
20640 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20641 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20642
20643 merge_two:
20644 /* Then merge them together. The key is whether any given control
20645 element contained a bit set that indicates the second word. */
20646 mask = operands[3];
20647 vt = GEN_INT (w);
20648 if (maskmode == V2DImode && !TARGET_SSE4_1)
20649 {
20650 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20651 more shuffle to convert the V2DI input mask into a V4SI
20652 input mask. At that point the masking done by
20653 ix86_expand_int_vcond will work as desired. */
20654 rtx t3 = gen_reg_rtx (V4SImode);
20655 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20656 const0_rtx, const0_rtx,
20657 const2_rtx, const2_rtx));
20658 mask = t3;
20659 maskmode = V4SImode;
20660 e = w = 4;
20661 }
20662
20663 for (i = 0; i < w; i++)
20664 vec[i] = vt;
20665 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20666 vt = force_reg (maskmode, vt);
20667 mask = expand_simple_binop (maskmode, AND, mask, vt,
20668 NULL_RTX, 0, OPTAB_DIRECT);
20669
20670 xops[0] = gen_lowpart (mode, operands[0]);
20671 xops[1] = gen_lowpart (mode, t2);
20672 xops[2] = gen_lowpart (mode, t1);
20673 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20674 xops[4] = mask;
20675 xops[5] = vt;
20676 ok = ix86_expand_int_vcond (xops);
20677 gcc_assert (ok);
20678 }
20679 }
20680
20681 /* Unpack SRC into DEST using the next wider integer vector type. UNSIGNED_P
20682 is true if we should do zero extension, else sign extension. HIGH_P is
20683 true if we want the N/2 high elements, else the low elements. */
20684
20685 void
20686 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20687 {
20688 enum machine_mode imode = GET_MODE (src);
20689 rtx tmp;
20690
20691 if (TARGET_SSE4_1)
20692 {
20693 rtx (*unpack)(rtx, rtx);
20694 rtx (*extract)(rtx, rtx) = NULL;
20695 enum machine_mode halfmode = BLKmode;
20696
20697 switch (imode)
20698 {
20699 case V32QImode:
20700 if (unsigned_p)
20701 unpack = gen_avx2_zero_extendv16qiv16hi2;
20702 else
20703 unpack = gen_avx2_sign_extendv16qiv16hi2;
20704 halfmode = V16QImode;
20705 extract
20706 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20707 break;
20708 case V16HImode:
20709 if (unsigned_p)
20710 unpack = gen_avx2_zero_extendv8hiv8si2;
20711 else
20712 unpack = gen_avx2_sign_extendv8hiv8si2;
20713 halfmode = V8HImode;
20714 extract
20715 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20716 break;
20717 case V8SImode:
20718 if (unsigned_p)
20719 unpack = gen_avx2_zero_extendv4siv4di2;
20720 else
20721 unpack = gen_avx2_sign_extendv4siv4di2;
20722 halfmode = V4SImode;
20723 extract
20724 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20725 break;
20726 case V16QImode:
20727 if (unsigned_p)
20728 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20729 else
20730 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20731 break;
20732 case V8HImode:
20733 if (unsigned_p)
20734 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20735 else
20736 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20737 break;
20738 case V4SImode:
20739 if (unsigned_p)
20740 unpack = gen_sse4_1_zero_extendv2siv2di2;
20741 else
20742 unpack = gen_sse4_1_sign_extendv2siv2di2;
20743 break;
20744 default:
20745 gcc_unreachable ();
20746 }
20747
20748 if (GET_MODE_SIZE (imode) == 32)
20749 {
20750 tmp = gen_reg_rtx (halfmode);
20751 emit_insn (extract (tmp, src));
20752 }
20753 else if (high_p)
20754 {
20755 /* Shift higher 8 bytes to lower 8 bytes. */
20756 tmp = gen_reg_rtx (imode);
20757 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20758 gen_lowpart (V1TImode, src),
20759 GEN_INT (64)));
20760 }
20761 else
20762 tmp = src;
20763
20764 emit_insn (unpack (dest, tmp));
20765 }
20766 else
20767 {
20768 rtx (*unpack)(rtx, rtx, rtx);
20769
20770 switch (imode)
20771 {
20772 case V16QImode:
20773 if (high_p)
20774 unpack = gen_vec_interleave_highv16qi;
20775 else
20776 unpack = gen_vec_interleave_lowv16qi;
20777 break;
20778 case V8HImode:
20779 if (high_p)
20780 unpack = gen_vec_interleave_highv8hi;
20781 else
20782 unpack = gen_vec_interleave_lowv8hi;
20783 break;
20784 case V4SImode:
20785 if (high_p)
20786 unpack = gen_vec_interleave_highv4si;
20787 else
20788 unpack = gen_vec_interleave_lowv4si;
20789 break;
20790 default:
20791 gcc_unreachable ();
20792 }
20793
20794 if (unsigned_p)
20795 tmp = force_reg (imode, CONST0_RTX (imode));
20796 else
20797 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20798 src, pc_rtx, pc_rtx);
20799
20800 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20801 }
20802 }
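/* Illustrative note, not part of the original file: the pre-SSE4.1 path
   above works because interleaving an element E with a "sign word" yields
   the widened value of E.  For zero extension the sign word is 0; for sign
   extension it is the result of the 0 > E comparison, i.e. all ones exactly
   when E is negative.  For example, interleaving the 16-bit value 0xfffe
   with 0xffff gives the 32-bit value 0xfffffffe, the sign extension of -2.  */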
20803
20804 /* Expand conditional increment or decrement using adc/sbb instructions.
20805 The default case using setcc followed by the conditional move can be
20806 done by generic code. */
20807 bool
20808 ix86_expand_int_addcc (rtx operands[])
20809 {
20810 enum rtx_code code = GET_CODE (operands[1]);
20811 rtx flags;
20812 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20813 rtx compare_op;
20814 rtx val = const0_rtx;
20815 bool fpcmp = false;
20816 enum machine_mode mode;
20817 rtx op0 = XEXP (operands[1], 0);
20818 rtx op1 = XEXP (operands[1], 1);
20819
20820 if (operands[3] != const1_rtx
20821 && operands[3] != constm1_rtx)
20822 return false;
20823 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20824 return false;
20825 code = GET_CODE (compare_op);
20826
20827 flags = XEXP (compare_op, 0);
20828
20829 if (GET_MODE (flags) == CCFPmode
20830 || GET_MODE (flags) == CCFPUmode)
20831 {
20832 fpcmp = true;
20833 code = ix86_fp_compare_code_to_integer (code);
20834 }
20835
20836 if (code != LTU)
20837 {
20838 val = constm1_rtx;
20839 if (fpcmp)
20840 PUT_CODE (compare_op,
20841 reverse_condition_maybe_unordered
20842 (GET_CODE (compare_op)));
20843 else
20844 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20845 }
20846
20847 mode = GET_MODE (operands[0]);
20848
20849 /* Construct either adc or sbb insn. */
20850 if ((code == LTU) == (operands[3] == constm1_rtx))
20851 {
20852 switch (mode)
20853 {
20854 case QImode:
20855 insn = gen_subqi3_carry;
20856 break;
20857 case HImode:
20858 insn = gen_subhi3_carry;
20859 break;
20860 case SImode:
20861 insn = gen_subsi3_carry;
20862 break;
20863 case DImode:
20864 insn = gen_subdi3_carry;
20865 break;
20866 default:
20867 gcc_unreachable ();
20868 }
20869 }
20870 else
20871 {
20872 switch (mode)
20873 {
20874 case QImode:
20875 insn = gen_addqi3_carry;
20876 break;
20877 case HImode:
20878 insn = gen_addhi3_carry;
20879 break;
20880 case SImode:
20881 insn = gen_addsi3_carry;
20882 break;
20883 case DImode:
20884 insn = gen_adddi3_carry;
20885 break;
20886 default:
20887 gcc_unreachable ();
20888 }
20889 }
20890 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20891
20892 return true;
20893 }
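/* Illustration of the transformation above, not part of the original file;
   the register names are only examples.  A conditional increment such as

       unsigned int a, b;
       a += (a < b);

   can be emitted as a compare followed by an add-with-carry:

       cmpl  %ebx, %eax      ; sets CF when a < b (unsigned)
       adcl  $0, %eax

   The decrement cases use sbb instead, and other conditions are first
   reversed so that the carry flag encodes the predicate.  */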
20894
20895
20896 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20897 but works for floating point parameters and non-offsettable memories.
20898 For pushes, it returns just stack offsets; the values will be saved
20899 in the right order. Maximally four parts are generated. */
20900
20901 static int
20902 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20903 {
20904 int size;
20905
20906 if (!TARGET_64BIT)
20907 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20908 else
20909 size = (GET_MODE_SIZE (mode) + 4) / 8;
20910
20911 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20912 gcc_assert (size >= 2 && size <= 4);
20913
20914 /* Optimize constant pool reference to immediates. This is used by fp
20915 moves, that force all constants to memory to allow combining. */
20916 if (MEM_P (operand) && MEM_READONLY_P (operand))
20917 {
20918 rtx tmp = maybe_get_pool_constant (operand);
20919 if (tmp)
20920 operand = tmp;
20921 }
20922
20923 if (MEM_P (operand) && !offsettable_memref_p (operand))
20924 {
20925 /* The only non-offsettable memories we handle are pushes. */
20926 int ok = push_operand (operand, VOIDmode);
20927
20928 gcc_assert (ok);
20929
20930 operand = copy_rtx (operand);
20931 PUT_MODE (operand, word_mode);
20932 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20933 return size;
20934 }
20935
20936 if (GET_CODE (operand) == CONST_VECTOR)
20937 {
20938 enum machine_mode imode = int_mode_for_mode (mode);
20939 /* Caution: if we looked through a constant pool memory above,
20940 the operand may actually have a different mode now. That's
20941 ok, since we want to pun this all the way back to an integer. */
20942 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20943 gcc_assert (operand != NULL);
20944 mode = imode;
20945 }
20946
20947 if (!TARGET_64BIT)
20948 {
20949 if (mode == DImode)
20950 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20951 else
20952 {
20953 int i;
20954
20955 if (REG_P (operand))
20956 {
20957 gcc_assert (reload_completed);
20958 for (i = 0; i < size; i++)
20959 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20960 }
20961 else if (offsettable_memref_p (operand))
20962 {
20963 operand = adjust_address (operand, SImode, 0);
20964 parts[0] = operand;
20965 for (i = 1; i < size; i++)
20966 parts[i] = adjust_address (operand, SImode, 4 * i);
20967 }
20968 else if (GET_CODE (operand) == CONST_DOUBLE)
20969 {
20970 REAL_VALUE_TYPE r;
20971 long l[4];
20972
20973 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20974 switch (mode)
20975 {
20976 case TFmode:
20977 real_to_target (l, &r, mode);
20978 parts[3] = gen_int_mode (l[3], SImode);
20979 parts[2] = gen_int_mode (l[2], SImode);
20980 break;
20981 case XFmode:
20982 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20983 long double may not be 80-bit. */
20984 real_to_target (l, &r, mode);
20985 parts[2] = gen_int_mode (l[2], SImode);
20986 break;
20987 case DFmode:
20988 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20989 break;
20990 default:
20991 gcc_unreachable ();
20992 }
20993 parts[1] = gen_int_mode (l[1], SImode);
20994 parts[0] = gen_int_mode (l[0], SImode);
20995 }
20996 else
20997 gcc_unreachable ();
20998 }
20999 }
21000 else
21001 {
21002 if (mode == TImode)
21003 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21004 if (mode == XFmode || mode == TFmode)
21005 {
21006 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21007 if (REG_P (operand))
21008 {
21009 gcc_assert (reload_completed);
21010 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21011 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21012 }
21013 else if (offsettable_memref_p (operand))
21014 {
21015 operand = adjust_address (operand, DImode, 0);
21016 parts[0] = operand;
21017 parts[1] = adjust_address (operand, upper_mode, 8);
21018 }
21019 else if (GET_CODE (operand) == CONST_DOUBLE)
21020 {
21021 REAL_VALUE_TYPE r;
21022 long l[4];
21023
21024 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21025 real_to_target (l, &r, mode);
21026
21027 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21028 if (HOST_BITS_PER_WIDE_INT >= 64)
21029 parts[0]
21030 = gen_int_mode
21031 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21032 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21033 DImode);
21034 else
21035 parts[0] = immed_double_const (l[0], l[1], DImode);
21036
21037 if (upper_mode == SImode)
21038 parts[1] = gen_int_mode (l[2], SImode);
21039 else if (HOST_BITS_PER_WIDE_INT >= 64)
21040 parts[1]
21041 = gen_int_mode
21042 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21043 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21044 DImode);
21045 else
21046 parts[1] = immed_double_const (l[2], l[3], DImode);
21047 }
21048 else
21049 gcc_unreachable ();
21050 }
21051 }
21052
21053 return size;
21054 }
21055
21056 /* Emit insns to perform a move or push of DI, DF, XF, and TF values
21057 as a sequence of moves of half-mode parts. The destination parts are
21058 collected in operands 2-5 and the corresponding source parts in
21059 operands 6-9, in the correct order, before the moves are emitted. */
21060
21061 void
21062 ix86_split_long_move (rtx operands[])
21063 {
21064 rtx part[2][4];
21065 int nparts, i, j;
21066 int push = 0;
21067 int collisions = 0;
21068 enum machine_mode mode = GET_MODE (operands[0]);
21069 bool collisionparts[4];
21070
21071 /* The DFmode expanders may ask us to move a double.
21072 For a 64bit target this is a single move. By hiding that fact
21073 here we simplify the i386.md splitters. */
21074 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21075 {
21076 /* Optimize constant pool reference to immediates. This is used by
21077 fp moves, that force all constants to memory to allow combining. */
21078
21079 if (MEM_P (operands[1])
21080 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21081 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21082 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21083 if (push_operand (operands[0], VOIDmode))
21084 {
21085 operands[0] = copy_rtx (operands[0]);
21086 PUT_MODE (operands[0], word_mode);
21087 }
21088 else
21089 operands[0] = gen_lowpart (DImode, operands[0]);
21090 operands[1] = gen_lowpart (DImode, operands[1]);
21091 emit_move_insn (operands[0], operands[1]);
21092 return;
21093 }
21094
21095 /* The only non-offsettable memory we handle is push. */
21096 if (push_operand (operands[0], VOIDmode))
21097 push = 1;
21098 else
21099 gcc_assert (!MEM_P (operands[0])
21100 || offsettable_memref_p (operands[0]));
21101
21102 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21103 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21104
21105 /* When emitting push, take care for source operands on the stack. */
21106 if (push && MEM_P (operands[1])
21107 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21108 {
21109 rtx src_base = XEXP (part[1][nparts - 1], 0);
21110
21111 /* Compensate for the stack decrement by 4. */
21112 if (!TARGET_64BIT && nparts == 3
21113 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21114 src_base = plus_constant (Pmode, src_base, 4);
21115
21116 /* src_base refers to the stack pointer and is
21117 automatically decreased by emitted push. */
21118 for (i = 0; i < nparts; i++)
21119 part[1][i] = change_address (part[1][i],
21120 GET_MODE (part[1][i]), src_base);
21121 }
21122
21123 /* We need to do the copy in the right order in case an address register
21124 of the source overlaps the destination. */
21125 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21126 {
21127 rtx tmp;
21128
21129 for (i = 0; i < nparts; i++)
21130 {
21131 collisionparts[i]
21132 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21133 if (collisionparts[i])
21134 collisions++;
21135 }
21136
21137 /* Collision in the middle part can be handled by reordering. */
21138 if (collisions == 1 && nparts == 3 && collisionparts [1])
21139 {
21140 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21141 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21142 }
21143 else if (collisions == 1
21144 && nparts == 4
21145 && (collisionparts [1] || collisionparts [2]))
21146 {
21147 if (collisionparts [1])
21148 {
21149 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21150 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21151 }
21152 else
21153 {
21154 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21155 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21156 }
21157 }
21158
21159 /* If there are more collisions, we can't handle it by reordering.
21160 Do an lea to the last part and use only one colliding move. */
21161 else if (collisions > 1)
21162 {
21163 rtx base;
21164
21165 collisions = 1;
21166
21167 base = part[0][nparts - 1];
21168
21169 /* Handle the case when the last part isn't valid for lea.
21170 Happens in 64-bit mode storing the 12-byte XFmode. */
21171 if (GET_MODE (base) != Pmode)
21172 base = gen_rtx_REG (Pmode, REGNO (base));
21173
21174 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21175 part[1][0] = replace_equiv_address (part[1][0], base);
21176 for (i = 1; i < nparts; i++)
21177 {
21178 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21179 part[1][i] = replace_equiv_address (part[1][i], tmp);
21180 }
21181 }
21182 }
21183
21184 if (push)
21185 {
21186 if (!TARGET_64BIT)
21187 {
21188 if (nparts == 3)
21189 {
21190 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21191 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21192 stack_pointer_rtx, GEN_INT (-4)));
21193 emit_move_insn (part[0][2], part[1][2]);
21194 }
21195 else if (nparts == 4)
21196 {
21197 emit_move_insn (part[0][3], part[1][3]);
21198 emit_move_insn (part[0][2], part[1][2]);
21199 }
21200 }
21201 else
21202 {
21203 /* In 64bit mode we don't have a 32bit push available. In case this is
21204 a register, that is OK: we will just use the larger counterpart. We also
21205 retype memory; this comes from an attempt to avoid the REX prefix on
21206 moving the second half of a TFmode value. */
21207 if (GET_MODE (part[1][1]) == SImode)
21208 {
21209 switch (GET_CODE (part[1][1]))
21210 {
21211 case MEM:
21212 part[1][1] = adjust_address (part[1][1], DImode, 0);
21213 break;
21214
21215 case REG:
21216 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21217 break;
21218
21219 default:
21220 gcc_unreachable ();
21221 }
21222
21223 if (GET_MODE (part[1][0]) == SImode)
21224 part[1][0] = part[1][1];
21225 }
21226 }
21227 emit_move_insn (part[0][1], part[1][1]);
21228 emit_move_insn (part[0][0], part[1][0]);
21229 return;
21230 }
21231
21232 /* Choose correct order to not overwrite the source before it is copied. */
21233 if ((REG_P (part[0][0])
21234 && REG_P (part[1][1])
21235 && (REGNO (part[0][0]) == REGNO (part[1][1])
21236 || (nparts == 3
21237 && REGNO (part[0][0]) == REGNO (part[1][2]))
21238 || (nparts == 4
21239 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21240 || (collisions > 0
21241 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21242 {
21243 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21244 {
21245 operands[2 + i] = part[0][j];
21246 operands[6 + i] = part[1][j];
21247 }
21248 }
21249 else
21250 {
21251 for (i = 0; i < nparts; i++)
21252 {
21253 operands[2 + i] = part[0][i];
21254 operands[6 + i] = part[1][i];
21255 }
21256 }
21257
21258 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21259 if (optimize_insn_for_size_p ())
21260 {
21261 for (j = 0; j < nparts - 1; j++)
21262 if (CONST_INT_P (operands[6 + j])
21263 && operands[6 + j] != const0_rtx
21264 && REG_P (operands[2 + j]))
21265 for (i = j; i < nparts - 1; i++)
21266 if (CONST_INT_P (operands[7 + i])
21267 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21268 operands[7 + i] = operands[2 + j];
21269 }
21270
21271 for (i = 0; i < nparts; i++)
21272 emit_move_insn (operands[2 + i], operands[6 + i]);
21273
21274 return;
21275 }
21276
21277 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21278 left shift by a constant, either using a single shift or
21279 a sequence of add instructions. */
21280
21281 static void
21282 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21283 {
21284 rtx (*insn)(rtx, rtx, rtx);
21285
21286 if (count == 1
21287 || (count * ix86_cost->add <= ix86_cost->shift_const
21288 && !optimize_insn_for_size_p ()))
21289 {
21290 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21291 while (count-- > 0)
21292 emit_insn (insn (operand, operand, operand));
21293 }
21294 else
21295 {
21296 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21297 emit_insn (insn (operand, operand, GEN_INT (count)));
21298 }
21299 }
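/* Illustration, not part of the original file: shifting left by one is the
   same as adding a value to itself, so a count of 1 is always emitted as a
   single add (e.g. "addl %eax, %eax" rather than "sall $1, %eax").  Larger
   counts use a run of adds only when count * add cost does not exceed the
   cost of one constant shift and we are not optimizing for size.  */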
21300
21301 void
21302 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21303 {
21304 rtx (*gen_ashl3)(rtx, rtx, rtx);
21305 rtx (*gen_shld)(rtx, rtx, rtx);
21306 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21307
21308 rtx low[2], high[2];
21309 int count;
21310
21311 if (CONST_INT_P (operands[2]))
21312 {
21313 split_double_mode (mode, operands, 2, low, high);
21314 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21315
21316 if (count >= half_width)
21317 {
21318 emit_move_insn (high[0], low[1]);
21319 emit_move_insn (low[0], const0_rtx);
21320
21321 if (count > half_width)
21322 ix86_expand_ashl_const (high[0], count - half_width, mode);
21323 }
21324 else
21325 {
21326 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21327
21328 if (!rtx_equal_p (operands[0], operands[1]))
21329 emit_move_insn (operands[0], operands[1]);
21330
21331 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21332 ix86_expand_ashl_const (low[0], count, mode);
21333 }
21334 return;
21335 }
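/* Worked example for the constant-count case above, for illustration only.
   On a 32-bit target a DImode shift x << 5 becomes

       shld  $5, %low, %high     ; high = (high << 5) | (low >> 27)
       sall  $5, %low

   while x << 40 needs no shld at all: the low word is copied into the high
   word, the low word is cleared, and the high word is then shifted left by
   the remaining 40 - 32 = 8 bits.  */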
21336
21337 split_double_mode (mode, operands, 1, low, high);
21338
21339 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21340
21341 if (operands[1] == const1_rtx)
21342 {
21343 /* Assuming we've chosen QImode-capable registers, 1 << N
21344 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21345 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21346 {
21347 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21348
21349 ix86_expand_clear (low[0]);
21350 ix86_expand_clear (high[0]);
21351 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21352
21353 d = gen_lowpart (QImode, low[0]);
21354 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21355 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21356 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21357
21358 d = gen_lowpart (QImode, high[0]);
21359 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21360 s = gen_rtx_NE (QImode, flags, const0_rtx);
21361 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21362 }
21363
21364 /* Otherwise, we can get the same results by manually performing
21365 a bit extract operation on bit 5/6, and then performing the two
21366 shifts. The two methods of getting 0/1 into low/high are exactly
21367 the same size. Avoiding the shift in the bit extract case helps
21368 pentium4 a bit; no one else seems to care much either way. */
21369 else
21370 {
21371 enum machine_mode half_mode;
21372 rtx (*gen_lshr3)(rtx, rtx, rtx);
21373 rtx (*gen_and3)(rtx, rtx, rtx);
21374 rtx (*gen_xor3)(rtx, rtx, rtx);
21375 HOST_WIDE_INT bits;
21376 rtx x;
21377
21378 if (mode == DImode)
21379 {
21380 half_mode = SImode;
21381 gen_lshr3 = gen_lshrsi3;
21382 gen_and3 = gen_andsi3;
21383 gen_xor3 = gen_xorsi3;
21384 bits = 5;
21385 }
21386 else
21387 {
21388 half_mode = DImode;
21389 gen_lshr3 = gen_lshrdi3;
21390 gen_and3 = gen_anddi3;
21391 gen_xor3 = gen_xordi3;
21392 bits = 6;
21393 }
21394
21395 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21396 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21397 else
21398 x = gen_lowpart (half_mode, operands[2]);
21399 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21400
21401 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21402 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21403 emit_move_insn (low[0], high[0]);
21404 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21405 }
21406
21407 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21408 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21409 return;
21410 }
21411
21412 if (operands[1] == constm1_rtx)
21413 {
21414 /* For -1 << N, we can avoid the shld instruction, because we
21415 know that we're shifting 0...31/63 ones into a -1. */
21416 emit_move_insn (low[0], constm1_rtx);
21417 if (optimize_insn_for_size_p ())
21418 emit_move_insn (high[0], low[0]);
21419 else
21420 emit_move_insn (high[0], constm1_rtx);
21421 }
21422 else
21423 {
21424 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21425
21426 if (!rtx_equal_p (operands[0], operands[1]))
21427 emit_move_insn (operands[0], operands[1]);
21428
21429 split_double_mode (mode, operands, 1, low, high);
21430 emit_insn (gen_shld (high[0], low[0], operands[2]));
21431 }
21432
21433 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21434
21435 if (TARGET_CMOVE && scratch)
21436 {
21437 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21438 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21439
21440 ix86_expand_clear (scratch);
21441 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21442 }
21443 else
21444 {
21445 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21446 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21447
21448 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21449 }
21450 }
21451
21452 void
21453 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21454 {
21455 rtx (*gen_ashr3)(rtx, rtx, rtx)
21456 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21457 rtx (*gen_shrd)(rtx, rtx, rtx);
21458 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21459
21460 rtx low[2], high[2];
21461 int count;
21462
21463 if (CONST_INT_P (operands[2]))
21464 {
21465 split_double_mode (mode, operands, 2, low, high);
21466 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21467
21468 if (count == GET_MODE_BITSIZE (mode) - 1)
21469 {
21470 emit_move_insn (high[0], high[1]);
21471 emit_insn (gen_ashr3 (high[0], high[0],
21472 GEN_INT (half_width - 1)));
21473 emit_move_insn (low[0], high[0]);
21474
21475 }
21476 else if (count >= half_width)
21477 {
21478 emit_move_insn (low[0], high[1]);
21479 emit_move_insn (high[0], low[0]);
21480 emit_insn (gen_ashr3 (high[0], high[0],
21481 GEN_INT (half_width - 1)));
21482
21483 if (count > half_width)
21484 emit_insn (gen_ashr3 (low[0], low[0],
21485 GEN_INT (count - half_width)));
21486 }
21487 else
21488 {
21489 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21490
21491 if (!rtx_equal_p (operands[0], operands[1]))
21492 emit_move_insn (operands[0], operands[1]);
21493
21494 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21495 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21496 }
21497 }
21498 else
21499 {
21500 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21501
21502 if (!rtx_equal_p (operands[0], operands[1]))
21503 emit_move_insn (operands[0], operands[1]);
21504
21505 split_double_mode (mode, operands, 1, low, high);
21506
21507 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21508 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21509
21510 if (TARGET_CMOVE && scratch)
21511 {
21512 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21513 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21514
21515 emit_move_insn (scratch, high[0]);
21516 emit_insn (gen_ashr3 (scratch, scratch,
21517 GEN_INT (half_width - 1)));
21518 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21519 scratch));
21520 }
21521 else
21522 {
21523 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21524 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21525
21526 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21527 }
21528 }
21529 }
21530
21531 void
21532 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21533 {
21534 rtx (*gen_lshr3)(rtx, rtx, rtx)
21535 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21536 rtx (*gen_shrd)(rtx, rtx, rtx);
21537 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21538
21539 rtx low[2], high[2];
21540 int count;
21541
21542 if (CONST_INT_P (operands[2]))
21543 {
21544 split_double_mode (mode, operands, 2, low, high);
21545 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21546
21547 if (count >= half_width)
21548 {
21549 emit_move_insn (low[0], high[1]);
21550 ix86_expand_clear (high[0]);
21551
21552 if (count > half_width)
21553 emit_insn (gen_lshr3 (low[0], low[0],
21554 GEN_INT (count - half_width)));
21555 }
21556 else
21557 {
21558 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21559
21560 if (!rtx_equal_p (operands[0], operands[1]))
21561 emit_move_insn (operands[0], operands[1]);
21562
21563 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21564 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21565 }
21566 }
21567 else
21568 {
21569 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21570
21571 if (!rtx_equal_p (operands[0], operands[1]))
21572 emit_move_insn (operands[0], operands[1]);
21573
21574 split_double_mode (mode, operands, 1, low, high);
21575
21576 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21577 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21578
21579 if (TARGET_CMOVE && scratch)
21580 {
21581 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21582 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21583
21584 ix86_expand_clear (scratch);
21585 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21586 scratch));
21587 }
21588 else
21589 {
21590 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21591 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21592
21593 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21594 }
21595 }
21596 }
21597
21598 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21599 static void
21600 predict_jump (int prob)
21601 {
21602 rtx insn = get_last_insn ();
21603 gcc_assert (JUMP_P (insn));
21604 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21605 }
21606
21607 /* Helper function for the string operations below. Test whether VARIABLE
21608 is aligned to VALUE bytes. If it is, jump to the returned label. */
21609 static rtx
21610 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21611 {
21612 rtx label = gen_label_rtx ();
21613 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21614 if (GET_MODE (variable) == DImode)
21615 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21616 else
21617 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21618 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21619 1, label);
21620 if (epilogue)
21621 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21622 else
21623 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21624 return label;
21625 }
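/* Illustration, not part of the original file; the register and label names
   are only examples.  A call such as ix86_expand_aligntest (destreg, 4,
   false) emits roughly

       movl  %destreg, %tmp
       andl  $4, %tmp
       je    .Laligned           ; bit 2 clear, skip the fixup

   and returns the label, so the caller can emit the 4-byte fixup code
   before calling emit_label on it.  */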
21626
21627 /* Adjust COUNTREG by subtracting VALUE. */
21628 static void
21629 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21630 {
21631 rtx (*gen_add)(rtx, rtx, rtx)
21632 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21633
21634 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21635 }
21636
21637 /* Zero-extend EXP, which may be in SImode, to a Pmode register. */
21638 rtx
21639 ix86_zero_extend_to_Pmode (rtx exp)
21640 {
21641 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21642 }
21643
21644 /* Divide COUNTREG by SCALE. */
21645 static rtx
21646 scale_counter (rtx countreg, int scale)
21647 {
21648 rtx sc;
21649
21650 if (scale == 1)
21651 return countreg;
21652 if (CONST_INT_P (countreg))
21653 return GEN_INT (INTVAL (countreg) / scale);
21654 gcc_assert (REG_P (countreg));
21655
21656 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21657 GEN_INT (exact_log2 (scale)),
21658 NULL, 1, OPTAB_DIRECT);
21659 return sc;
21660 }
21661
21662 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21663 DImode for constant loop counts. */
21664
21665 static enum machine_mode
21666 counter_mode (rtx count_exp)
21667 {
21668 if (GET_MODE (count_exp) != VOIDmode)
21669 return GET_MODE (count_exp);
21670 if (!CONST_INT_P (count_exp))
21671 return Pmode;
21672 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21673 return DImode;
21674 return SImode;
21675 }
21676
21677 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21678 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
21679 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21680 the equivalent loop to set memory to VALUE (assumed to be in MODE).
21681
21682 The size is rounded down to a whole number of chunks moved at once.
21683 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21684
21685
21686 static void
21687 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21688 rtx destptr, rtx srcptr, rtx value,
21689 rtx count, enum machine_mode mode, int unroll,
21690 int expected_size)
21691 {
21692 rtx out_label, top_label, iter, tmp;
21693 enum machine_mode iter_mode = counter_mode (count);
21694 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21695 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21696 rtx size;
21697 rtx x_addr;
21698 rtx y_addr;
21699 int i;
21700
21701 top_label = gen_label_rtx ();
21702 out_label = gen_label_rtx ();
21703 iter = gen_reg_rtx (iter_mode);
21704
21705 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21706 NULL, 1, OPTAB_DIRECT);
21707 /* Those two should combine. */
21708 if (piece_size == const1_rtx)
21709 {
21710 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21711 true, out_label);
21712 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21713 }
21714 emit_move_insn (iter, const0_rtx);
21715
21716 emit_label (top_label);
21717
21718 tmp = convert_modes (Pmode, iter_mode, iter, true);
21719 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21720 destmem = change_address (destmem, mode, x_addr);
21721
21722 if (srcmem)
21723 {
21724 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21725 srcmem = change_address (srcmem, mode, y_addr);
21726
21727 /* When unrolling for chips that reorder memory reads and writes,
21728 we can save registers by using a single temporary.
21729 Also, using 4 temporaries is overkill in 32bit mode. */
21730 if (!TARGET_64BIT && 0)
21731 {
21732 for (i = 0; i < unroll; i++)
21733 {
21734 if (i)
21735 {
21736 destmem =
21737 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21738 srcmem =
21739 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21740 }
21741 emit_move_insn (destmem, srcmem);
21742 }
21743 }
21744 else
21745 {
21746 rtx tmpreg[4];
21747 gcc_assert (unroll <= 4);
21748 for (i = 0; i < unroll; i++)
21749 {
21750 tmpreg[i] = gen_reg_rtx (mode);
21751 if (i)
21752 {
21753 srcmem =
21754 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21755 }
21756 emit_move_insn (tmpreg[i], srcmem);
21757 }
21758 for (i = 0; i < unroll; i++)
21759 {
21760 if (i)
21761 {
21762 destmem =
21763 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21764 }
21765 emit_move_insn (destmem, tmpreg[i]);
21766 }
21767 }
21768 }
21769 else
21770 for (i = 0; i < unroll; i++)
21771 {
21772 if (i)
21773 destmem =
21774 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21775 emit_move_insn (destmem, value);
21776 }
21777
21778 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21779 true, OPTAB_LIB_WIDEN);
21780 if (tmp != iter)
21781 emit_move_insn (iter, tmp);
21782
21783 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21784 true, top_label);
21785 if (expected_size != -1)
21786 {
21787 expected_size /= GET_MODE_SIZE (mode) * unroll;
21788 if (expected_size == 0)
21789 predict_jump (0);
21790 else if (expected_size > REG_BR_PROB_BASE)
21791 predict_jump (REG_BR_PROB_BASE - 1);
21792 else
21793 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21794 }
21795 else
21796 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21797 iter = ix86_zero_extend_to_Pmode (iter);
21798 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21799 true, OPTAB_LIB_WIDEN);
21800 if (tmp != destptr)
21801 emit_move_insn (destptr, tmp);
21802 if (srcptr)
21803 {
21804 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21805 true, OPTAB_LIB_WIDEN);
21806 if (tmp != srcptr)
21807 emit_move_insn (srcptr, tmp);
21808 }
21809 emit_label (out_label);
21810 }
21811
21812 /* Output a "rep; mov" instruction.
21813 Arguments have the same meaning as for the previous function. */
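/* Editorial worked example (not from the original source): copying a runtime
   count of 64 bytes with MODE == SImode scales the count to 64 >> 2 == 16 and
   emits what amounts to "rep movsd"; DESTEXP and SRCEXP describe the final
   pointer values destptr + (countreg << 2) and srcptr + (countreg << 2) for
   the rep_mov pattern. */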
21814 static void
21815 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21816 rtx destptr, rtx srcptr,
21817 rtx count,
21818 enum machine_mode mode)
21819 {
21820 rtx destexp;
21821 rtx srcexp;
21822 rtx countreg;
21823 HOST_WIDE_INT rounded_count;
21824
21825 /* If the size is known, it is shorter to use rep movs. */
21826 if (mode == QImode && CONST_INT_P (count)
21827 && !(INTVAL (count) & 3))
21828 mode = SImode;
21829
21830 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21831 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21832 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21833 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21834 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21835 if (mode != QImode)
21836 {
21837 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21838 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21839 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21840 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21841 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21842 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21843 }
21844 else
21845 {
21846 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21847 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21848 }
21849 if (CONST_INT_P (count))
21850 {
21851 rounded_count = (INTVAL (count)
21852 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21853 destmem = shallow_copy_rtx (destmem);
21854 srcmem = shallow_copy_rtx (srcmem);
21855 set_mem_size (destmem, rounded_count);
21856 set_mem_size (srcmem, rounded_count);
21857 }
21858 else
21859 {
21860 if (MEM_SIZE_KNOWN_P (destmem))
21861 clear_mem_size (destmem);
21862 if (MEM_SIZE_KNOWN_P (srcmem))
21863 clear_mem_size (srcmem);
21864 }
21865 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21866 destexp, srcexp));
21867 }
21868
21869 /* Output a "rep; stos" instruction.
21870 Arguments have the same meaning as for the previous function. */
21871 static void
21872 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21873 rtx count, enum machine_mode mode,
21874 rtx orig_value)
21875 {
21876 rtx destexp;
21877 rtx countreg;
21878 HOST_WIDE_INT rounded_count;
21879
21880 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21881 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21882 value = force_reg (mode, gen_lowpart (mode, value));
21883 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21884 if (mode != QImode)
21885 {
21886 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21887 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21888 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21889 }
21890 else
21891 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21892 if (orig_value == const0_rtx && CONST_INT_P (count))
21893 {
21894 rounded_count = (INTVAL (count)
21895 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21896 destmem = shallow_copy_rtx (destmem);
21897 set_mem_size (destmem, rounded_count);
21898 }
21899 else if (MEM_SIZE_KNOWN_P (destmem))
21900 clear_mem_size (destmem);
21901 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21902 }
21903
21904 static void
21905 emit_strmov (rtx destmem, rtx srcmem,
21906 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21907 {
21908 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21909 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21910 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21911 }
21912
21913 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
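/* Editorial example of the constant-count branch below (not in the original
   source): countval == 13 (binary 1101) with max_size == 16 emits one 8-byte
   move (DImode on 64-bit, two SImode moves otherwise), one 4-byte move and
   one 1-byte move, i.e. 8 + 4 + 1 == 13 bytes, each set bit of the count
   selecting one move. */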
21914 static void
21915 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21916 rtx destptr, rtx srcptr, rtx count, int max_size)
21917 {
21918 rtx src, dest;
21919 if (CONST_INT_P (count))
21920 {
21921 HOST_WIDE_INT countval = INTVAL (count);
21922 int offset = 0;
21923
21924 if ((countval & 0x10) && max_size > 16)
21925 {
21926 if (TARGET_64BIT)
21927 {
21928 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21929 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21930 }
21931 else
21932 gcc_unreachable ();
21933 offset += 16;
21934 }
21935 if ((countval & 0x08) && max_size > 8)
21936 {
21937 if (TARGET_64BIT)
21938 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21939 else
21940 {
21941 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21942 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21943 }
21944 offset += 8;
21945 }
21946 if ((countval & 0x04) && max_size > 4)
21947 {
21948 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21949 offset += 4;
21950 }
21951 if ((countval & 0x02) && max_size > 2)
21952 {
21953 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21954 offset += 2;
21955 }
21956 if ((countval & 0x01) && max_size > 1)
21957 {
21958 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21959 offset += 1;
21960 }
21961 return;
21962 }
21963 if (max_size > 8)
21964 {
21965 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21966 count, 1, OPTAB_DIRECT);
21967 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21968 count, QImode, 1, 4);
21969 return;
21970 }
21971
21972 /* When single stringop instructions are available, we can cheaply advance
21973 the dest and src pointers. Otherwise we save code size by maintaining an
21974 offset (zero is readily available from the preceding rep operation) and
21975 using x86 addressing modes. */
21976 if (TARGET_SINGLE_STRINGOP)
21977 {
21978 if (max_size > 4)
21979 {
21980 rtx label = ix86_expand_aligntest (count, 4, true);
21981 src = change_address (srcmem, SImode, srcptr);
21982 dest = change_address (destmem, SImode, destptr);
21983 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21984 emit_label (label);
21985 LABEL_NUSES (label) = 1;
21986 }
21987 if (max_size > 2)
21988 {
21989 rtx label = ix86_expand_aligntest (count, 2, true);
21990 src = change_address (srcmem, HImode, srcptr);
21991 dest = change_address (destmem, HImode, destptr);
21992 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21993 emit_label (label);
21994 LABEL_NUSES (label) = 1;
21995 }
21996 if (max_size > 1)
21997 {
21998 rtx label = ix86_expand_aligntest (count, 1, true);
21999 src = change_address (srcmem, QImode, srcptr);
22000 dest = change_address (destmem, QImode, destptr);
22001 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22002 emit_label (label);
22003 LABEL_NUSES (label) = 1;
22004 }
22005 }
22006 else
22007 {
22008 rtx offset = force_reg (Pmode, const0_rtx);
22009 rtx tmp;
22010
22011 if (max_size > 4)
22012 {
22013 rtx label = ix86_expand_aligntest (count, 4, true);
22014 src = change_address (srcmem, SImode, srcptr);
22015 dest = change_address (destmem, SImode, destptr);
22016 emit_move_insn (dest, src);
22017 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22018 true, OPTAB_LIB_WIDEN);
22019 if (tmp != offset)
22020 emit_move_insn (offset, tmp);
22021 emit_label (label);
22022 LABEL_NUSES (label) = 1;
22023 }
22024 if (max_size > 2)
22025 {
22026 rtx label = ix86_expand_aligntest (count, 2, true);
22027 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22028 src = change_address (srcmem, HImode, tmp);
22029 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22030 dest = change_address (destmem, HImode, tmp);
22031 emit_move_insn (dest, src);
22032 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22033 true, OPTAB_LIB_WIDEN);
22034 if (tmp != offset)
22035 emit_move_insn (offset, tmp);
22036 emit_label (label);
22037 LABEL_NUSES (label) = 1;
22038 }
22039 if (max_size > 1)
22040 {
22041 rtx label = ix86_expand_aligntest (count, 1, true);
22042 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22043 src = change_address (srcmem, QImode, tmp);
22044 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22045 dest = change_address (destmem, QImode, tmp);
22046 emit_move_insn (dest, src);
22047 emit_label (label);
22048 LABEL_NUSES (label) = 1;
22049 }
22050 }
22051 }
22052
22053 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22054 static void
22055 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22056 rtx count, int max_size)
22057 {
22058 count =
22059 expand_simple_binop (counter_mode (count), AND, count,
22060 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22061 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22062 gen_lowpart (QImode, value), count, QImode,
22063 1, max_size / 2);
22064 }
22065
22066 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22067 static void
22068 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22069 {
22070 rtx dest;
22071
22072 if (CONST_INT_P (count))
22073 {
22074 HOST_WIDE_INT countval = INTVAL (count);
22075 int offset = 0;
22076
22077 if ((countval & 0x10) && max_size > 16)
22078 {
22079 if (TARGET_64BIT)
22080 {
22081 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22082 emit_insn (gen_strset (destptr, dest, value));
22083 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22084 emit_insn (gen_strset (destptr, dest, value));
22085 }
22086 else
22087 gcc_unreachable ();
22088 offset += 16;
22089 }
22090 if ((countval & 0x08) && max_size > 8)
22091 {
22092 if (TARGET_64BIT)
22093 {
22094 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22095 emit_insn (gen_strset (destptr, dest, value));
22096 }
22097 else
22098 {
22099 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22100 emit_insn (gen_strset (destptr, dest, value));
22101 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22102 emit_insn (gen_strset (destptr, dest, value));
22103 }
22104 offset += 8;
22105 }
22106 if ((countval & 0x04) && max_size > 4)
22107 {
22108 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22109 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22110 offset += 4;
22111 }
22112 if ((countval & 0x02) && max_size > 2)
22113 {
22114 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22115 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22116 offset += 2;
22117 }
22118 if ((countval & 0x01) && max_size > 1)
22119 {
22120 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22121 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22122 offset += 1;
22123 }
22124 return;
22125 }
22126 if (max_size > 32)
22127 {
22128 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22129 return;
22130 }
22131 if (max_size > 16)
22132 {
22133 rtx label = ix86_expand_aligntest (count, 16, true);
22134 if (TARGET_64BIT)
22135 {
22136 dest = change_address (destmem, DImode, destptr);
22137 emit_insn (gen_strset (destptr, dest, value));
22138 emit_insn (gen_strset (destptr, dest, value));
22139 }
22140 else
22141 {
22142 dest = change_address (destmem, SImode, destptr);
22143 emit_insn (gen_strset (destptr, dest, value));
22144 emit_insn (gen_strset (destptr, dest, value));
22145 emit_insn (gen_strset (destptr, dest, value));
22146 emit_insn (gen_strset (destptr, dest, value));
22147 }
22148 emit_label (label);
22149 LABEL_NUSES (label) = 1;
22150 }
22151 if (max_size > 8)
22152 {
22153 rtx label = ix86_expand_aligntest (count, 8, true);
22154 if (TARGET_64BIT)
22155 {
22156 dest = change_address (destmem, DImode, destptr);
22157 emit_insn (gen_strset (destptr, dest, value));
22158 }
22159 else
22160 {
22161 dest = change_address (destmem, SImode, destptr);
22162 emit_insn (gen_strset (destptr, dest, value));
22163 emit_insn (gen_strset (destptr, dest, value));
22164 }
22165 emit_label (label);
22166 LABEL_NUSES (label) = 1;
22167 }
22168 if (max_size > 4)
22169 {
22170 rtx label = ix86_expand_aligntest (count, 4, true);
22171 dest = change_address (destmem, SImode, destptr);
22172 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22173 emit_label (label);
22174 LABEL_NUSES (label) = 1;
22175 }
22176 if (max_size > 2)
22177 {
22178 rtx label = ix86_expand_aligntest (count, 2, true);
22179 dest = change_address (destmem, HImode, destptr);
22180 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22181 emit_label (label);
22182 LABEL_NUSES (label) = 1;
22183 }
22184 if (max_size > 1)
22185 {
22186 rtx label = ix86_expand_aligntest (count, 1, true);
22187 dest = change_address (destmem, QImode, destptr);
22188 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22189 emit_label (label);
22190 LABEL_NUSES (label) = 1;
22191 }
22192 }
22193
22194 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22195 to DESIRED_ALIGNMENT. */
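/* Editorial example (not in the original source): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8, the code below conditionally copies 1 byte if
   DESTPTR is odd, then 2 bytes if it is 2- but not 4-aligned, then 4 bytes
   if it is 4- but not 8-aligned, adjusting COUNT after each step, so the
   destination ends up 8-byte aligned. */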
22196 static void
22197 expand_movmem_prologue (rtx destmem, rtx srcmem,
22198 rtx destptr, rtx srcptr, rtx count,
22199 int align, int desired_alignment)
22200 {
22201 if (align <= 1 && desired_alignment > 1)
22202 {
22203 rtx label = ix86_expand_aligntest (destptr, 1, false);
22204 srcmem = change_address (srcmem, QImode, srcptr);
22205 destmem = change_address (destmem, QImode, destptr);
22206 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22207 ix86_adjust_counter (count, 1);
22208 emit_label (label);
22209 LABEL_NUSES (label) = 1;
22210 }
22211 if (align <= 2 && desired_alignment > 2)
22212 {
22213 rtx label = ix86_expand_aligntest (destptr, 2, false);
22214 srcmem = change_address (srcmem, HImode, srcptr);
22215 destmem = change_address (destmem, HImode, destptr);
22216 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22217 ix86_adjust_counter (count, 2);
22218 emit_label (label);
22219 LABEL_NUSES (label) = 1;
22220 }
22221 if (align <= 4 && desired_alignment > 4)
22222 {
22223 rtx label = ix86_expand_aligntest (destptr, 4, false);
22224 srcmem = change_address (srcmem, SImode, srcptr);
22225 destmem = change_address (destmem, SImode, destptr);
22226 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22227 ix86_adjust_counter (count, 4);
22228 emit_label (label);
22229 LABEL_NUSES (label) = 1;
22230 }
22231 gcc_assert (desired_alignment <= 8);
22232 }
22233
22234 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22235 ALIGN_BYTES is how many bytes need to be copied. */
22236 static rtx
22237 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22238 int desired_align, int align_bytes)
22239 {
22240 rtx src = *srcp;
22241 rtx orig_dst = dst;
22242 rtx orig_src = src;
22243 int off = 0;
22244 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22245 if (src_align_bytes >= 0)
22246 src_align_bytes = desired_align - src_align_bytes;
22247 if (align_bytes & 1)
22248 {
22249 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22250 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22251 off = 1;
22252 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22253 }
22254 if (align_bytes & 2)
22255 {
22256 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22257 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22258 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22259 set_mem_align (dst, 2 * BITS_PER_UNIT);
22260 if (src_align_bytes >= 0
22261 && (src_align_bytes & 1) == (align_bytes & 1)
22262 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22263 set_mem_align (src, 2 * BITS_PER_UNIT);
22264 off = 2;
22265 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22266 }
22267 if (align_bytes & 4)
22268 {
22269 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22270 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22271 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22272 set_mem_align (dst, 4 * BITS_PER_UNIT);
22273 if (src_align_bytes >= 0)
22274 {
22275 unsigned int src_align = 0;
22276 if ((src_align_bytes & 3) == (align_bytes & 3))
22277 src_align = 4;
22278 else if ((src_align_bytes & 1) == (align_bytes & 1))
22279 src_align = 2;
22280 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22281 set_mem_align (src, src_align * BITS_PER_UNIT);
22282 }
22283 off = 4;
22284 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22285 }
22286 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22287 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22288 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22289 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22290 if (src_align_bytes >= 0)
22291 {
22292 unsigned int src_align = 0;
22293 if ((src_align_bytes & 7) == (align_bytes & 7))
22294 src_align = 8;
22295 else if ((src_align_bytes & 3) == (align_bytes & 3))
22296 src_align = 4;
22297 else if ((src_align_bytes & 1) == (align_bytes & 1))
22298 src_align = 2;
22299 if (src_align > (unsigned int) desired_align)
22300 src_align = desired_align;
22301 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22302 set_mem_align (src, src_align * BITS_PER_UNIT);
22303 }
22304 if (MEM_SIZE_KNOWN_P (orig_dst))
22305 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22306 if (MEM_SIZE_KNOWN_P (orig_src))
22307 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22308 *srcp = src;
22309 return dst;
22310 }
22311
22312 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
22313 to DESIRED_ALIGNMENT. */
22314 static void
22315 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22316 int align, int desired_alignment)
22317 {
22318 if (align <= 1 && desired_alignment > 1)
22319 {
22320 rtx label = ix86_expand_aligntest (destptr, 1, false);
22321 destmem = change_address (destmem, QImode, destptr);
22322 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22323 ix86_adjust_counter (count, 1);
22324 emit_label (label);
22325 LABEL_NUSES (label) = 1;
22326 }
22327 if (align <= 2 && desired_alignment > 2)
22328 {
22329 rtx label = ix86_expand_aligntest (destptr, 2, false);
22330 destmem = change_address (destmem, HImode, destptr);
22331 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22332 ix86_adjust_counter (count, 2);
22333 emit_label (label);
22334 LABEL_NUSES (label) = 1;
22335 }
22336 if (align <= 4 && desired_alignment > 4)
22337 {
22338 rtx label = ix86_expand_aligntest (destptr, 4, false);
22339 destmem = change_address (destmem, SImode, destptr);
22340 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22341 ix86_adjust_counter (count, 4);
22342 emit_label (label);
22343 LABEL_NUSES (label) = 1;
22344 }
22345 gcc_assert (desired_alignment <= 8);
22346 }
22347
22348 /* Store enough into DST to align DST, known to be aligned by ALIGN, to
22349 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22350 static rtx
22351 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22352 int desired_align, int align_bytes)
22353 {
22354 int off = 0;
22355 rtx orig_dst = dst;
22356 if (align_bytes & 1)
22357 {
22358 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22359 off = 1;
22360 emit_insn (gen_strset (destreg, dst,
22361 gen_lowpart (QImode, value)));
22362 }
22363 if (align_bytes & 2)
22364 {
22365 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22366 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22367 set_mem_align (dst, 2 * BITS_PER_UNIT);
22368 off = 2;
22369 emit_insn (gen_strset (destreg, dst,
22370 gen_lowpart (HImode, value)));
22371 }
22372 if (align_bytes & 4)
22373 {
22374 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22375 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22376 set_mem_align (dst, 4 * BITS_PER_UNIT);
22377 off = 4;
22378 emit_insn (gen_strset (destreg, dst,
22379 gen_lowpart (SImode, value)));
22380 }
22381 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22382 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22383 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22384 if (MEM_SIZE_KNOWN_P (orig_dst))
22385 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22386 return dst;
22387 }
22388
22389 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22390 static enum stringop_alg
22391 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22392 int *dynamic_check, bool *noalign)
22393 {
22394 const struct stringop_algs * algs;
22395 bool optimize_for_speed;
22396 /* Algorithms using the rep prefix want at least edi and ecx;
22397 additionally, memset wants eax and memcpy wants esi. Don't
22398 consider such algorithms if the user has appropriated those
22399 registers for their own purposes. */
22400 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22401 || (memset
22402 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22403 *noalign = false;
22404
22405 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22406 || (alg != rep_prefix_1_byte \
22407 && alg != rep_prefix_4_byte \
22408 && alg != rep_prefix_8_byte))
22409 const struct processor_costs *cost;
22410
22411 /* Even if the string operation call is cold, we still might spend a lot
22412 of time processing large blocks. */
22413 if (optimize_function_for_size_p (cfun)
22414 || (optimize_insn_for_size_p ()
22415 && expected_size != -1 && expected_size < 256))
22416 optimize_for_speed = false;
22417 else
22418 optimize_for_speed = true;
22419
22420 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22421
22422 *dynamic_check = -1;
22423 if (memset)
22424 algs = &cost->memset[TARGET_64BIT != 0];
22425 else
22426 algs = &cost->memcpy[TARGET_64BIT != 0];
22427 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22428 return ix86_stringop_alg;
22429 /* rep; movq or rep; movl is the smallest variant. */
22430 else if (!optimize_for_speed)
22431 {
22432 if (!count || (count & 3))
22433 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22434 else
22435 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22436 }
22437 /* Very tiny blocks are best handled via the loop; REP is expensive to
22438 set up. */
22439 else if (expected_size != -1 && expected_size < 4)
22440 return loop_1_byte;
22441 else if (expected_size != -1)
22442 {
22443 unsigned int i;
22444 enum stringop_alg alg = libcall;
22445 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22446 {
22447 /* We get here if the algorithms that were not libcall-based
22448 were rep-prefix based and we are unable to use rep prefixes
22449 based on global register usage. Break out of the loop and
22450 use the heuristic below. */
22451 if (algs->size[i].max == 0)
22452 break;
22453 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22454 {
22455 enum stringop_alg candidate = algs->size[i].alg;
22456
22457 if (candidate != libcall && ALG_USABLE_P (candidate))
22458 alg = candidate;
22459 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22460 last non-libcall inline algorithm. */
22461 if (TARGET_INLINE_ALL_STRINGOPS)
22462 {
22463 /* When the current size is best copied by a libcall, but we
22464 are still forced to inline, run the heuristic below that
22465 will pick code for medium-sized blocks. */
22466 if (alg != libcall)
22467 return alg;
22468 break;
22469 }
22470 else if (ALG_USABLE_P (candidate))
22471 {
22472 *noalign = algs->size[i].noalign;
22473 return candidate;
22474 }
22475 }
22476 }
22477 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22478 }
22479 /* When asked to inline the call anyway, try to pick a meaningful choice.
22480 We look for the maximal size of block that is faster to copy by hand,
22481 and take blocks of at most that size, guessing that the average size
22482 will be roughly half of that maximum.
22483
22484 If this turns out to be bad, we might simply specify the preferred
22485 choice in ix86_costs. */
22486 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22487 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22488 {
22489 int max = -1;
22490 enum stringop_alg alg;
22491 int i;
22492 bool any_alg_usable_p = true;
22493
22494 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22495 {
22496 enum stringop_alg candidate = algs->size[i].alg;
22497 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22498
22499 if (candidate != libcall && candidate
22500 && ALG_USABLE_P (candidate))
22501 max = algs->size[i].max;
22502 }
22503 /* If there aren't any usable algorithms, then recursing on
22504 smaller sizes isn't going to find anything. Just return the
22505 simple byte-at-a-time copy loop. */
22506 if (!any_alg_usable_p)
22507 {
22508 /* Pick something reasonable. */
22509 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22510 *dynamic_check = 128;
22511 return loop_1_byte;
22512 }
22513 if (max == -1)
22514 max = 4096;
22515 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22516 gcc_assert (*dynamic_check == -1);
22517 gcc_assert (alg != libcall);
22518 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22519 *dynamic_check = max;
22520 return alg;
22521 }
22522 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22523 #undef ALG_USABLE_P
22524 }
22525
22526 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22527 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22528 static int
22529 decide_alignment (int align,
22530 enum stringop_alg alg,
22531 int expected_size)
22532 {
22533 int desired_align = 0;
22534 switch (alg)
22535 {
22536 case no_stringop:
22537 gcc_unreachable ();
22538 case loop:
22539 case unrolled_loop:
22540 desired_align = GET_MODE_SIZE (Pmode);
22541 break;
22542 case rep_prefix_8_byte:
22543 desired_align = 8;
22544 break;
22545 case rep_prefix_4_byte:
22546 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22547 copying a whole cache line at once. */
22548 if (TARGET_PENTIUMPRO)
22549 desired_align = 8;
22550 else
22551 desired_align = 4;
22552 break;
22553 case rep_prefix_1_byte:
22554 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22555 copying a whole cache line at once. */
22556 if (TARGET_PENTIUMPRO)
22557 desired_align = 8;
22558 else
22559 desired_align = 1;
22560 break;
22561 case loop_1_byte:
22562 desired_align = 1;
22563 break;
22564 case libcall:
22565 return 0;
22566 }
22567
22568 if (optimize_size)
22569 desired_align = 1;
22570 if (desired_align < align)
22571 desired_align = align;
22572 if (expected_size != -1 && expected_size < 4)
22573 desired_align = align;
22574 return desired_align;
22575 }
22576
22577 /* Return the smallest power of 2 greater than VAL. */
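/* Editorial note: for example, VAL == 5 gives 8; VAL == 4 also gives 8, since
   the result is strictly greater than VAL. */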
22578 static int
22579 smallest_pow2_greater_than (int val)
22580 {
22581 int ret = 1;
22582 while (ret <= val)
22583 ret <<= 1;
22584 return ret;
22585 }
22586
22587 /* Expand string move (memcpy) operation. Use i386 string operations
22588 when profitable. expand_setmem contains similar code. The code
22589 depends upon architecture, block size and alignment, but always has
22590 the same overall structure:
22591
22592 1) Prologue guard: a conditional that jumps to the epilogue for small
22593 blocks that can be handled by the epilogue alone. This is faster
22594 but also needed for correctness, since the prologue assumes the block
22595 is larger than the desired alignment.
22596
22597 Optional dynamic check for size and libcall for large
22598 blocks is emitted here too, with -minline-stringops-dynamically.
22599
22600 2) Prologue: copy the first few bytes in order to get the destination
22601 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22602 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22603 copied. We emit either a jump tree for power-of-two sized
22604 blocks, or a byte loop.
22605
22606 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22607 with the specified algorithm.
22608
22609 4) Epilogue: code copying the tail of the block that is too small to be
22610 handled by the main body (or up to the size guarded by the prologue guard). */
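/* Editorial example of one possible shape (not in the original source;
   assuming ALG == rep_prefix_4_byte, ALIGN == 1, DESIRED_ALIGN == 4 and an
   unknown count): step 1 jumps to the epilogue when the count is below 4
   bytes, step 2 copies up to 3 bytes to align the destination, step 3 issues
   "rep movsd" for the remaining dwords, and step 4 copies the count & 3
   trailing bytes. */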
22611
22612 bool
22613 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22614 rtx expected_align_exp, rtx expected_size_exp)
22615 {
22616 rtx destreg;
22617 rtx srcreg;
22618 rtx label = NULL;
22619 rtx tmp;
22620 rtx jump_around_label = NULL;
22621 HOST_WIDE_INT align = 1;
22622 unsigned HOST_WIDE_INT count = 0;
22623 HOST_WIDE_INT expected_size = -1;
22624 int size_needed = 0, epilogue_size_needed;
22625 int desired_align = 0, align_bytes = 0;
22626 enum stringop_alg alg;
22627 int dynamic_check;
22628 bool need_zero_guard = false;
22629 bool noalign;
22630
22631 if (CONST_INT_P (align_exp))
22632 align = INTVAL (align_exp);
22633 /* i386 can do misaligned access at a reasonably increased cost. */
22634 if (CONST_INT_P (expected_align_exp)
22635 && INTVAL (expected_align_exp) > align)
22636 align = INTVAL (expected_align_exp);
22637 /* ALIGN is the minimum of destination and source alignment, but we care here
22638 just about destination alignment. */
22639 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22640 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22641
22642 if (CONST_INT_P (count_exp))
22643 count = expected_size = INTVAL (count_exp);
22644 if (CONST_INT_P (expected_size_exp) && count == 0)
22645 expected_size = INTVAL (expected_size_exp);
22646
22647 /* Make sure we don't need to care about overflow later on. */
22648 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22649 return false;
22650
22651 /* Step 0: Decide on preferred algorithm, desired alignment and
22652 size of chunks to be copied by main loop. */
22653
22654 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22655 desired_align = decide_alignment (align, alg, expected_size);
22656
22657 if (!TARGET_ALIGN_STRINGOPS || noalign)
22658 align = desired_align;
22659
22660 if (alg == libcall)
22661 return false;
22662 gcc_assert (alg != no_stringop);
22663 if (!count)
22664 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22665 destreg = copy_addr_to_reg (XEXP (dst, 0));
22666 srcreg = copy_addr_to_reg (XEXP (src, 0));
22667 switch (alg)
22668 {
22669 case libcall:
22670 case no_stringop:
22671 gcc_unreachable ();
22672 case loop:
22673 need_zero_guard = true;
22674 size_needed = GET_MODE_SIZE (word_mode);
22675 break;
22676 case unrolled_loop:
22677 need_zero_guard = true;
22678 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22679 break;
22680 case rep_prefix_8_byte:
22681 size_needed = 8;
22682 break;
22683 case rep_prefix_4_byte:
22684 size_needed = 4;
22685 break;
22686 case rep_prefix_1_byte:
22687 size_needed = 1;
22688 break;
22689 case loop_1_byte:
22690 need_zero_guard = true;
22691 size_needed = 1;
22692 break;
22693 }
22694
22695 epilogue_size_needed = size_needed;
22696
22697 /* Step 1: Prologue guard. */
22698
22699 /* Alignment code needs count to be in register. */
22700 if (CONST_INT_P (count_exp) && desired_align > align)
22701 {
22702 if (INTVAL (count_exp) > desired_align
22703 && INTVAL (count_exp) > size_needed)
22704 {
22705 align_bytes
22706 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22707 if (align_bytes <= 0)
22708 align_bytes = 0;
22709 else
22710 align_bytes = desired_align - align_bytes;
22711 }
22712 if (align_bytes == 0)
22713 count_exp = force_reg (counter_mode (count_exp), count_exp);
22714 }
22715 gcc_assert (desired_align >= 1 && align >= 1);
22716
22717 /* Ensure that alignment prologue won't copy past end of block. */
22718 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22719 {
22720 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22721 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22722 Make sure it is a power of 2. */
22723 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22724
22725 if (count)
22726 {
22727 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22728 {
22729 /* If main algorithm works on QImode, no epilogue is needed.
22730 For small sizes just don't align anything. */
22731 if (size_needed == 1)
22732 desired_align = align;
22733 else
22734 goto epilogue;
22735 }
22736 }
22737 else
22738 {
22739 label = gen_label_rtx ();
22740 emit_cmp_and_jump_insns (count_exp,
22741 GEN_INT (epilogue_size_needed),
22742 LTU, 0, counter_mode (count_exp), 1, label);
22743 if (expected_size == -1 || expected_size < epilogue_size_needed)
22744 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22745 else
22746 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22747 }
22748 }
22749
22750 /* Emit code to decide at runtime whether a library call or inline code
22751 should be used. */
22752 if (dynamic_check != -1)
22753 {
22754 if (CONST_INT_P (count_exp))
22755 {
22756 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22757 {
22758 emit_block_move_via_libcall (dst, src, count_exp, false);
22759 count_exp = const0_rtx;
22760 goto epilogue;
22761 }
22762 }
22763 else
22764 {
22765 rtx hot_label = gen_label_rtx ();
22766 jump_around_label = gen_label_rtx ();
22767 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22768 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22769 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22770 emit_block_move_via_libcall (dst, src, count_exp, false);
22771 emit_jump (jump_around_label);
22772 emit_label (hot_label);
22773 }
22774 }
22775
22776 /* Step 2: Alignment prologue. */
22777
22778 if (desired_align > align)
22779 {
22780 if (align_bytes == 0)
22781 {
22782 /* Except for the first move in the epilogue, we no longer know
22783 the constant offset in the aliasing info. It doesn't seem worth
22784 the pain to maintain it for the first move, so throw away
22785 the info early. */
22786 src = change_address (src, BLKmode, srcreg);
22787 dst = change_address (dst, BLKmode, destreg);
22788 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22789 desired_align);
22790 }
22791 else
22792 {
22793 /* If we know how many bytes need to be stored before dst is
22794 sufficiently aligned, maintain aliasing info accurately. */
22795 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22796 desired_align, align_bytes);
22797 count_exp = plus_constant (counter_mode (count_exp),
22798 count_exp, -align_bytes);
22799 count -= align_bytes;
22800 }
22801 if (need_zero_guard
22802 && (count < (unsigned HOST_WIDE_INT) size_needed
22803 || (align_bytes == 0
22804 && count < ((unsigned HOST_WIDE_INT) size_needed
22805 + desired_align - align))))
22806 {
22807 /* It is possible that we copied enough so the main loop will not
22808 execute. */
22809 gcc_assert (size_needed > 1);
22810 if (label == NULL_RTX)
22811 label = gen_label_rtx ();
22812 emit_cmp_and_jump_insns (count_exp,
22813 GEN_INT (size_needed),
22814 LTU, 0, counter_mode (count_exp), 1, label);
22815 if (expected_size == -1
22816 || expected_size < (desired_align - align) / 2 + size_needed)
22817 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22818 else
22819 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22820 }
22821 }
22822 if (label && size_needed == 1)
22823 {
22824 emit_label (label);
22825 LABEL_NUSES (label) = 1;
22826 label = NULL;
22827 epilogue_size_needed = 1;
22828 }
22829 else if (label == NULL_RTX)
22830 epilogue_size_needed = size_needed;
22831
22832 /* Step 3: Main loop. */
22833
22834 switch (alg)
22835 {
22836 case libcall:
22837 case no_stringop:
22838 gcc_unreachable ();
22839 case loop_1_byte:
22840 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22841 count_exp, QImode, 1, expected_size);
22842 break;
22843 case loop:
22844 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22845 count_exp, word_mode, 1, expected_size);
22846 break;
22847 case unrolled_loop:
22848 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22849 registers for 4 temporaries anyway. */
22850 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22851 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22852 expected_size);
22853 break;
22854 case rep_prefix_8_byte:
22855 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22856 DImode);
22857 break;
22858 case rep_prefix_4_byte:
22859 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22860 SImode);
22861 break;
22862 case rep_prefix_1_byte:
22863 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22864 QImode);
22865 break;
22866 }
22867 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22868 if (CONST_INT_P (count_exp))
22869 {
22870 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22871 (count / size_needed) * size_needed);
22872 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22873 (count / size_needed) * size_needed);
22874 }
22875 else
22876 {
22877 src = change_address (src, BLKmode, srcreg);
22878 dst = change_address (dst, BLKmode, destreg);
22879 }
22880
22881 /* Step 4: Epilogue to copy the remaining bytes. */
22882 epilogue:
22883 if (label)
22884 {
22885 /* When the main loop is done, COUNT_EXP might hold the original count,
22886 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22887 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22888 bytes. Compensate if needed. */
22889
22890 if (size_needed < epilogue_size_needed)
22891 {
22892 tmp =
22893 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22894 GEN_INT (size_needed - 1), count_exp, 1,
22895 OPTAB_DIRECT);
22896 if (tmp != count_exp)
22897 emit_move_insn (count_exp, tmp);
22898 }
22899 emit_label (label);
22900 LABEL_NUSES (label) = 1;
22901 }
22902
22903 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22904 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22905 epilogue_size_needed);
22906 if (jump_around_label)
22907 emit_label (jump_around_label);
22908 return true;
22909 }
22910
22911 /* Helper function for memset. For a QImode value 0xXY produce
22912 0xXYXYXYXY of the width specified by MODE. This is essentially
22913 a multiplication by 0x01010101, but we can do slightly better than
22914 synth_mult by unwinding the sequence by hand on CPUs with a
22915 slow multiply. */
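/* Editorial example for the constant case handled first below: VAL == 0xAB
   gives v = 0xAB, then v |= v << 8 -> 0xABAB, v |= v << 16 -> 0xABABABAB,
   and for DImode additionally v |= (v << 16) << 16 -> 0xABABABABABABABAB. */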
22916 static rtx
22917 promote_duplicated_reg (enum machine_mode mode, rtx val)
22918 {
22919 enum machine_mode valmode = GET_MODE (val);
22920 rtx tmp;
22921 int nops = mode == DImode ? 3 : 2;
22922
22923 gcc_assert (mode == SImode || mode == DImode);
22924 if (val == const0_rtx)
22925 return copy_to_mode_reg (mode, const0_rtx);
22926 if (CONST_INT_P (val))
22927 {
22928 HOST_WIDE_INT v = INTVAL (val) & 255;
22929
22930 v |= v << 8;
22931 v |= v << 16;
22932 if (mode == DImode)
22933 v |= (v << 16) << 16;
22934 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22935 }
22936
22937 if (valmode == VOIDmode)
22938 valmode = QImode;
22939 if (valmode != QImode)
22940 val = gen_lowpart (QImode, val);
22941 if (mode == QImode)
22942 return val;
22943 if (!TARGET_PARTIAL_REG_STALL)
22944 nops--;
22945 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22946 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22947 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22948 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22949 {
22950 rtx reg = convert_modes (mode, QImode, val, true);
22951 tmp = promote_duplicated_reg (mode, const1_rtx);
22952 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22953 OPTAB_DIRECT);
22954 }
22955 else
22956 {
22957 rtx reg = convert_modes (mode, QImode, val, true);
22958
22959 if (!TARGET_PARTIAL_REG_STALL)
22960 if (mode == SImode)
22961 emit_insn (gen_movsi_insv_1 (reg, reg));
22962 else
22963 emit_insn (gen_movdi_insv_1 (reg, reg));
22964 else
22965 {
22966 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22967 NULL, 1, OPTAB_DIRECT);
22968 reg =
22969 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22970 }
22971 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22972 NULL, 1, OPTAB_DIRECT);
22973 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22974 if (mode == SImode)
22975 return reg;
22976 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22977 NULL, 1, OPTAB_DIRECT);
22978 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22979 return reg;
22980 }
22981 }
22982
22983 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22984 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22985 getting alignment from ALIGN to DESIRED_ALIGN. */
22986 static rtx
22987 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22988 {
22989 rtx promoted_val;
22990
22991 if (TARGET_64BIT
22992 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22993 promoted_val = promote_duplicated_reg (DImode, val);
22994 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22995 promoted_val = promote_duplicated_reg (SImode, val);
22996 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22997 promoted_val = promote_duplicated_reg (HImode, val);
22998 else
22999 promoted_val = val;
23000
23001 return promoted_val;
23002 }
23003
23004 /* Expand string clear operation (bzero). Use i386 string operations when
23005 profitable. See expand_movmem comment for explanation of individual
23006 steps performed. */
23007 bool
23008 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23009 rtx expected_align_exp, rtx expected_size_exp)
23010 {
23011 rtx destreg;
23012 rtx label = NULL;
23013 rtx tmp;
23014 rtx jump_around_label = NULL;
23015 HOST_WIDE_INT align = 1;
23016 unsigned HOST_WIDE_INT count = 0;
23017 HOST_WIDE_INT expected_size = -1;
23018 int size_needed = 0, epilogue_size_needed;
23019 int desired_align = 0, align_bytes = 0;
23020 enum stringop_alg alg;
23021 rtx promoted_val = NULL;
23022 bool force_loopy_epilogue = false;
23023 int dynamic_check;
23024 bool need_zero_guard = false;
23025 bool noalign;
23026
23027 if (CONST_INT_P (align_exp))
23028 align = INTVAL (align_exp);
23029 /* i386 can do misaligned access at a reasonably increased cost. */
23030 if (CONST_INT_P (expected_align_exp)
23031 && INTVAL (expected_align_exp) > align)
23032 align = INTVAL (expected_align_exp);
23033 if (CONST_INT_P (count_exp))
23034 count = expected_size = INTVAL (count_exp);
23035 if (CONST_INT_P (expected_size_exp) && count == 0)
23036 expected_size = INTVAL (expected_size_exp);
23037
23038 /* Make sure we don't need to care about overflow later on. */
23039 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23040 return false;
23041
23042 /* Step 0: Decide on preferred algorithm, desired alignment and
23043 size of chunks to be copied by main loop. */
23044
23045 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23046 desired_align = decide_alignment (align, alg, expected_size);
23047
23048 if (!TARGET_ALIGN_STRINGOPS || noalign)
23049 align = desired_align;
23050
23051 if (alg == libcall)
23052 return false;
23053 gcc_assert (alg != no_stringop);
23054 if (!count)
23055 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23056 destreg = copy_addr_to_reg (XEXP (dst, 0));
23057 switch (alg)
23058 {
23059 case libcall:
23060 case no_stringop:
23061 gcc_unreachable ();
23062 case loop:
23063 need_zero_guard = true;
23064 size_needed = GET_MODE_SIZE (word_mode);
23065 break;
23066 case unrolled_loop:
23067 need_zero_guard = true;
23068 size_needed = GET_MODE_SIZE (word_mode) * 4;
23069 break;
23070 case rep_prefix_8_byte:
23071 size_needed = 8;
23072 break;
23073 case rep_prefix_4_byte:
23074 size_needed = 4;
23075 break;
23076 case rep_prefix_1_byte:
23077 size_needed = 1;
23078 break;
23079 case loop_1_byte:
23080 need_zero_guard = true;
23081 size_needed = 1;
23082 break;
23083 }
23084 epilogue_size_needed = size_needed;
23085
23086 /* Step 1: Prologue guard. */
23087
23088 /* Alignment code needs count to be in register. */
23089 if (CONST_INT_P (count_exp) && desired_align > align)
23090 {
23091 if (INTVAL (count_exp) > desired_align
23092 && INTVAL (count_exp) > size_needed)
23093 {
23094 align_bytes
23095 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23096 if (align_bytes <= 0)
23097 align_bytes = 0;
23098 else
23099 align_bytes = desired_align - align_bytes;
23100 }
23101 if (align_bytes == 0)
23102 {
23103 enum machine_mode mode = SImode;
23104 if (TARGET_64BIT && (count & ~0xffffffff))
23105 mode = DImode;
23106 count_exp = force_reg (mode, count_exp);
23107 }
23108 }
23109 /* Do the cheap promotion to allow better CSE across the
23110 main loop and epilogue (i.e. one load of the big constant in
23111 front of all the code). */
23112 if (CONST_INT_P (val_exp))
23113 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23114 desired_align, align);
23115 /* Ensure that alignment prologue won't copy past end of block. */
23116 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23117 {
23118 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23119 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23120 Make sure it is power of 2. */
23121 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23122
23123 /* To improve performance of small blocks, we jump around the VAL
23124 promoting code. This means that if the promoted VAL is not constant,
23125 we might not use it in the epilogue and have to use the byte
23126 loop variant. */
23127 if (epilogue_size_needed > 2 && !promoted_val)
23128 force_loopy_epilogue = true;
23129 if (count)
23130 {
23131 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23132 {
23133 /* If main algorithm works on QImode, no epilogue is needed.
23134 For small sizes just don't align anything. */
23135 if (size_needed == 1)
23136 desired_align = align;
23137 else
23138 goto epilogue;
23139 }
23140 }
23141 else
23142 {
23143 label = gen_label_rtx ();
23144 emit_cmp_and_jump_insns (count_exp,
23145 GEN_INT (epilogue_size_needed),
23146 LTU, 0, counter_mode (count_exp), 1, label);
23147 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23148 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23149 else
23150 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23151 }
23152 }
23153 if (dynamic_check != -1)
23154 {
23155 rtx hot_label = gen_label_rtx ();
23156 jump_around_label = gen_label_rtx ();
23157 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23158 LEU, 0, counter_mode (count_exp), 1, hot_label);
23159 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23160 set_storage_via_libcall (dst, count_exp, val_exp, false);
23161 emit_jump (jump_around_label);
23162 emit_label (hot_label);
23163 }
23164
23165 /* Step 2: Alignment prologue. */
23166
23167 /* Do the expensive promotion once we have branched off the small blocks. */
23168 if (!promoted_val)
23169 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23170 desired_align, align);
23171 gcc_assert (desired_align >= 1 && align >= 1);
23172
23173 if (desired_align > align)
23174 {
23175 if (align_bytes == 0)
23176 {
23177 /* Except for the first move in the epilogue, we no longer know
23178 the constant offset in the aliasing info. It doesn't seem worth
23179 the pain to maintain it for the first move, so throw away
23180 the info early. */
23181 dst = change_address (dst, BLKmode, destreg);
23182 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23183 desired_align);
23184 }
23185 else
23186 {
23187 /* If we know how many bytes need to be stored before dst is
23188 sufficiently aligned, maintain aliasing info accurately. */
23189 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23190 desired_align, align_bytes);
23191 count_exp = plus_constant (counter_mode (count_exp),
23192 count_exp, -align_bytes);
23193 count -= align_bytes;
23194 }
23195 if (need_zero_guard
23196 && (count < (unsigned HOST_WIDE_INT) size_needed
23197 || (align_bytes == 0
23198 && count < ((unsigned HOST_WIDE_INT) size_needed
23199 + desired_align - align))))
23200 {
23201 /* It is possible that we copied enough so the main loop will not
23202 execute. */
23203 gcc_assert (size_needed > 1);
23204 if (label == NULL_RTX)
23205 label = gen_label_rtx ();
23206 emit_cmp_and_jump_insns (count_exp,
23207 GEN_INT (size_needed),
23208 LTU, 0, counter_mode (count_exp), 1, label);
23209 if (expected_size == -1
23210 || expected_size < (desired_align - align) / 2 + size_needed)
23211 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23212 else
23213 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23214 }
23215 }
23216 if (label && size_needed == 1)
23217 {
23218 emit_label (label);
23219 LABEL_NUSES (label) = 1;
23220 label = NULL;
23221 promoted_val = val_exp;
23222 epilogue_size_needed = 1;
23223 }
23224 else if (label == NULL_RTX)
23225 epilogue_size_needed = size_needed;
23226
23227 /* Step 3: Main loop. */
23228
23229 switch (alg)
23230 {
23231 case libcall:
23232 case no_stringop:
23233 gcc_unreachable ();
23234 case loop_1_byte:
23235 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23236 count_exp, QImode, 1, expected_size);
23237 break;
23238 case loop:
23239 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23240 count_exp, word_mode, 1, expected_size);
23241 break;
23242 case unrolled_loop:
23243 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23244 count_exp, word_mode, 4, expected_size);
23245 break;
23246 case rep_prefix_8_byte:
23247 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23248 DImode, val_exp);
23249 break;
23250 case rep_prefix_4_byte:
23251 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23252 SImode, val_exp);
23253 break;
23254 case rep_prefix_1_byte:
23255 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23256 QImode, val_exp);
23257 break;
23258 }
23259 /* Properly adjust the offset of the dest memory for aliasing. */
23260 if (CONST_INT_P (count_exp))
23261 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23262 (count / size_needed) * size_needed);
23263 else
23264 dst = change_address (dst, BLKmode, destreg);
23265
23266 /* Step 4: Epilogue to copy the remaining bytes. */
23267
23268 if (label)
23269 {
23270 /* When the main loop is done, COUNT_EXP might hold the original count,
23271 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23272 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23273 bytes. Compensate if needed. */
23274
23275 if (size_needed < epilogue_size_needed)
23276 {
23277 tmp =
23278 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23279 GEN_INT (size_needed - 1), count_exp, 1,
23280 OPTAB_DIRECT);
23281 if (tmp != count_exp)
23282 emit_move_insn (count_exp, tmp);
23283 }
23284 emit_label (label);
23285 LABEL_NUSES (label) = 1;
23286 }
23287 epilogue:
23288 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23289 {
23290 if (force_loopy_epilogue)
23291 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23292 epilogue_size_needed);
23293 else
23294 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23295 epilogue_size_needed);
23296 }
23297 if (jump_around_label)
23298 emit_label (jump_around_label);
23299 return true;
23300 }
23301
23302 /* Expand the appropriate insns for doing strlen if not just doing
23303 repnz; scasb
23304
23305 out = result, initialized with the start address
23306 align_rtx = alignment of the address.
23307 scratch = scratch register, initialized with the start address when
23308 not aligned, otherwise undefined
23309
23310 This is just the body. It needs the initializations mentioned above and
23311 some address computing at the end. These things are done in i386.md. */
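/* Editorial outline of the code emitted below (not in the original source):
   first check up to 3 leading bytes one at a time until OUT is 4-byte
   aligned, then loop loading 4 bytes at a time and testing them for a zero
   byte with the bit trick described further down, and finally locate the
   exact zero byte within the last word (with cmov when available, otherwise
   with a branch). */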
23312
23313 static void
23314 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23315 {
23316 int align;
23317 rtx tmp;
23318 rtx align_2_label = NULL_RTX;
23319 rtx align_3_label = NULL_RTX;
23320 rtx align_4_label = gen_label_rtx ();
23321 rtx end_0_label = gen_label_rtx ();
23322 rtx mem;
23323 rtx tmpreg = gen_reg_rtx (SImode);
23324 rtx scratch = gen_reg_rtx (SImode);
23325 rtx cmp;
23326
23327 align = 0;
23328 if (CONST_INT_P (align_rtx))
23329 align = INTVAL (align_rtx);
23330
23331 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23332
23333 /* Is there a known alignment and is it less than 4? */
23334 if (align < 4)
23335 {
23336 rtx scratch1 = gen_reg_rtx (Pmode);
23337 emit_move_insn (scratch1, out);
23338 /* Is there a known alignment and is it not 2? */
23339 if (align != 2)
23340 {
23341 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23342 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23343
23344 /* Leave just the 3 lower bits. */
23345 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23346 NULL_RTX, 0, OPTAB_WIDEN);
23347
23348 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23349 Pmode, 1, align_4_label);
23350 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23351 Pmode, 1, align_2_label);
23352 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23353 Pmode, 1, align_3_label);
23354 }
23355 else
23356 {
23357 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23358 check whether it is aligned to 4 bytes. */
23359
23360 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23361 NULL_RTX, 0, OPTAB_WIDEN);
23362
23363 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23364 Pmode, 1, align_4_label);
23365 }
23366
23367 mem = change_address (src, QImode, out);
23368
23369 /* Now compare the bytes. */
23370
23371 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23372 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23373 QImode, 1, end_0_label);
23374
23375 /* Increment the address. */
23376 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23377
23378 /* Not needed with an alignment of 2 */
23379 if (align != 2)
23380 {
23381 emit_label (align_2_label);
23382
23383 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23384 end_0_label);
23385
23386 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23387
23388 emit_label (align_3_label);
23389 }
23390
23391 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23392 end_0_label);
23393
23394 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23395 }
23396
23397 /* Generate a loop to check 4 bytes at a time. It is not a good idea
23398 to align this loop; it only enlarges the program and does not
23399 speed it up. */
23400 emit_label (align_4_label);
23401
23402 mem = change_address (src, SImode, out);
23403 emit_move_insn (scratch, mem);
23404 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23405
23406 /* This formula yields a nonzero result iff one of the bytes is zero.
23407 This saves three branches inside the loop and many cycles. */
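/* Editorial worked example of the trick (w - 0x01010101) & ~w & 0x80808080
   computed by the insns below: for w == 0x12340078 (one byte is zero) the
   result is 0x00008000 != 0, while for w == 0x61626364 ("abcd", no zero
   byte) it is 0. */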
23408
23409 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23410 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23411 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23412 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23413 gen_int_mode (0x80808080, SImode)));
23414 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23415 align_4_label);
23416
23417 if (TARGET_CMOVE)
23418 {
23419 rtx reg = gen_reg_rtx (SImode);
23420 rtx reg2 = gen_reg_rtx (Pmode);
23421 emit_move_insn (reg, tmpreg);
23422 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23423
23424 /* If zero is not in the first two bytes, move two bytes forward. */
23425 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23426 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23427 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23428 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23429 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23430 reg,
23431 tmpreg)));
23432 /* Emit lea manually to avoid clobbering of flags. */
23433 emit_insn (gen_rtx_SET (SImode, reg2,
23434 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23435
23436 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23437 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23438 emit_insn (gen_rtx_SET (VOIDmode, out,
23439 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23440 reg2,
23441 out)));
23442 }
23443 else
23444 {
23445 rtx end_2_label = gen_label_rtx ();
23446 /* Is zero in the first two bytes? */
23447
23448 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23449 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23450 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23451 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23452 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23453 pc_rtx);
23454 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23455 JUMP_LABEL (tmp) = end_2_label;
23456
23457 /* Not in the first two. Move two bytes forward. */
23458 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23459 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23460
23461 emit_label (end_2_label);
23462
23463 }
23464
23465 /* Avoid branch in fixing the byte. */
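/* At this point OUT is 4 (or, if the zero was not in the low half, 6)
bytes past the start of the word containing the zero, and bit 7 of the
low byte of TMPREG is set exactly when the zero byte sits at the even
offset of the remaining two-byte group. Doubling that byte with ADD
copies the bit into the carry flag, and the subtract-with-borrow below
computes OUT = OUT - 3 - CF, the exact address of the terminating
zero byte. */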
23466 tmpreg = gen_lowpart (QImode, tmpreg);
23467 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23468 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23469 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23470 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23471
23472 emit_label (end_0_label);
23473 }
23474
23475 /* Expand strlen. */
23476
23477 bool
23478 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23479 {
23480 rtx addr, scratch1, scratch2, scratch3, scratch4;
23481
23482 /* The generic case of the strlen expander is long. Avoid expanding it
23483 unless TARGET_INLINE_ALL_STRINGOPS. */
23484
23485 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23486 && !TARGET_INLINE_ALL_STRINGOPS
23487 && !optimize_insn_for_size_p ()
23488 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23489 return false;
23490
23491 addr = force_reg (Pmode, XEXP (src, 0));
23492 scratch1 = gen_reg_rtx (Pmode);
23493
23494 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23495 && !optimize_insn_for_size_p ())
23496 {
23497 /* It seems that some optimizer does not combine a call like
23498 foo(strlen(bar), strlen(bar));
23499 when the move and the subtraction are done here; it would compute
23500 the length just once if these instructions were emitted inside
23501 output_strlen_unroll(). But since &bar[strlen(bar)] is often used,
23502 and doing it here uses one fewer register for the lifetime of
23503 output_strlen_unroll(), this is the better choice. */
23504
23505 emit_move_insn (out, addr);
23506
23507 ix86_expand_strlensi_unroll_1 (out, src, align);
23508
23509 /* strlensi_unroll_1 returns the address of the zero at the end of
23510 the string, like memchr(), so compute the length by subtracting
23511 the start address. */
23512 emit_insn (ix86_gen_sub3 (out, out, addr));
23513 }
23514 else
23515 {
23516 rtx unspec;
23517
23518 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23519 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23520 return false;
23521
23522 scratch2 = gen_reg_rtx (Pmode);
23523 scratch3 = gen_reg_rtx (Pmode);
23524 scratch4 = force_reg (Pmode, constm1_rtx);
23525
23526 emit_move_insn (scratch3, addr);
23527 eoschar = force_reg (QImode, eoschar);
23528
23529 src = replace_equiv_address_nv (src, scratch3);
23530
23531 /* If .md starts supporting :P, this can be done in .md. */
23532 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23533 scratch4), UNSPEC_SCAS);
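/* The repnz scasb sequence below leaves in SCRATCH1 the count register,
which started at -1 and was decremented once per byte scanned, including
the terminating zero, i.e. -(strlen + 2). The one's complement is then
strlen + 1, and adding -1 yields the length itself. */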
23534 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23535 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23536 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23537 }
23538 return true;
23539 }
23540
23541 /* For a given symbol (function), construct code to compute the address of its
23542 PLT entry in the large x86-64 PIC model. */
23543 static rtx
23544 construct_plt_address (rtx symbol)
23545 {
23546 rtx tmp, unspec;
23547
23548 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23549 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23550 gcc_assert (Pmode == DImode);
23551
23552 tmp = gen_reg_rtx (Pmode);
23553 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23554
23555 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23556 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23557 return tmp;
23558 }
23559
23560 rtx
23561 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23562 rtx callarg2,
23563 rtx pop, bool sibcall)
23564 {
23565 /* We need to represent that SI and DI registers are clobbered
23566 by SYSV calls. */
23567 static int clobbered_registers[] = {
23568 XMM6_REG, XMM7_REG, XMM8_REG,
23569 XMM9_REG, XMM10_REG, XMM11_REG,
23570 XMM12_REG, XMM13_REG, XMM14_REG,
23571 XMM15_REG, SI_REG, DI_REG
23572 };
23573 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23574 rtx use = NULL, call;
23575 unsigned int vec_len;
23576
23577 if (pop == const0_rtx)
23578 pop = NULL;
23579 gcc_assert (!TARGET_64BIT || !pop);
23580
23581 if (TARGET_MACHO && !TARGET_64BIT)
23582 {
23583 #if TARGET_MACHO
23584 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23585 fnaddr = machopic_indirect_call_target (fnaddr);
23586 #endif
23587 }
23588 else
23589 {
23590 /* Static functions and indirect calls don't need the pic register. */
23591 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23592 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23593 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23594 use_reg (&use, pic_offset_table_rtx);
23595 }
23596
23597 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23598 {
23599 rtx al = gen_rtx_REG (QImode, AX_REG);
23600 emit_move_insn (al, callarg2);
23601 use_reg (&use, al);
23602 }
23603
23604 if (ix86_cmodel == CM_LARGE_PIC
23605 && MEM_P (fnaddr)
23606 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23607 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23608 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23609 else if (sibcall
23610 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23611 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23612 {
23613 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23614 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23615 }
23616
23617 vec_len = 0;
23618 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23619 if (retval)
23620 call = gen_rtx_SET (VOIDmode, retval, call);
23621 vec[vec_len++] = call;
23622
23623 if (pop)
23624 {
23625 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23626 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23627 vec[vec_len++] = pop;
23628 }
23629
23630 if (TARGET_64BIT_MS_ABI
23631 && (!callarg2 || INTVAL (callarg2) != -2))
23632 {
23633 unsigned i;
23634
23635 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23636 UNSPEC_MS_TO_SYSV_CALL);
23637
23638 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23639 vec[vec_len++]
23640 = gen_rtx_CLOBBER (VOIDmode,
23641 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23642 ? TImode : DImode,
23643 clobbered_registers[i]));
23644 }
23645
23646 if (vec_len > 1)
23647 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23648 call = emit_call_insn (call);
23649 if (use)
23650 CALL_INSN_FUNCTION_USAGE (call) = use;
23651
23652 return call;
23653 }
23654
23655 /* Output the assembly for a call instruction. */
23656
23657 const char *
23658 ix86_output_call_insn (rtx insn, rtx call_op)
23659 {
23660 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23661 bool seh_nop_p = false;
23662 const char *xasm;
23663
23664 if (SIBLING_CALL_P (insn))
23665 {
23666 if (direct_p)
23667 xasm = "jmp\t%P0";
23668 /* SEH epilogue detection requires the indirect branch case
23669 to include REX.W. */
23670 else if (TARGET_SEH)
23671 xasm = "rex.W jmp %A0";
23672 else
23673 xasm = "jmp\t%A0";
23674
23675 output_asm_insn (xasm, &call_op);
23676 return "";
23677 }
23678
23679 /* SEH unwinding can require an extra nop to be emitted in several
23680 circumstances. Determine if we have one of those. */
23681 if (TARGET_SEH)
23682 {
23683 rtx i;
23684
23685 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23686 {
23687 /* If we get to another real insn, we don't need the nop. */
23688 if (INSN_P (i))
23689 break;
23690
23691 /* If we get to the epilogue note, prevent a catch region from
23692 being adjacent to the standard epilogue sequence. If non-
23693 call-exceptions, we'll have done this during epilogue emission. */
23694 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23695 && !flag_non_call_exceptions
23696 && !can_throw_internal (insn))
23697 {
23698 seh_nop_p = true;
23699 break;
23700 }
23701 }
23702
23703 /* If we didn't find a real insn following the call, prevent the
23704 unwinder from looking into the next function. */
23705 if (i == NULL)
23706 seh_nop_p = true;
23707 }
23708
23709 if (direct_p)
23710 xasm = "call\t%P0";
23711 else
23712 xasm = "call\t%A0";
23713
23714 output_asm_insn (xasm, &call_op);
23715
23716 if (seh_nop_p)
23717 return "nop";
23718
23719 return "";
23720 }
23721 \f
23722 /* Clear stack slot assignments remembered from previous functions.
23723 This is called from INIT_EXPANDERS once before RTL is emitted for each
23724 function. */
23725
23726 static struct machine_function *
23727 ix86_init_machine_status (void)
23728 {
23729 struct machine_function *f;
23730
23731 f = ggc_alloc_cleared_machine_function ();
23732 f->use_fast_prologue_epilogue_nregs = -1;
23733 f->call_abi = ix86_abi;
23734
23735 return f;
23736 }
23737
23738 /* Return a MEM corresponding to a stack slot with mode MODE.
23739 Allocate a new slot if necessary.
23740
23741 The RTL for a function can have several slots available: N is
23742 which slot to use. */
23743
23744 rtx
23745 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23746 {
23747 struct stack_local_entry *s;
23748
23749 gcc_assert (n < MAX_386_STACK_LOCALS);
23750
23751 for (s = ix86_stack_locals; s; s = s->next)
23752 if (s->mode == mode && s->n == n)
23753 return validize_mem (copy_rtx (s->rtl));
23754
23755 s = ggc_alloc_stack_local_entry ();
23756 s->n = n;
23757 s->mode = mode;
23758 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23759
23760 s->next = ix86_stack_locals;
23761 ix86_stack_locals = s;
23762 return validize_mem (s->rtl);
23763 }
23764
23765 static void
23766 ix86_instantiate_decls (void)
23767 {
23768 struct stack_local_entry *s;
23769
23770 for (s = ix86_stack_locals; s; s = s->next)
23771 if (s->rtl != NULL_RTX)
23772 instantiate_decl_rtl (s->rtl);
23773 }
23774 \f
23775 /* Calculate the length of the memory address in the instruction encoding.
23776 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23777 or other prefixes. We never generate addr32 prefix for LEA insn. */
23778
23779 int
23780 memory_address_length (rtx addr, bool lea)
23781 {
23782 struct ix86_address parts;
23783 rtx base, index, disp;
23784 int len;
23785 int ok;
23786
23787 if (GET_CODE (addr) == PRE_DEC
23788 || GET_CODE (addr) == POST_INC
23789 || GET_CODE (addr) == PRE_MODIFY
23790 || GET_CODE (addr) == POST_MODIFY)
23791 return 0;
23792
23793 ok = ix86_decompose_address (addr, &parts);
23794 gcc_assert (ok);
23795
23796 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23797
23798 /* If this is not LEA instruction, add the length of addr32 prefix. */
23799 if (TARGET_64BIT && !lea
23800 && (SImode_address_operand (addr, VOIDmode)
23801 || (parts.base && GET_MODE (parts.base) == SImode)
23802 || (parts.index && GET_MODE (parts.index) == SImode)))
23803 len++;
23804
23805 base = parts.base;
23806 index = parts.index;
23807 disp = parts.disp;
23808
23809 if (base && GET_CODE (base) == SUBREG)
23810 base = SUBREG_REG (base);
23811 if (index && GET_CODE (index) == SUBREG)
23812 index = SUBREG_REG (index);
23813
23814 gcc_assert (base == NULL_RTX || REG_P (base));
23815 gcc_assert (index == NULL_RTX || REG_P (index));
23816
23817 /* Rule of thumb:
23818 - esp as the base always wants an index,
23819 - ebp as the base always wants a displacement,
23820 - r12 as the base always wants an index,
23821 - r13 as the base always wants a displacement. */
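/* For example, (%esp) and (%r12) can only be encoded with a SIB byte
(mod 00, r/m 100), while (%ebp) and (%r13) have no plain register-indirect
form and are emitted with a one-byte displacement of zero (mod 01); either
way the address costs one extra byte. */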
23822
23823 /* Register Indirect. */
23824 if (base && !index && !disp)
23825 {
23826 /* esp (for its index) and ebp (for its displacement) need
23827 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23828 code. */
23829 if (base == arg_pointer_rtx
23830 || base == frame_pointer_rtx
23831 || REGNO (base) == SP_REG
23832 || REGNO (base) == BP_REG
23833 || REGNO (base) == R12_REG
23834 || REGNO (base) == R13_REG)
23835 len++;
23836 }
23837
23838 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23839 is not disp32, but disp32(%rip), so a SIB byte is
23840 needed for plain disp32, unless print_operand_address
23841 optimizes it into disp32(%rip) or (%rip) is implied
23842 by UNSPEC. */
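/* For example, in 64-bit mode a plain absolute 32-bit address must be
encoded as mod 00, r/m 100 plus a SIB byte with no base and no index,
one byte longer than the %rip-relative form used for local symbols. */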
23843 else if (disp && !base && !index)
23844 {
23845 len += 4;
23846 if (TARGET_64BIT)
23847 {
23848 rtx symbol = disp;
23849
23850 if (GET_CODE (disp) == CONST)
23851 symbol = XEXP (disp, 0);
23852 if (GET_CODE (symbol) == PLUS
23853 && CONST_INT_P (XEXP (symbol, 1)))
23854 symbol = XEXP (symbol, 0);
23855
23856 if (GET_CODE (symbol) != LABEL_REF
23857 && (GET_CODE (symbol) != SYMBOL_REF
23858 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23859 && (GET_CODE (symbol) != UNSPEC
23860 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23861 && XINT (symbol, 1) != UNSPEC_PCREL
23862 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23863 len++;
23864 }
23865 }
23866 else
23867 {
23868 /* Find the length of the displacement constant. */
23869 if (disp)
23870 {
23871 if (base && satisfies_constraint_K (disp))
23872 len += 1;
23873 else
23874 len += 4;
23875 }
23876 /* ebp always wants a displacement. Similarly r13. */
23877 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23878 len++;
23879
23880 /* An index requires the two-byte modrm form.... */
23881 if (index
23882 /* ...like esp (or r12), which always wants an index. */
23883 || base == arg_pointer_rtx
23884 || base == frame_pointer_rtx
23885 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23886 len++;
23887 }
23888
23889 return len;
23890 }
23891
23892 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23893 is set, expect that the insn has an 8-bit immediate alternative. */
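/* For example, with SHORTFORM set, "add $4, %eax" matches the sign-extended
8-bit immediate alternative and contributes 1 byte, while "add $1000, %eax"
needs the full 32-bit immediate and contributes 4. DImode arithmetic
immediates also contribute 4 bytes, since they are encoded as sign-extended
32-bit values. */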
23894 int
23895 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23896 {
23897 int len = 0;
23898 int i;
23899 extract_insn_cached (insn);
23900 for (i = recog_data.n_operands - 1; i >= 0; --i)
23901 if (CONSTANT_P (recog_data.operand[i]))
23902 {
23903 enum attr_mode mode = get_attr_mode (insn);
23904
23905 gcc_assert (!len);
23906 if (shortform && CONST_INT_P (recog_data.operand[i]))
23907 {
23908 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23909 switch (mode)
23910 {
23911 case MODE_QI:
23912 len = 1;
23913 continue;
23914 case MODE_HI:
23915 ival = trunc_int_for_mode (ival, HImode);
23916 break;
23917 case MODE_SI:
23918 ival = trunc_int_for_mode (ival, SImode);
23919 break;
23920 default:
23921 break;
23922 }
23923 if (IN_RANGE (ival, -128, 127))
23924 {
23925 len = 1;
23926 continue;
23927 }
23928 }
23929 switch (mode)
23930 {
23931 case MODE_QI:
23932 len = 1;
23933 break;
23934 case MODE_HI:
23935 len = 2;
23936 break;
23937 case MODE_SI:
23938 len = 4;
23939 break;
23940 /* Immediates for DImode instructions are encoded
23941 as 32bit sign extended values. */
23942 case MODE_DI:
23943 len = 4;
23944 break;
23945 default:
23946 fatal_insn ("unknown insn mode", insn);
23947 }
23948 }
23949 return len;
23950 }
23951
23952 /* Compute default value for "length_address" attribute. */
23953 int
23954 ix86_attr_length_address_default (rtx insn)
23955 {
23956 int i;
23957
23958 if (get_attr_type (insn) == TYPE_LEA)
23959 {
23960 rtx set = PATTERN (insn), addr;
23961
23962 if (GET_CODE (set) == PARALLEL)
23963 set = XVECEXP (set, 0, 0);
23964
23965 gcc_assert (GET_CODE (set) == SET);
23966
23967 addr = SET_SRC (set);
23968
23969 return memory_address_length (addr, true);
23970 }
23971
23972 extract_insn_cached (insn);
23973 for (i = recog_data.n_operands - 1; i >= 0; --i)
23974 if (MEM_P (recog_data.operand[i]))
23975 {
23976 constrain_operands_cached (reload_completed);
23977 if (which_alternative != -1)
23978 {
23979 const char *constraints = recog_data.constraints[i];
23980 int alt = which_alternative;
23981
23982 while (*constraints == '=' || *constraints == '+')
23983 constraints++;
23984 while (alt-- > 0)
23985 while (*constraints++ != ',')
23986 ;
23987 /* Skip ignored operands. */
23988 if (*constraints == 'X')
23989 continue;
23990 }
23991 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23992 }
23993 return 0;
23994 }
23995
23996 /* Compute default value for "length_vex" attribute. It includes
23997 2 or 3 byte VEX prefix and 1 opcode byte. */
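/* For example, "vaddps %xmm1, %xmm2, %xmm0" fits the two-byte (C5) VEX
prefix, while an insn that needs VEX.W, an opcode map other than 0F, or the
REX.X/REX.B register extensions (a DImode general register operand, or a
memory operand whose address uses %r8-%r15) must use the three-byte (C4)
form. */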
23998
23999 int
24000 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24001 {
24002 int i;
24003
24004 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
24005 bit requires the 3-byte VEX prefix. */
24006 if (!has_0f_opcode || has_vex_w)
24007 return 3 + 1;
24008
24009 /* We can always use 2 byte VEX prefix in 32bit. */
24010 if (!TARGET_64BIT)
24011 return 2 + 1;
24012
24013 extract_insn_cached (insn);
24014
24015 for (i = recog_data.n_operands - 1; i >= 0; --i)
24016 if (REG_P (recog_data.operand[i]))
24017 {
24018 /* REX.W bit uses 3 byte VEX prefix. */
24019 if (GET_MODE (recog_data.operand[i]) == DImode
24020 && GENERAL_REG_P (recog_data.operand[i]))
24021 return 3 + 1;
24022 }
24023 else
24024 {
24025 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24026 if (MEM_P (recog_data.operand[i])
24027 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24028 return 3 + 1;
24029 }
24030
24031 return 2 + 1;
24032 }
24033 \f
24034 /* Return the maximum number of instructions a cpu can issue. */
24035
24036 static int
24037 ix86_issue_rate (void)
24038 {
24039 switch (ix86_tune)
24040 {
24041 case PROCESSOR_PENTIUM:
24042 case PROCESSOR_ATOM:
24043 case PROCESSOR_K6:
24044 case PROCESSOR_BTVER2:
24045 return 2;
24046
24047 case PROCESSOR_PENTIUMPRO:
24048 case PROCESSOR_PENTIUM4:
24049 case PROCESSOR_CORE2:
24050 case PROCESSOR_COREI7:
24051 case PROCESSOR_ATHLON:
24052 case PROCESSOR_K8:
24053 case PROCESSOR_AMDFAM10:
24054 case PROCESSOR_NOCONA:
24055 case PROCESSOR_GENERIC32:
24056 case PROCESSOR_GENERIC64:
24057 case PROCESSOR_BDVER1:
24058 case PROCESSOR_BDVER2:
24059 case PROCESSOR_BDVER3:
24060 case PROCESSOR_BTVER1:
24061 return 3;
24062
24063 default:
24064 return 1;
24065 }
24066 }
24067
24068 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24069 by DEP_INSN and nothing else set by DEP_INSN. */
24070
24071 static bool
24072 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24073 {
24074 rtx set, set2;
24075
24076 /* Simplify the test for uninteresting insns. */
24077 if (insn_type != TYPE_SETCC
24078 && insn_type != TYPE_ICMOV
24079 && insn_type != TYPE_FCMOV
24080 && insn_type != TYPE_IBR)
24081 return false;
24082
24083 if ((set = single_set (dep_insn)) != 0)
24084 {
24085 set = SET_DEST (set);
24086 set2 = NULL_RTX;
24087 }
24088 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24089 && XVECLEN (PATTERN (dep_insn), 0) == 2
24090 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24091 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24092 {
24093 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24094 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24095 }
24096 else
24097 return false;
24098
24099 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24100 return false;
24101
24102 /* This test is true if the dependent insn reads the flags but
24103 not any other potentially set register. */
24104 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24105 return false;
24106
24107 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24108 return false;
24109
24110 return true;
24111 }
24112
24113 /* Return true iff USE_INSN has a memory address with operands set by
24114 SET_INSN. */
24115
24116 bool
24117 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24118 {
24119 int i;
24120 extract_insn_cached (use_insn);
24121 for (i = recog_data.n_operands - 1; i >= 0; --i)
24122 if (MEM_P (recog_data.operand[i]))
24123 {
24124 rtx addr = XEXP (recog_data.operand[i], 0);
24125 return modified_in_p (addr, set_insn) != 0;
24126 }
24127 return false;
24128 }
24129
24130 static int
24131 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24132 {
24133 enum attr_type insn_type, dep_insn_type;
24134 enum attr_memory memory;
24135 rtx set, set2;
24136 int dep_insn_code_number;
24137
24138 /* Anti and output dependencies have zero cost on all CPUs. */
24139 if (REG_NOTE_KIND (link) != 0)
24140 return 0;
24141
24142 dep_insn_code_number = recog_memoized (dep_insn);
24143
24144 /* If we can't recognize the insns, we can't really do anything. */
24145 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24146 return cost;
24147
24148 insn_type = get_attr_type (insn);
24149 dep_insn_type = get_attr_type (dep_insn);
24150
24151 switch (ix86_tune)
24152 {
24153 case PROCESSOR_PENTIUM:
24154 /* Address Generation Interlock adds a cycle of latency. */
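/* For example, "mov 4(%eax), %ebx" issued right after an instruction that
writes %eax pays this extra address-generation cycle. */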
24155 if (insn_type == TYPE_LEA)
24156 {
24157 rtx addr = PATTERN (insn);
24158
24159 if (GET_CODE (addr) == PARALLEL)
24160 addr = XVECEXP (addr, 0, 0);
24161
24162 gcc_assert (GET_CODE (addr) == SET);
24163
24164 addr = SET_SRC (addr);
24165 if (modified_in_p (addr, dep_insn))
24166 cost += 1;
24167 }
24168 else if (ix86_agi_dependent (dep_insn, insn))
24169 cost += 1;
24170
24171 /* ??? Compares pair with jump/setcc. */
24172 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24173 cost = 0;
24174
24175 /* Floating point stores require value to be ready one cycle earlier. */
24176 if (insn_type == TYPE_FMOV
24177 && get_attr_memory (insn) == MEMORY_STORE
24178 && !ix86_agi_dependent (dep_insn, insn))
24179 cost += 1;
24180 break;
24181
24182 case PROCESSOR_PENTIUMPRO:
24183 memory = get_attr_memory (insn);
24184
24185 /* INT->FP conversion is expensive. */
24186 if (get_attr_fp_int_src (dep_insn))
24187 cost += 5;
24188
24189 /* There is one cycle extra latency between an FP op and a store. */
24190 if (insn_type == TYPE_FMOV
24191 && (set = single_set (dep_insn)) != NULL_RTX
24192 && (set2 = single_set (insn)) != NULL_RTX
24193 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24194 && MEM_P (SET_DEST (set2)))
24195 cost += 1;
24196
24197 /* Model the reorder buffer's ability to hide the latency of a load by
24198 executing it in parallel with the previous instruction when the
24199 previous instruction is not needed to compute the address. */
24200 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24201 && !ix86_agi_dependent (dep_insn, insn))
24202 {
24203 /* Claim moves to take one cycle, as the core can issue one load
24204 at a time and the next load can start a cycle later. */
24205 if (dep_insn_type == TYPE_IMOV
24206 || dep_insn_type == TYPE_FMOV)
24207 cost = 1;
24208 else if (cost > 1)
24209 cost--;
24210 }
24211 break;
24212
24213 case PROCESSOR_K6:
24214 memory = get_attr_memory (insn);
24215
24216 /* The esp dependency is resolved before the instruction is really
24217 finished. */
24218 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24219 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24220 return 1;
24221
24222 /* INT->FP conversion is expensive. */
24223 if (get_attr_fp_int_src (dep_insn))
24224 cost += 5;
24225
24226 /* Model the reorder buffer's ability to hide the latency of a load by
24227 executing it in parallel with the previous instruction when the
24228 previous instruction is not needed to compute the address. */
24229 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24230 && !ix86_agi_dependent (dep_insn, insn))
24231 {
24232 /* Claim moves to take one cycle, as the core can issue one load
24233 at a time and the next load can start a cycle later. */
24234 if (dep_insn_type == TYPE_IMOV
24235 || dep_insn_type == TYPE_FMOV)
24236 cost = 1;
24237 else if (cost > 2)
24238 cost -= 2;
24239 else
24240 cost = 1;
24241 }
24242 break;
24243
24244 case PROCESSOR_ATHLON:
24245 case PROCESSOR_K8:
24246 case PROCESSOR_AMDFAM10:
24247 case PROCESSOR_BDVER1:
24248 case PROCESSOR_BDVER2:
24249 case PROCESSOR_BDVER3:
24250 case PROCESSOR_BTVER1:
24251 case PROCESSOR_BTVER2:
24252 case PROCESSOR_ATOM:
24253 case PROCESSOR_GENERIC32:
24254 case PROCESSOR_GENERIC64:
24255 memory = get_attr_memory (insn);
24256
24257 /* Model the reorder buffer's ability to hide the latency of a load by
24258 executing it in parallel with the previous instruction when the
24259 previous instruction is not needed to compute the address. */
24260 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24261 && !ix86_agi_dependent (dep_insn, insn))
24262 {
24263 enum attr_unit unit = get_attr_unit (insn);
24264 int loadcost = 3;
24265
24266 /* Because of the difference between the length of integer and
24267 floating unit pipeline preparation stages, the memory operands
24268 for floating point are cheaper.
24269
24270 ??? For Athlon the difference is most probably 2. */
24271 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24272 loadcost = 3;
24273 else
24274 loadcost = TARGET_ATHLON ? 2 : 0;
24275
24276 if (cost >= loadcost)
24277 cost -= loadcost;
24278 else
24279 cost = 0;
24280 }
24281
24282 default:
24283 break;
24284 }
24285
24286 return cost;
24287 }
24288
24289 /* How many alternative schedules to try. This should be as wide as the
24290 scheduling freedom in the DFA, but no wider. Making this value too
24291 large results in extra work for the scheduler. */
24292
24293 static int
24294 ia32_multipass_dfa_lookahead (void)
24295 {
24296 switch (ix86_tune)
24297 {
24298 case PROCESSOR_PENTIUM:
24299 return 2;
24300
24301 case PROCESSOR_PENTIUMPRO:
24302 case PROCESSOR_K6:
24303 return 1;
24304
24305 case PROCESSOR_CORE2:
24306 case PROCESSOR_COREI7:
24307 case PROCESSOR_ATOM:
24308 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24309 as the number of instructions that can be executed in one cycle,
24310 i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */
24311 if (reload_completed)
24312 return ix86_issue_rate ();
24313 /* Don't use lookahead for pre-reload schedule to save compile time. */
24314 return 0;
24315
24316 default:
24317 return 0;
24318 }
24319 }
24320
24321 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24322 execution. It applies when
24323 (1) an IMUL instruction is at the top of the list, and
24324 (2) the ready list contains exactly one producer of an independent
24325 IMUL instruction;
24326 (3) in that case the producer found is put at the top of the ready list.
24327 Returns the issue rate. */
24328
24329 static int
24330 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24331 int clock_var ATTRIBUTE_UNUSED)
24332 {
24333 static int issue_rate = -1;
24334 int n_ready = *pn_ready;
24335 rtx insn, insn1, insn2;
24336 int i;
24337 sd_iterator_def sd_it;
24338 dep_t dep;
24339 int index = -1;
24340
24341 /* Set up issue rate. */
24342 issue_rate = ix86_issue_rate();
24343
24344 /* Do reordering for Atom only. */
24345 if (ix86_tune != PROCESSOR_ATOM)
24346 return issue_rate;
24347 /* Do not perform ready list reordering for the pre-reload scheduling pass. */
24348 if (!reload_completed)
24349 return issue_rate;
24350 /* Nothing to do if ready list contains only 1 instruction. */
24351 if (n_ready <= 1)
24352 return issue_rate;
24353
24354 /* Check that IMUL instruction is on the top of ready list. */
24355 insn = ready[n_ready - 1];
24356 if (!NONDEBUG_INSN_P (insn))
24357 return issue_rate;
24358 insn = PATTERN (insn);
24359 if (GET_CODE (insn) == PARALLEL)
24360 insn = XVECEXP (insn, 0, 0);
24361 if (GET_CODE (insn) != SET)
24362 return issue_rate;
24363 if (!(GET_CODE (SET_SRC (insn)) == MULT
24364 && GET_MODE (SET_SRC (insn)) == SImode))
24365 return issue_rate;
24366
24367 /* Search for producer of independent IMUL instruction. */
24368 for (i = n_ready - 2; i >= 0; i--)
24369 {
24370 insn = ready[i];
24371 if (!NONDEBUG_INSN_P (insn))
24372 continue;
24373 /* Skip IMUL instruction. */
24374 insn2 = PATTERN (insn);
24375 if (GET_CODE (insn2) == PARALLEL)
24376 insn2 = XVECEXP (insn2, 0, 0);
24377 if (GET_CODE (insn2) == SET
24378 && GET_CODE (SET_SRC (insn2)) == MULT
24379 && GET_MODE (SET_SRC (insn2)) == SImode)
24380 continue;
24381
24382 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24383 {
24384 rtx con;
24385 con = DEP_CON (dep);
24386 if (!NONDEBUG_INSN_P (con))
24387 continue;
24388 insn1 = PATTERN (con);
24389 if (GET_CODE (insn1) == PARALLEL)
24390 insn1 = XVECEXP (insn1, 0, 0);
24391
24392 if (GET_CODE (insn1) == SET
24393 && GET_CODE (SET_SRC (insn1)) == MULT
24394 && GET_MODE (SET_SRC (insn1)) == SImode)
24395 {
24396 sd_iterator_def sd_it1;
24397 dep_t dep1;
24398 /* Check that there is no producer of the IMUL other than this insn. */
24399 index = i;
24400 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24401 {
24402 rtx pro;
24403 pro = DEP_PRO (dep1);
24404 if (!NONDEBUG_INSN_P (pro))
24405 continue;
24406 if (pro != insn)
24407 index = -1;
24408 }
24409 if (index >= 0)
24410 break;
24411 }
24412 }
24413 if (index >= 0)
24414 break;
24415 }
24416 if (index < 0)
24417 return issue_rate; /* Didn't find IMUL producer. */
24418
24419 if (sched_verbose > 1)
24420 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24421 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24422
24423 /* Put IMUL producer (ready[index]) at the top of ready list. */
24424 insn1 = ready[index];
24425 for (i = index; i < n_ready - 1; i++)
24426 ready[i] = ready[i + 1];
24427 ready[n_ready - 1] = insn1;
24428
24429 return issue_rate;
24430 }
24431
24432 static bool
24433 ix86_class_likely_spilled_p (reg_class_t);
24434
24435 /* Return true if the LHS of INSN is a HW function argument register, and set
24436 IS_SPILLED to true if it is a likely-spilled HW register. */
24437 static bool
24438 insn_is_function_arg (rtx insn, bool* is_spilled)
24439 {
24440 rtx dst;
24441
24442 if (!NONDEBUG_INSN_P (insn))
24443 return false;
24444 /* Call instructions are not movable; ignore them. */
24445 if (CALL_P (insn))
24446 return false;
24447 insn = PATTERN (insn);
24448 if (GET_CODE (insn) == PARALLEL)
24449 insn = XVECEXP (insn, 0, 0);
24450 if (GET_CODE (insn) != SET)
24451 return false;
24452 dst = SET_DEST (insn);
24453 if (REG_P (dst) && HARD_REGISTER_P (dst)
24454 && ix86_function_arg_regno_p (REGNO (dst)))
24455 {
24456 /* Is it likely spilled HW register? */
24457 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24458 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24459 *is_spilled = true;
24460 return true;
24461 }
24462 return false;
24463 }
24464
24465 /* Add output dependencies for a chain of adjacent function arguments, but only
24466 if there is a move to a likely-spilled HW register. Return the first argument
24467 if at least one dependence was added, or NULL otherwise. */
24468 static rtx
24469 add_parameter_dependencies (rtx call, rtx head)
24470 {
24471 rtx insn;
24472 rtx last = call;
24473 rtx first_arg = NULL;
24474 bool is_spilled = false;
24475
24476 head = PREV_INSN (head);
24477
24478 /* Find the argument-passing instruction nearest to the call. */
24479 while (true)
24480 {
24481 last = PREV_INSN (last);
24482 if (last == head)
24483 return NULL;
24484 if (!NONDEBUG_INSN_P (last))
24485 continue;
24486 if (insn_is_function_arg (last, &is_spilled))
24487 break;
24488 return NULL;
24489 }
24490
24491 first_arg = last;
24492 while (true)
24493 {
24494 insn = PREV_INSN (last);
24495 if (!INSN_P (insn))
24496 break;
24497 if (insn == head)
24498 break;
24499 if (!NONDEBUG_INSN_P (insn))
24500 {
24501 last = insn;
24502 continue;
24503 }
24504 if (insn_is_function_arg (insn, &is_spilled))
24505 {
24506 /* Add an output dependence between two function arguments if the chain
24507 of output arguments contains likely-spilled HW registers. */
24508 if (is_spilled)
24509 add_dependence (last, insn, REG_DEP_OUTPUT);
24510 first_arg = last = insn;
24511 }
24512 else
24513 break;
24514 }
24515 if (!is_spilled)
24516 return NULL;
24517 return first_arg;
24518 }
24519
24520 /* Add output or anti dependency from insn to first_arg to restrict its code
24521 motion. */
24522 static void
24523 avoid_func_arg_motion (rtx first_arg, rtx insn)
24524 {
24525 rtx set;
24526 rtx tmp;
24527
24528 set = single_set (insn);
24529 if (!set)
24530 return;
24531 tmp = SET_DEST (set);
24532 if (REG_P (tmp))
24533 {
24534 /* Add output dependency to the first function argument. */
24535 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24536 return;
24537 }
24538 /* Add anti dependency. */
24539 add_dependence (first_arg, insn, REG_DEP_ANTI);
24540 }
24541
24542 /* Avoid cross-block motion of a function argument by adding a dependency
24543 from the first non-jump instruction in BB. */
24544 static void
24545 add_dependee_for_func_arg (rtx arg, basic_block bb)
24546 {
24547 rtx insn = BB_END (bb);
24548
24549 while (insn)
24550 {
24551 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24552 {
24553 rtx set = single_set (insn);
24554 if (set)
24555 {
24556 avoid_func_arg_motion (arg, insn);
24557 return;
24558 }
24559 }
24560 if (insn == BB_HEAD (bb))
24561 return;
24562 insn = PREV_INSN (insn);
24563 }
24564 }
24565
24566 /* Hook for pre-reload schedule - avoid motion of function arguments
24567 passed in likely spilled HW registers. */
24568 static void
24569 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24570 {
24571 rtx insn;
24572 rtx first_arg = NULL;
24573 if (reload_completed)
24574 return;
24575 while (head != tail && DEBUG_INSN_P (head))
24576 head = NEXT_INSN (head);
24577 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24578 if (INSN_P (insn) && CALL_P (insn))
24579 {
24580 first_arg = add_parameter_dependencies (insn, head);
24581 if (first_arg)
24582 {
24583 /* Add a dependee for the first argument to predecessors, but only if the
24584 region contains more than one block. */
24585 basic_block bb = BLOCK_FOR_INSN (insn);
24586 int rgn = CONTAINING_RGN (bb->index);
24587 int nr_blks = RGN_NR_BLOCKS (rgn);
24588 /* Skip trivial regions and region head blocks that can have
24589 predecessors outside of region. */
24590 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24591 {
24592 edge e;
24593 edge_iterator ei;
24594 /* Assume that region is SCC, i.e. all immediate predecessors
24595 of non-head block are in the same region. */
24596 FOR_EACH_EDGE (e, ei, bb->preds)
24597 {
24598 /* Avoid creating loop-carried dependencies by using the
24599 topological ordering of the region. */
24600 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24601 add_dependee_for_func_arg (first_arg, e->src);
24602 }
24603 }
24604 insn = first_arg;
24605 if (insn == head)
24606 break;
24607 }
24608 }
24609 else if (first_arg)
24610 avoid_func_arg_motion (first_arg, insn);
24611 }
24612
24613 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24614 HW registers to the maximum, to schedule them as soon as possible. These are
24615 moves from function argument registers at the top of the function entry
24616 and moves from function return value registers after call. */
24617 static int
24618 ix86_adjust_priority (rtx insn, int priority)
24619 {
24620 rtx set;
24621
24622 if (reload_completed)
24623 return priority;
24624
24625 if (!NONDEBUG_INSN_P (insn))
24626 return priority;
24627
24628 set = single_set (insn);
24629 if (set)
24630 {
24631 rtx tmp = SET_SRC (set);
24632 if (REG_P (tmp)
24633 && HARD_REGISTER_P (tmp)
24634 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24635 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24636 return current_sched_info->sched_max_insns_priority;
24637 }
24638
24639 return priority;
24640 }
24641
24642 /* Model the decoder of Core 2/i7.
24643 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24644 track the instruction fetch block boundaries and make sure that long
24645 (9+ byte) instructions are assigned to D0. */
24646
24647 /* Maximum length of an insn that can be handled by
24648 a secondary decoder unit. '8' for Core 2/i7. */
24649 static int core2i7_secondary_decoder_max_insn_size;
24650
24651 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24652 '16' for Core 2/i7. */
24653 static int core2i7_ifetch_block_size;
24654
24655 /* Maximum number of instructions decoder can handle per cycle.
24656 '6' for Core 2/i7. */
24657 static int core2i7_ifetch_block_max_insns;
24658
24659 typedef struct ix86_first_cycle_multipass_data_ *
24660 ix86_first_cycle_multipass_data_t;
24661 typedef const struct ix86_first_cycle_multipass_data_ *
24662 const_ix86_first_cycle_multipass_data_t;
24663
24664 /* A variable to store target state across calls to max_issue within
24665 one cycle. */
24666 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24667 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24668
24669 /* Initialize DATA. */
24670 static void
24671 core2i7_first_cycle_multipass_init (void *_data)
24672 {
24673 ix86_first_cycle_multipass_data_t data
24674 = (ix86_first_cycle_multipass_data_t) _data;
24675
24676 data->ifetch_block_len = 0;
24677 data->ifetch_block_n_insns = 0;
24678 data->ready_try_change = NULL;
24679 data->ready_try_change_size = 0;
24680 }
24681
24682 /* Advancing the cycle; reset ifetch block counts. */
24683 static void
24684 core2i7_dfa_post_advance_cycle (void)
24685 {
24686 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24687
24688 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24689
24690 data->ifetch_block_len = 0;
24691 data->ifetch_block_n_insns = 0;
24692 }
24693
24694 static int min_insn_size (rtx);
24695
24696 /* Filter out insns from ready_try that the core will not be able to issue
24697 on current cycle due to decoder. */
24698 static void
24699 core2i7_first_cycle_multipass_filter_ready_try
24700 (const_ix86_first_cycle_multipass_data_t data,
24701 char *ready_try, int n_ready, bool first_cycle_insn_p)
24702 {
24703 while (n_ready--)
24704 {
24705 rtx insn;
24706 int insn_size;
24707
24708 if (ready_try[n_ready])
24709 continue;
24710
24711 insn = get_ready_element (n_ready);
24712 insn_size = min_insn_size (insn);
24713
24714 if (/* If this is too long an insn for a secondary decoder ... */
24715 (!first_cycle_insn_p
24716 && insn_size > core2i7_secondary_decoder_max_insn_size)
24717 /* ... or it would not fit into the ifetch block ... */
24718 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24719 /* ... or the decoder is full already ... */
24720 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24721 /* ... mask the insn out. */
24722 {
24723 ready_try[n_ready] = 1;
24724
24725 if (data->ready_try_change)
24726 bitmap_set_bit (data->ready_try_change, n_ready);
24727 }
24728 }
24729 }
24730
24731 /* Prepare for a new round of multipass lookahead scheduling. */
24732 static void
24733 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24734 bool first_cycle_insn_p)
24735 {
24736 ix86_first_cycle_multipass_data_t data
24737 = (ix86_first_cycle_multipass_data_t) _data;
24738 const_ix86_first_cycle_multipass_data_t prev_data
24739 = ix86_first_cycle_multipass_data;
24740
24741 /* Restore the state from the end of the previous round. */
24742 data->ifetch_block_len = prev_data->ifetch_block_len;
24743 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24744
24745 /* Filter instructions that cannot be issued on current cycle due to
24746 decoder restrictions. */
24747 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24748 first_cycle_insn_p);
24749 }
24750
24751 /* INSN is being issued in current solution. Account for its impact on
24752 the decoder model. */
24753 static void
24754 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24755 rtx insn, const void *_prev_data)
24756 {
24757 ix86_first_cycle_multipass_data_t data
24758 = (ix86_first_cycle_multipass_data_t) _data;
24759 const_ix86_first_cycle_multipass_data_t prev_data
24760 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24761
24762 int insn_size = min_insn_size (insn);
24763
24764 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24765 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24766 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24767 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24768
24769 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24770 if (!data->ready_try_change)
24771 {
24772 data->ready_try_change = sbitmap_alloc (n_ready);
24773 data->ready_try_change_size = n_ready;
24774 }
24775 else if (data->ready_try_change_size < n_ready)
24776 {
24777 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24778 n_ready, 0);
24779 data->ready_try_change_size = n_ready;
24780 }
24781 bitmap_clear (data->ready_try_change);
24782
24783 /* Filter out insns from ready_try that the core will not be able to issue
24784 on current cycle due to decoder. */
24785 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24786 false);
24787 }
24788
24789 /* Revert the effect on ready_try. */
24790 static void
24791 core2i7_first_cycle_multipass_backtrack (const void *_data,
24792 char *ready_try,
24793 int n_ready ATTRIBUTE_UNUSED)
24794 {
24795 const_ix86_first_cycle_multipass_data_t data
24796 = (const_ix86_first_cycle_multipass_data_t) _data;
24797 unsigned int i = 0;
24798 sbitmap_iterator sbi;
24799
24800 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24801 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24802 {
24803 ready_try[i] = 0;
24804 }
24805 }
24806
24807 /* Save the result of multipass lookahead scheduling for the next round. */
24808 static void
24809 core2i7_first_cycle_multipass_end (const void *_data)
24810 {
24811 const_ix86_first_cycle_multipass_data_t data
24812 = (const_ix86_first_cycle_multipass_data_t) _data;
24813 ix86_first_cycle_multipass_data_t next_data
24814 = ix86_first_cycle_multipass_data;
24815
24816 if (data != NULL)
24817 {
24818 next_data->ifetch_block_len = data->ifetch_block_len;
24819 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24820 }
24821 }
24822
24823 /* Deallocate target data. */
24824 static void
24825 core2i7_first_cycle_multipass_fini (void *_data)
24826 {
24827 ix86_first_cycle_multipass_data_t data
24828 = (ix86_first_cycle_multipass_data_t) _data;
24829
24830 if (data->ready_try_change)
24831 {
24832 sbitmap_free (data->ready_try_change);
24833 data->ready_try_change = NULL;
24834 data->ready_try_change_size = 0;
24835 }
24836 }
24837
24838 /* Prepare for scheduling pass. */
24839 static void
24840 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24841 int verbose ATTRIBUTE_UNUSED,
24842 int max_uid ATTRIBUTE_UNUSED)
24843 {
24844 /* Install scheduling hooks for current CPU. Some of these hooks are used
24845 in time-critical parts of the scheduler, so we only set them up when
24846 they are actually used. */
24847 switch (ix86_tune)
24848 {
24849 case PROCESSOR_CORE2:
24850 case PROCESSOR_COREI7:
24851 /* Do not perform multipass scheduling for pre-reload schedule
24852 to save compile time. */
24853 if (reload_completed)
24854 {
24855 targetm.sched.dfa_post_advance_cycle
24856 = core2i7_dfa_post_advance_cycle;
24857 targetm.sched.first_cycle_multipass_init
24858 = core2i7_first_cycle_multipass_init;
24859 targetm.sched.first_cycle_multipass_begin
24860 = core2i7_first_cycle_multipass_begin;
24861 targetm.sched.first_cycle_multipass_issue
24862 = core2i7_first_cycle_multipass_issue;
24863 targetm.sched.first_cycle_multipass_backtrack
24864 = core2i7_first_cycle_multipass_backtrack;
24865 targetm.sched.first_cycle_multipass_end
24866 = core2i7_first_cycle_multipass_end;
24867 targetm.sched.first_cycle_multipass_fini
24868 = core2i7_first_cycle_multipass_fini;
24869
24870 /* Set decoder parameters. */
24871 core2i7_secondary_decoder_max_insn_size = 8;
24872 core2i7_ifetch_block_size = 16;
24873 core2i7_ifetch_block_max_insns = 6;
24874 break;
24875 }
24876 /* ... Fall through ... */
24877 default:
24878 targetm.sched.dfa_post_advance_cycle = NULL;
24879 targetm.sched.first_cycle_multipass_init = NULL;
24880 targetm.sched.first_cycle_multipass_begin = NULL;
24881 targetm.sched.first_cycle_multipass_issue = NULL;
24882 targetm.sched.first_cycle_multipass_backtrack = NULL;
24883 targetm.sched.first_cycle_multipass_end = NULL;
24884 targetm.sched.first_cycle_multipass_fini = NULL;
24885 break;
24886 }
24887 }
24888
24889 \f
24890 /* Compute the alignment given to a constant that is being placed in memory.
24891 EXP is the constant and ALIGN is the alignment that the object would
24892 ordinarily have.
24893 The value of this function is used instead of that alignment to align
24894 the object. */
24895
24896 int
24897 ix86_constant_alignment (tree exp, int align)
24898 {
24899 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24900 || TREE_CODE (exp) == INTEGER_CST)
24901 {
24902 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24903 return 64;
24904 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24905 return 128;
24906 }
24907 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24908 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24909 return BITS_PER_WORD;
24910
24911 return align;
24912 }
24913
24914 /* Compute the alignment for a static variable.
24915 TYPE is the data type, and ALIGN is the alignment that
24916 the object would ordinarily have. The value of this function is used
24917 instead of that alignment to align the object. */
24918
24919 int
24920 ix86_data_alignment (tree type, int align)
24921 {
24922 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24923
24924 if (AGGREGATE_TYPE_P (type)
24925 && TYPE_SIZE (type)
24926 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24927 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24928 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24929 && align < max_align)
24930 align = max_align;
24931
24932 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24933 to a 16-byte boundary. */
24934 if (TARGET_64BIT)
24935 {
24936 if (AGGREGATE_TYPE_P (type)
24937 && TYPE_SIZE (type)
24938 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24939 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24940 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24941 return 128;
24942 }
24943
24944 if (TREE_CODE (type) == ARRAY_TYPE)
24945 {
24946 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24947 return 64;
24948 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24949 return 128;
24950 }
24951 else if (TREE_CODE (type) == COMPLEX_TYPE)
24952 {
24953
24954 if (TYPE_MODE (type) == DCmode && align < 64)
24955 return 64;
24956 if ((TYPE_MODE (type) == XCmode
24957 || TYPE_MODE (type) == TCmode) && align < 128)
24958 return 128;
24959 }
24960 else if ((TREE_CODE (type) == RECORD_TYPE
24961 || TREE_CODE (type) == UNION_TYPE
24962 || TREE_CODE (type) == QUAL_UNION_TYPE)
24963 && TYPE_FIELDS (type))
24964 {
24965 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24966 return 64;
24967 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24968 return 128;
24969 }
24970 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24971 || TREE_CODE (type) == INTEGER_TYPE)
24972 {
24973 if (TYPE_MODE (type) == DFmode && align < 64)
24974 return 64;
24975 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24976 return 128;
24977 }
24978
24979 return align;
24980 }
24981
24982 /* Compute the alignment for a local variable or a stack slot. EXP is
24983 the data type or decl itself, MODE is the widest mode available and
24984 ALIGN is the alignment that the object would ordinarily have. The
24985 value of this macro is used instead of that alignment to align the
24986 object. */
24987
24988 unsigned int
24989 ix86_local_alignment (tree exp, enum machine_mode mode,
24990 unsigned int align)
24991 {
24992 tree type, decl;
24993
24994 if (exp && DECL_P (exp))
24995 {
24996 type = TREE_TYPE (exp);
24997 decl = exp;
24998 }
24999 else
25000 {
25001 type = exp;
25002 decl = NULL;
25003 }
25004
25005 /* Don't do dynamic stack realignment for long long objects with
25006 -mpreferred-stack-boundary=2. */
25007 if (!TARGET_64BIT
25008 && align == 64
25009 && ix86_preferred_stack_boundary < 64
25010 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25011 && (!type || !TYPE_USER_ALIGN (type))
25012 && (!decl || !DECL_USER_ALIGN (decl)))
25013 align = 32;
25014
25015 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25016 register in MODE. We will return the largest alignment of XF
25017 and DF. */
25018 if (!type)
25019 {
25020 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25021 align = GET_MODE_ALIGNMENT (DFmode);
25022 return align;
25023 }
25024
25025 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
25026 to a 16-byte boundary. The exact wording is:
25027
25028 An array uses the same alignment as its elements, except that a local or
25029 global array variable of length at least 16 bytes or
25030 a C99 variable-length array variable always has alignment of at least 16 bytes.
25031
25032 This was added to allow use of aligned SSE instructions on arrays. This
25033 rule is meant for static storage (where the compiler cannot do the
25034 analysis by itself). We follow it for automatic variables only when it is
25035 convenient. We fully control everything in the function being compiled,
25036 and functions from other units cannot rely on the alignment.
25037
25038 Exclude the va_list type. It is the common case of a local array where
25039 we cannot benefit from the alignment. */
25040 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25041 && TARGET_SSE)
25042 {
25043 if (AGGREGATE_TYPE_P (type)
25044 && (va_list_type_node == NULL_TREE
25045 || (TYPE_MAIN_VARIANT (type)
25046 != TYPE_MAIN_VARIANT (va_list_type_node)))
25047 && TYPE_SIZE (type)
25048 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25049 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25050 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25051 return 128;
25052 }
25053 if (TREE_CODE (type) == ARRAY_TYPE)
25054 {
25055 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25056 return 64;
25057 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25058 return 128;
25059 }
25060 else if (TREE_CODE (type) == COMPLEX_TYPE)
25061 {
25062 if (TYPE_MODE (type) == DCmode && align < 64)
25063 return 64;
25064 if ((TYPE_MODE (type) == XCmode
25065 || TYPE_MODE (type) == TCmode) && align < 128)
25066 return 128;
25067 }
25068 else if ((TREE_CODE (type) == RECORD_TYPE
25069 || TREE_CODE (type) == UNION_TYPE
25070 || TREE_CODE (type) == QUAL_UNION_TYPE)
25071 && TYPE_FIELDS (type))
25072 {
25073 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25074 return 64;
25075 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25076 return 128;
25077 }
25078 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25079 || TREE_CODE (type) == INTEGER_TYPE)
25080 {
25081
25082 if (TYPE_MODE (type) == DFmode && align < 64)
25083 return 64;
25084 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25085 return 128;
25086 }
25087 return align;
25088 }
25089
25090 /* Compute the minimum required alignment for dynamic stack realignment
25091 purposes for a local variable, parameter or a stack slot. EXP is
25092 the data type or decl itself, MODE is its mode and ALIGN is the
25093 alignment that the object would ordinarily have. */
25094
25095 unsigned int
25096 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25097 unsigned int align)
25098 {
25099 tree type, decl;
25100
25101 if (exp && DECL_P (exp))
25102 {
25103 type = TREE_TYPE (exp);
25104 decl = exp;
25105 }
25106 else
25107 {
25108 type = exp;
25109 decl = NULL;
25110 }
25111
25112 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25113 return align;
25114
25115 /* Don't do dynamic stack realignment for long long objects with
25116 -mpreferred-stack-boundary=2. */
25117 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25118 && (!type || !TYPE_USER_ALIGN (type))
25119 && (!decl || !DECL_USER_ALIGN (decl)))
25120 return 32;
25121
25122 return align;
25123 }
25124 \f
25125 /* Find a location for the static chain incoming to a nested function.
25126 This is a register, unless all free registers are used by arguments. */
25127
25128 static rtx
25129 ix86_static_chain (const_tree fndecl, bool incoming_p)
25130 {
25131 unsigned regno;
25132
25133 if (!DECL_STATIC_CHAIN (fndecl))
25134 return NULL;
25135
25136 if (TARGET_64BIT)
25137 {
25138 /* We always use R10 in 64-bit mode. */
25139 regno = R10_REG;
25140 }
25141 else
25142 {
25143 tree fntype;
25144 unsigned int ccvt;
25145
25146 /* By default in 32-bit mode we use ECX to pass the static chain. */
25147 regno = CX_REG;
25148
25149 fntype = TREE_TYPE (fndecl);
25150 ccvt = ix86_get_callcvt (fntype);
25151 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25152 {
25153 /* Fastcall functions use ecx/edx for arguments, which leaves
25154 us with EAX for the static chain.
25155 Thiscall functions use ecx for arguments, which also
25156 leaves us with EAX for the static chain. */
25157 regno = AX_REG;
25158 }
25159 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25160 {
25161 /* Thiscall functions use ecx for arguments, which leaves
25162 us with EAX and EDX for the static chain.
25163 For ABI compatibility we use EAX. */
25164 regno = AX_REG;
25165 }
25166 else if (ix86_function_regparm (fntype, fndecl) == 3)
25167 {
25168 /* For regparm 3, we have no free call-clobbered registers in
25169 which to store the static chain. In order to implement this,
25170 we have the trampoline push the static chain to the stack.
25171 However, we can't push a value below the return address when
25172 we call the nested function directly, so we have to use an
25173 alternate entry point. For this we use ESI, and have the
25174 alternate entry point push ESI, so that things appear the
25175 same once we're executing the nested function. */
25176 if (incoming_p)
25177 {
25178 if (fndecl == current_function_decl)
25179 ix86_static_chain_on_stack = true;
25180 return gen_frame_mem (SImode,
25181 plus_constant (Pmode,
25182 arg_pointer_rtx, -8));
25183 }
25184 regno = SI_REG;
25185 }
25186 }
25187
25188 return gen_rtx_REG (Pmode, regno);
25189 }
25190
25191 /* Emit RTL insns to initialize the variable parts of a trampoline.
25192 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25193 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25194 to be passed to the target function. */
25195
25196 static void
25197 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25198 {
25199 rtx mem, fnaddr;
25200 int opcode;
25201 int offset = 0;
25202
25203 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25204
25205 if (TARGET_64BIT)
25206 {
25207 int size;
25208
25209 /* Load the function address into r11.  Try to load the address
25210    using the shorter movl instead of movabs.  We may want to support
25211    movq for kernel mode, but the kernel does not use trampolines at
25212    the moment.  FNADDR is a 32-bit address and may not be in
25213    DImode when ptr_mode == SImode; always use movl in that
25214    case.  */
25215 if (ptr_mode == SImode
25216 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25217 {
25218 fnaddr = copy_addr_to_reg (fnaddr);
25219
25220 mem = adjust_address (m_tramp, HImode, offset);
25221 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25222
25223 mem = adjust_address (m_tramp, SImode, offset + 2);
25224 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25225 offset += 6;
25226 }
25227 else
25228 {
25229 mem = adjust_address (m_tramp, HImode, offset);
25230 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25231
25232 mem = adjust_address (m_tramp, DImode, offset + 2);
25233 emit_move_insn (mem, fnaddr);
25234 offset += 10;
25235 }
25236
25237 /* Load static chain using movabs to r10. Use the shorter movl
25238 instead of movabs when ptr_mode == SImode. */
25239 if (ptr_mode == SImode)
25240 {
25241 opcode = 0xba41;
25242 size = 6;
25243 }
25244 else
25245 {
25246 opcode = 0xba49;
25247 size = 10;
25248 }
25249
25250 mem = adjust_address (m_tramp, HImode, offset);
25251 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25252
25253 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25254 emit_move_insn (mem, chain_value);
25255 offset += size;
25256
25257 /* Jump to r11; the last (unused) byte is a nop, only there to
25258 pad the write out to a single 32-bit store. */
25259 mem = adjust_address (m_tramp, SImode, offset);
25260 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25261 offset += 4;
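      /* For illustration only, a sketch of the bytes laid down above in
	 the common LP64 case (the ptr_mode == SImode / zero-extended paths
	 use the shorter "41 bb" / "41 ba" movl encodings instead):

	    49 bb <fnaddr, 8 bytes>    movabs $fnaddr, %r11
	    49 ba <chain,  8 bytes>    movabs $chain,  %r10
	    49 ff e3                   jmp    *%r11
	    90                         nop (pads the final 32-bit store)  */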
25262 }
25263 else
25264 {
25265 rtx disp, chain;
25266
25267 /* Depending on the static chain location, either load a register
25268 with a constant, or push the constant to the stack. All of the
25269 instructions are the same size. */
25270 chain = ix86_static_chain (fndecl, true);
25271 if (REG_P (chain))
25272 {
25273 switch (REGNO (chain))
25274 {
25275 case AX_REG:
25276 opcode = 0xb8; break;
25277 case CX_REG:
25278 opcode = 0xb9; break;
25279 default:
25280 gcc_unreachable ();
25281 }
25282 }
25283 else
25284 opcode = 0x68;
25285
25286 mem = adjust_address (m_tramp, QImode, offset);
25287 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25288
25289 mem = adjust_address (m_tramp, SImode, offset + 1);
25290 emit_move_insn (mem, chain_value);
25291 offset += 5;
25292
25293 mem = adjust_address (m_tramp, QImode, offset);
25294 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25295
25296 mem = adjust_address (m_tramp, SImode, offset + 1);
25297
25298 /* Compute the offset from the end of the jmp to the target function.
25299    When the trampoline stores the static chain on the stack, we need
25300    to skip the first insn, which pushes the (call-saved) register
25301    holding the static chain; this push is 1 byte.  */
25302 offset += 5;
25303 disp = expand_binop (SImode, sub_optab, fnaddr,
25304 plus_constant (Pmode, XEXP (m_tramp, 0),
25305 offset - (MEM_P (chain) ? 1 : 0)),
25306 NULL_RTX, 1, OPTAB_DIRECT);
25307 emit_move_insn (mem, disp);
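      /* For illustration only, a sketch of the 10 bytes written above
	 (mov form shown; when the static chain lives on the stack the
	 first byte is 0x68, i.e. a pushl of the chain value):

	    b9 <chain, 4 bytes>    movl $chain, %ecx   (or b8 for %eax)
	    e9 <disp,  4 bytes>    jmp  fnaddr         (rel32 from end of jmp)  */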
25308 }
25309
25310 gcc_assert (offset <= TRAMPOLINE_SIZE);
25311
25312 #ifdef HAVE_ENABLE_EXECUTE_STACK
25313 #ifdef CHECK_EXECUTE_STACK_ENABLED
25314 if (CHECK_EXECUTE_STACK_ENABLED)
25315 #endif
25316 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25317 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25318 #endif
25319 }
25320 \f
25321 /* The following file contains several enumerations and data structures
25322 built from the definitions in i386-builtin-types.def. */
25323
25324 #include "i386-builtin-types.inc"
25325
25326 /* Table for the ix86 builtin non-function types. */
25327 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25328
25329 /* Retrieve an element from the above table, building some of
25330 the types lazily. */
25331
25332 static tree
25333 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25334 {
25335 unsigned int index;
25336 tree type, itype;
25337
25338 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25339
25340 type = ix86_builtin_type_tab[(int) tcode];
25341 if (type != NULL)
25342 return type;
25343
25344 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25345 if (tcode <= IX86_BT_LAST_VECT)
25346 {
25347 enum machine_mode mode;
25348
25349 index = tcode - IX86_BT_LAST_PRIM - 1;
25350 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25351 mode = ix86_builtin_type_vect_mode[index];
25352
25353 type = build_vector_type_for_mode (itype, mode);
25354 }
25355 else
25356 {
25357 int quals;
25358
25359 index = tcode - IX86_BT_LAST_VECT - 1;
25360 if (tcode <= IX86_BT_LAST_PTR)
25361 quals = TYPE_UNQUALIFIED;
25362 else
25363 quals = TYPE_QUAL_CONST;
25364
25365 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25366 if (quals != TYPE_UNQUALIFIED)
25367 itype = build_qualified_type (itype, quals);
25368
25369 type = build_pointer_type (itype);
25370 }
25371
25372 ix86_builtin_type_tab[(int) tcode] = type;
25373 return type;
25374 }
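/* For example (a sketch; the enumerators themselves are generated from
   i386-builtin-types.def): the code for a V4SF vector lies in the
   (IX86_BT_LAST_PRIM, IX86_BT_LAST_VECT] range, so its element type
   (float) and mode (V4SFmode) are looked up in
   ix86_builtin_type_vect_base / ix86_builtin_type_vect_mode and the
   vector type is built on first use; later calls return the cached
   entry from ix86_builtin_type_tab.  */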
25375
25376 /* Table for the ix86 builtin function types. */
25377 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25378
25379 /* Retrieve an element from the above table, building some of
25380 the types lazily. */
25381
25382 static tree
25383 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25384 {
25385 tree type;
25386
25387 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25388
25389 type = ix86_builtin_func_type_tab[(int) tcode];
25390 if (type != NULL)
25391 return type;
25392
25393 if (tcode <= IX86_BT_LAST_FUNC)
25394 {
25395 unsigned start = ix86_builtin_func_start[(int) tcode];
25396 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25397 tree rtype, atype, args = void_list_node;
25398 unsigned i;
25399
25400 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25401 for (i = after - 1; i > start; --i)
25402 {
25403 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25404 args = tree_cons (NULL, atype, args);
25405 }
25406
25407 type = build_function_type (rtype, args);
25408 }
25409 else
25410 {
25411 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25412 enum ix86_builtin_func_type icode;
25413
25414 icode = ix86_builtin_func_alias_base[index];
25415 type = ix86_get_builtin_func_type (icode);
25416 }
25417
25418 ix86_builtin_func_type_tab[(int) tcode] = type;
25419 return type;
25420 }
25421
25422
25423 /* Codes for all the SSE/MMX builtins. */
25424 enum ix86_builtins
25425 {
25426 IX86_BUILTIN_ADDPS,
25427 IX86_BUILTIN_ADDSS,
25428 IX86_BUILTIN_DIVPS,
25429 IX86_BUILTIN_DIVSS,
25430 IX86_BUILTIN_MULPS,
25431 IX86_BUILTIN_MULSS,
25432 IX86_BUILTIN_SUBPS,
25433 IX86_BUILTIN_SUBSS,
25434
25435 IX86_BUILTIN_CMPEQPS,
25436 IX86_BUILTIN_CMPLTPS,
25437 IX86_BUILTIN_CMPLEPS,
25438 IX86_BUILTIN_CMPGTPS,
25439 IX86_BUILTIN_CMPGEPS,
25440 IX86_BUILTIN_CMPNEQPS,
25441 IX86_BUILTIN_CMPNLTPS,
25442 IX86_BUILTIN_CMPNLEPS,
25443 IX86_BUILTIN_CMPNGTPS,
25444 IX86_BUILTIN_CMPNGEPS,
25445 IX86_BUILTIN_CMPORDPS,
25446 IX86_BUILTIN_CMPUNORDPS,
25447 IX86_BUILTIN_CMPEQSS,
25448 IX86_BUILTIN_CMPLTSS,
25449 IX86_BUILTIN_CMPLESS,
25450 IX86_BUILTIN_CMPNEQSS,
25451 IX86_BUILTIN_CMPNLTSS,
25452 IX86_BUILTIN_CMPNLESS,
25453 IX86_BUILTIN_CMPNGTSS,
25454 IX86_BUILTIN_CMPNGESS,
25455 IX86_BUILTIN_CMPORDSS,
25456 IX86_BUILTIN_CMPUNORDSS,
25457
25458 IX86_BUILTIN_COMIEQSS,
25459 IX86_BUILTIN_COMILTSS,
25460 IX86_BUILTIN_COMILESS,
25461 IX86_BUILTIN_COMIGTSS,
25462 IX86_BUILTIN_COMIGESS,
25463 IX86_BUILTIN_COMINEQSS,
25464 IX86_BUILTIN_UCOMIEQSS,
25465 IX86_BUILTIN_UCOMILTSS,
25466 IX86_BUILTIN_UCOMILESS,
25467 IX86_BUILTIN_UCOMIGTSS,
25468 IX86_BUILTIN_UCOMIGESS,
25469 IX86_BUILTIN_UCOMINEQSS,
25470
25471 IX86_BUILTIN_CVTPI2PS,
25472 IX86_BUILTIN_CVTPS2PI,
25473 IX86_BUILTIN_CVTSI2SS,
25474 IX86_BUILTIN_CVTSI642SS,
25475 IX86_BUILTIN_CVTSS2SI,
25476 IX86_BUILTIN_CVTSS2SI64,
25477 IX86_BUILTIN_CVTTPS2PI,
25478 IX86_BUILTIN_CVTTSS2SI,
25479 IX86_BUILTIN_CVTTSS2SI64,
25480
25481 IX86_BUILTIN_MAXPS,
25482 IX86_BUILTIN_MAXSS,
25483 IX86_BUILTIN_MINPS,
25484 IX86_BUILTIN_MINSS,
25485
25486 IX86_BUILTIN_LOADUPS,
25487 IX86_BUILTIN_STOREUPS,
25488 IX86_BUILTIN_MOVSS,
25489
25490 IX86_BUILTIN_MOVHLPS,
25491 IX86_BUILTIN_MOVLHPS,
25492 IX86_BUILTIN_LOADHPS,
25493 IX86_BUILTIN_LOADLPS,
25494 IX86_BUILTIN_STOREHPS,
25495 IX86_BUILTIN_STORELPS,
25496
25497 IX86_BUILTIN_MASKMOVQ,
25498 IX86_BUILTIN_MOVMSKPS,
25499 IX86_BUILTIN_PMOVMSKB,
25500
25501 IX86_BUILTIN_MOVNTPS,
25502 IX86_BUILTIN_MOVNTQ,
25503
25504 IX86_BUILTIN_LOADDQU,
25505 IX86_BUILTIN_STOREDQU,
25506
25507 IX86_BUILTIN_PACKSSWB,
25508 IX86_BUILTIN_PACKSSDW,
25509 IX86_BUILTIN_PACKUSWB,
25510
25511 IX86_BUILTIN_PADDB,
25512 IX86_BUILTIN_PADDW,
25513 IX86_BUILTIN_PADDD,
25514 IX86_BUILTIN_PADDQ,
25515 IX86_BUILTIN_PADDSB,
25516 IX86_BUILTIN_PADDSW,
25517 IX86_BUILTIN_PADDUSB,
25518 IX86_BUILTIN_PADDUSW,
25519 IX86_BUILTIN_PSUBB,
25520 IX86_BUILTIN_PSUBW,
25521 IX86_BUILTIN_PSUBD,
25522 IX86_BUILTIN_PSUBQ,
25523 IX86_BUILTIN_PSUBSB,
25524 IX86_BUILTIN_PSUBSW,
25525 IX86_BUILTIN_PSUBUSB,
25526 IX86_BUILTIN_PSUBUSW,
25527
25528 IX86_BUILTIN_PAND,
25529 IX86_BUILTIN_PANDN,
25530 IX86_BUILTIN_POR,
25531 IX86_BUILTIN_PXOR,
25532
25533 IX86_BUILTIN_PAVGB,
25534 IX86_BUILTIN_PAVGW,
25535
25536 IX86_BUILTIN_PCMPEQB,
25537 IX86_BUILTIN_PCMPEQW,
25538 IX86_BUILTIN_PCMPEQD,
25539 IX86_BUILTIN_PCMPGTB,
25540 IX86_BUILTIN_PCMPGTW,
25541 IX86_BUILTIN_PCMPGTD,
25542
25543 IX86_BUILTIN_PMADDWD,
25544
25545 IX86_BUILTIN_PMAXSW,
25546 IX86_BUILTIN_PMAXUB,
25547 IX86_BUILTIN_PMINSW,
25548 IX86_BUILTIN_PMINUB,
25549
25550 IX86_BUILTIN_PMULHUW,
25551 IX86_BUILTIN_PMULHW,
25552 IX86_BUILTIN_PMULLW,
25553
25554 IX86_BUILTIN_PSADBW,
25555 IX86_BUILTIN_PSHUFW,
25556
25557 IX86_BUILTIN_PSLLW,
25558 IX86_BUILTIN_PSLLD,
25559 IX86_BUILTIN_PSLLQ,
25560 IX86_BUILTIN_PSRAW,
25561 IX86_BUILTIN_PSRAD,
25562 IX86_BUILTIN_PSRLW,
25563 IX86_BUILTIN_PSRLD,
25564 IX86_BUILTIN_PSRLQ,
25565 IX86_BUILTIN_PSLLWI,
25566 IX86_BUILTIN_PSLLDI,
25567 IX86_BUILTIN_PSLLQI,
25568 IX86_BUILTIN_PSRAWI,
25569 IX86_BUILTIN_PSRADI,
25570 IX86_BUILTIN_PSRLWI,
25571 IX86_BUILTIN_PSRLDI,
25572 IX86_BUILTIN_PSRLQI,
25573
25574 IX86_BUILTIN_PUNPCKHBW,
25575 IX86_BUILTIN_PUNPCKHWD,
25576 IX86_BUILTIN_PUNPCKHDQ,
25577 IX86_BUILTIN_PUNPCKLBW,
25578 IX86_BUILTIN_PUNPCKLWD,
25579 IX86_BUILTIN_PUNPCKLDQ,
25580
25581 IX86_BUILTIN_SHUFPS,
25582
25583 IX86_BUILTIN_RCPPS,
25584 IX86_BUILTIN_RCPSS,
25585 IX86_BUILTIN_RSQRTPS,
25586 IX86_BUILTIN_RSQRTPS_NR,
25587 IX86_BUILTIN_RSQRTSS,
25588 IX86_BUILTIN_RSQRTF,
25589 IX86_BUILTIN_SQRTPS,
25590 IX86_BUILTIN_SQRTPS_NR,
25591 IX86_BUILTIN_SQRTSS,
25592
25593 IX86_BUILTIN_UNPCKHPS,
25594 IX86_BUILTIN_UNPCKLPS,
25595
25596 IX86_BUILTIN_ANDPS,
25597 IX86_BUILTIN_ANDNPS,
25598 IX86_BUILTIN_ORPS,
25599 IX86_BUILTIN_XORPS,
25600
25601 IX86_BUILTIN_EMMS,
25602 IX86_BUILTIN_LDMXCSR,
25603 IX86_BUILTIN_STMXCSR,
25604 IX86_BUILTIN_SFENCE,
25605
25606 IX86_BUILTIN_FXSAVE,
25607 IX86_BUILTIN_FXRSTOR,
25608 IX86_BUILTIN_FXSAVE64,
25609 IX86_BUILTIN_FXRSTOR64,
25610
25611 IX86_BUILTIN_XSAVE,
25612 IX86_BUILTIN_XRSTOR,
25613 IX86_BUILTIN_XSAVE64,
25614 IX86_BUILTIN_XRSTOR64,
25615
25616 IX86_BUILTIN_XSAVEOPT,
25617 IX86_BUILTIN_XSAVEOPT64,
25618
25619 /* 3DNow! Original */
25620 IX86_BUILTIN_FEMMS,
25621 IX86_BUILTIN_PAVGUSB,
25622 IX86_BUILTIN_PF2ID,
25623 IX86_BUILTIN_PFACC,
25624 IX86_BUILTIN_PFADD,
25625 IX86_BUILTIN_PFCMPEQ,
25626 IX86_BUILTIN_PFCMPGE,
25627 IX86_BUILTIN_PFCMPGT,
25628 IX86_BUILTIN_PFMAX,
25629 IX86_BUILTIN_PFMIN,
25630 IX86_BUILTIN_PFMUL,
25631 IX86_BUILTIN_PFRCP,
25632 IX86_BUILTIN_PFRCPIT1,
25633 IX86_BUILTIN_PFRCPIT2,
25634 IX86_BUILTIN_PFRSQIT1,
25635 IX86_BUILTIN_PFRSQRT,
25636 IX86_BUILTIN_PFSUB,
25637 IX86_BUILTIN_PFSUBR,
25638 IX86_BUILTIN_PI2FD,
25639 IX86_BUILTIN_PMULHRW,
25640
25641 /* 3DNow! Athlon Extensions */
25642 IX86_BUILTIN_PF2IW,
25643 IX86_BUILTIN_PFNACC,
25644 IX86_BUILTIN_PFPNACC,
25645 IX86_BUILTIN_PI2FW,
25646 IX86_BUILTIN_PSWAPDSI,
25647 IX86_BUILTIN_PSWAPDSF,
25648
25649 /* SSE2 */
25650 IX86_BUILTIN_ADDPD,
25651 IX86_BUILTIN_ADDSD,
25652 IX86_BUILTIN_DIVPD,
25653 IX86_BUILTIN_DIVSD,
25654 IX86_BUILTIN_MULPD,
25655 IX86_BUILTIN_MULSD,
25656 IX86_BUILTIN_SUBPD,
25657 IX86_BUILTIN_SUBSD,
25658
25659 IX86_BUILTIN_CMPEQPD,
25660 IX86_BUILTIN_CMPLTPD,
25661 IX86_BUILTIN_CMPLEPD,
25662 IX86_BUILTIN_CMPGTPD,
25663 IX86_BUILTIN_CMPGEPD,
25664 IX86_BUILTIN_CMPNEQPD,
25665 IX86_BUILTIN_CMPNLTPD,
25666 IX86_BUILTIN_CMPNLEPD,
25667 IX86_BUILTIN_CMPNGTPD,
25668 IX86_BUILTIN_CMPNGEPD,
25669 IX86_BUILTIN_CMPORDPD,
25670 IX86_BUILTIN_CMPUNORDPD,
25671 IX86_BUILTIN_CMPEQSD,
25672 IX86_BUILTIN_CMPLTSD,
25673 IX86_BUILTIN_CMPLESD,
25674 IX86_BUILTIN_CMPNEQSD,
25675 IX86_BUILTIN_CMPNLTSD,
25676 IX86_BUILTIN_CMPNLESD,
25677 IX86_BUILTIN_CMPORDSD,
25678 IX86_BUILTIN_CMPUNORDSD,
25679
25680 IX86_BUILTIN_COMIEQSD,
25681 IX86_BUILTIN_COMILTSD,
25682 IX86_BUILTIN_COMILESD,
25683 IX86_BUILTIN_COMIGTSD,
25684 IX86_BUILTIN_COMIGESD,
25685 IX86_BUILTIN_COMINEQSD,
25686 IX86_BUILTIN_UCOMIEQSD,
25687 IX86_BUILTIN_UCOMILTSD,
25688 IX86_BUILTIN_UCOMILESD,
25689 IX86_BUILTIN_UCOMIGTSD,
25690 IX86_BUILTIN_UCOMIGESD,
25691 IX86_BUILTIN_UCOMINEQSD,
25692
25693 IX86_BUILTIN_MAXPD,
25694 IX86_BUILTIN_MAXSD,
25695 IX86_BUILTIN_MINPD,
25696 IX86_BUILTIN_MINSD,
25697
25698 IX86_BUILTIN_ANDPD,
25699 IX86_BUILTIN_ANDNPD,
25700 IX86_BUILTIN_ORPD,
25701 IX86_BUILTIN_XORPD,
25702
25703 IX86_BUILTIN_SQRTPD,
25704 IX86_BUILTIN_SQRTSD,
25705
25706 IX86_BUILTIN_UNPCKHPD,
25707 IX86_BUILTIN_UNPCKLPD,
25708
25709 IX86_BUILTIN_SHUFPD,
25710
25711 IX86_BUILTIN_LOADUPD,
25712 IX86_BUILTIN_STOREUPD,
25713 IX86_BUILTIN_MOVSD,
25714
25715 IX86_BUILTIN_LOADHPD,
25716 IX86_BUILTIN_LOADLPD,
25717
25718 IX86_BUILTIN_CVTDQ2PD,
25719 IX86_BUILTIN_CVTDQ2PS,
25720
25721 IX86_BUILTIN_CVTPD2DQ,
25722 IX86_BUILTIN_CVTPD2PI,
25723 IX86_BUILTIN_CVTPD2PS,
25724 IX86_BUILTIN_CVTTPD2DQ,
25725 IX86_BUILTIN_CVTTPD2PI,
25726
25727 IX86_BUILTIN_CVTPI2PD,
25728 IX86_BUILTIN_CVTSI2SD,
25729 IX86_BUILTIN_CVTSI642SD,
25730
25731 IX86_BUILTIN_CVTSD2SI,
25732 IX86_BUILTIN_CVTSD2SI64,
25733 IX86_BUILTIN_CVTSD2SS,
25734 IX86_BUILTIN_CVTSS2SD,
25735 IX86_BUILTIN_CVTTSD2SI,
25736 IX86_BUILTIN_CVTTSD2SI64,
25737
25738 IX86_BUILTIN_CVTPS2DQ,
25739 IX86_BUILTIN_CVTPS2PD,
25740 IX86_BUILTIN_CVTTPS2DQ,
25741
25742 IX86_BUILTIN_MOVNTI,
25743 IX86_BUILTIN_MOVNTI64,
25744 IX86_BUILTIN_MOVNTPD,
25745 IX86_BUILTIN_MOVNTDQ,
25746
25747 IX86_BUILTIN_MOVQ128,
25748
25749 /* SSE2 MMX */
25750 IX86_BUILTIN_MASKMOVDQU,
25751 IX86_BUILTIN_MOVMSKPD,
25752 IX86_BUILTIN_PMOVMSKB128,
25753
25754 IX86_BUILTIN_PACKSSWB128,
25755 IX86_BUILTIN_PACKSSDW128,
25756 IX86_BUILTIN_PACKUSWB128,
25757
25758 IX86_BUILTIN_PADDB128,
25759 IX86_BUILTIN_PADDW128,
25760 IX86_BUILTIN_PADDD128,
25761 IX86_BUILTIN_PADDQ128,
25762 IX86_BUILTIN_PADDSB128,
25763 IX86_BUILTIN_PADDSW128,
25764 IX86_BUILTIN_PADDUSB128,
25765 IX86_BUILTIN_PADDUSW128,
25766 IX86_BUILTIN_PSUBB128,
25767 IX86_BUILTIN_PSUBW128,
25768 IX86_BUILTIN_PSUBD128,
25769 IX86_BUILTIN_PSUBQ128,
25770 IX86_BUILTIN_PSUBSB128,
25771 IX86_BUILTIN_PSUBSW128,
25772 IX86_BUILTIN_PSUBUSB128,
25773 IX86_BUILTIN_PSUBUSW128,
25774
25775 IX86_BUILTIN_PAND128,
25776 IX86_BUILTIN_PANDN128,
25777 IX86_BUILTIN_POR128,
25778 IX86_BUILTIN_PXOR128,
25779
25780 IX86_BUILTIN_PAVGB128,
25781 IX86_BUILTIN_PAVGW128,
25782
25783 IX86_BUILTIN_PCMPEQB128,
25784 IX86_BUILTIN_PCMPEQW128,
25785 IX86_BUILTIN_PCMPEQD128,
25786 IX86_BUILTIN_PCMPGTB128,
25787 IX86_BUILTIN_PCMPGTW128,
25788 IX86_BUILTIN_PCMPGTD128,
25789
25790 IX86_BUILTIN_PMADDWD128,
25791
25792 IX86_BUILTIN_PMAXSW128,
25793 IX86_BUILTIN_PMAXUB128,
25794 IX86_BUILTIN_PMINSW128,
25795 IX86_BUILTIN_PMINUB128,
25796
25797 IX86_BUILTIN_PMULUDQ,
25798 IX86_BUILTIN_PMULUDQ128,
25799 IX86_BUILTIN_PMULHUW128,
25800 IX86_BUILTIN_PMULHW128,
25801 IX86_BUILTIN_PMULLW128,
25802
25803 IX86_BUILTIN_PSADBW128,
25804 IX86_BUILTIN_PSHUFHW,
25805 IX86_BUILTIN_PSHUFLW,
25806 IX86_BUILTIN_PSHUFD,
25807
25808 IX86_BUILTIN_PSLLDQI128,
25809 IX86_BUILTIN_PSLLWI128,
25810 IX86_BUILTIN_PSLLDI128,
25811 IX86_BUILTIN_PSLLQI128,
25812 IX86_BUILTIN_PSRAWI128,
25813 IX86_BUILTIN_PSRADI128,
25814 IX86_BUILTIN_PSRLDQI128,
25815 IX86_BUILTIN_PSRLWI128,
25816 IX86_BUILTIN_PSRLDI128,
25817 IX86_BUILTIN_PSRLQI128,
25818
25819 IX86_BUILTIN_PSLLDQ128,
25820 IX86_BUILTIN_PSLLW128,
25821 IX86_BUILTIN_PSLLD128,
25822 IX86_BUILTIN_PSLLQ128,
25823 IX86_BUILTIN_PSRAW128,
25824 IX86_BUILTIN_PSRAD128,
25825 IX86_BUILTIN_PSRLW128,
25826 IX86_BUILTIN_PSRLD128,
25827 IX86_BUILTIN_PSRLQ128,
25828
25829 IX86_BUILTIN_PUNPCKHBW128,
25830 IX86_BUILTIN_PUNPCKHWD128,
25831 IX86_BUILTIN_PUNPCKHDQ128,
25832 IX86_BUILTIN_PUNPCKHQDQ128,
25833 IX86_BUILTIN_PUNPCKLBW128,
25834 IX86_BUILTIN_PUNPCKLWD128,
25835 IX86_BUILTIN_PUNPCKLDQ128,
25836 IX86_BUILTIN_PUNPCKLQDQ128,
25837
25838 IX86_BUILTIN_CLFLUSH,
25839 IX86_BUILTIN_MFENCE,
25840 IX86_BUILTIN_LFENCE,
25841 IX86_BUILTIN_PAUSE,
25842
25843 IX86_BUILTIN_BSRSI,
25844 IX86_BUILTIN_BSRDI,
25845 IX86_BUILTIN_RDPMC,
25846 IX86_BUILTIN_RDTSC,
25847 IX86_BUILTIN_RDTSCP,
25848 IX86_BUILTIN_ROLQI,
25849 IX86_BUILTIN_ROLHI,
25850 IX86_BUILTIN_RORQI,
25851 IX86_BUILTIN_RORHI,
25852
25853 /* SSE3. */
25854 IX86_BUILTIN_ADDSUBPS,
25855 IX86_BUILTIN_HADDPS,
25856 IX86_BUILTIN_HSUBPS,
25857 IX86_BUILTIN_MOVSHDUP,
25858 IX86_BUILTIN_MOVSLDUP,
25859 IX86_BUILTIN_ADDSUBPD,
25860 IX86_BUILTIN_HADDPD,
25861 IX86_BUILTIN_HSUBPD,
25862 IX86_BUILTIN_LDDQU,
25863
25864 IX86_BUILTIN_MONITOR,
25865 IX86_BUILTIN_MWAIT,
25866
25867 /* SSSE3. */
25868 IX86_BUILTIN_PHADDW,
25869 IX86_BUILTIN_PHADDD,
25870 IX86_BUILTIN_PHADDSW,
25871 IX86_BUILTIN_PHSUBW,
25872 IX86_BUILTIN_PHSUBD,
25873 IX86_BUILTIN_PHSUBSW,
25874 IX86_BUILTIN_PMADDUBSW,
25875 IX86_BUILTIN_PMULHRSW,
25876 IX86_BUILTIN_PSHUFB,
25877 IX86_BUILTIN_PSIGNB,
25878 IX86_BUILTIN_PSIGNW,
25879 IX86_BUILTIN_PSIGND,
25880 IX86_BUILTIN_PALIGNR,
25881 IX86_BUILTIN_PABSB,
25882 IX86_BUILTIN_PABSW,
25883 IX86_BUILTIN_PABSD,
25884
25885 IX86_BUILTIN_PHADDW128,
25886 IX86_BUILTIN_PHADDD128,
25887 IX86_BUILTIN_PHADDSW128,
25888 IX86_BUILTIN_PHSUBW128,
25889 IX86_BUILTIN_PHSUBD128,
25890 IX86_BUILTIN_PHSUBSW128,
25891 IX86_BUILTIN_PMADDUBSW128,
25892 IX86_BUILTIN_PMULHRSW128,
25893 IX86_BUILTIN_PSHUFB128,
25894 IX86_BUILTIN_PSIGNB128,
25895 IX86_BUILTIN_PSIGNW128,
25896 IX86_BUILTIN_PSIGND128,
25897 IX86_BUILTIN_PALIGNR128,
25898 IX86_BUILTIN_PABSB128,
25899 IX86_BUILTIN_PABSW128,
25900 IX86_BUILTIN_PABSD128,
25901
25902 /* AMDFAM10 - SSE4A New Instructions. */
25903 IX86_BUILTIN_MOVNTSD,
25904 IX86_BUILTIN_MOVNTSS,
25905 IX86_BUILTIN_EXTRQI,
25906 IX86_BUILTIN_EXTRQ,
25907 IX86_BUILTIN_INSERTQI,
25908 IX86_BUILTIN_INSERTQ,
25909
25910 /* SSE4.1. */
25911 IX86_BUILTIN_BLENDPD,
25912 IX86_BUILTIN_BLENDPS,
25913 IX86_BUILTIN_BLENDVPD,
25914 IX86_BUILTIN_BLENDVPS,
25915 IX86_BUILTIN_PBLENDVB128,
25916 IX86_BUILTIN_PBLENDW128,
25917
25918 IX86_BUILTIN_DPPD,
25919 IX86_BUILTIN_DPPS,
25920
25921 IX86_BUILTIN_INSERTPS128,
25922
25923 IX86_BUILTIN_MOVNTDQA,
25924 IX86_BUILTIN_MPSADBW128,
25925 IX86_BUILTIN_PACKUSDW128,
25926 IX86_BUILTIN_PCMPEQQ,
25927 IX86_BUILTIN_PHMINPOSUW128,
25928
25929 IX86_BUILTIN_PMAXSB128,
25930 IX86_BUILTIN_PMAXSD128,
25931 IX86_BUILTIN_PMAXUD128,
25932 IX86_BUILTIN_PMAXUW128,
25933
25934 IX86_BUILTIN_PMINSB128,
25935 IX86_BUILTIN_PMINSD128,
25936 IX86_BUILTIN_PMINUD128,
25937 IX86_BUILTIN_PMINUW128,
25938
25939 IX86_BUILTIN_PMOVSXBW128,
25940 IX86_BUILTIN_PMOVSXBD128,
25941 IX86_BUILTIN_PMOVSXBQ128,
25942 IX86_BUILTIN_PMOVSXWD128,
25943 IX86_BUILTIN_PMOVSXWQ128,
25944 IX86_BUILTIN_PMOVSXDQ128,
25945
25946 IX86_BUILTIN_PMOVZXBW128,
25947 IX86_BUILTIN_PMOVZXBD128,
25948 IX86_BUILTIN_PMOVZXBQ128,
25949 IX86_BUILTIN_PMOVZXWD128,
25950 IX86_BUILTIN_PMOVZXWQ128,
25951 IX86_BUILTIN_PMOVZXDQ128,
25952
25953 IX86_BUILTIN_PMULDQ128,
25954 IX86_BUILTIN_PMULLD128,
25955
25956 IX86_BUILTIN_ROUNDSD,
25957 IX86_BUILTIN_ROUNDSS,
25958
25959 IX86_BUILTIN_ROUNDPD,
25960 IX86_BUILTIN_ROUNDPS,
25961
25962 IX86_BUILTIN_FLOORPD,
25963 IX86_BUILTIN_CEILPD,
25964 IX86_BUILTIN_TRUNCPD,
25965 IX86_BUILTIN_RINTPD,
25966 IX86_BUILTIN_ROUNDPD_AZ,
25967
25968 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25969 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25970 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25971
25972 IX86_BUILTIN_FLOORPS,
25973 IX86_BUILTIN_CEILPS,
25974 IX86_BUILTIN_TRUNCPS,
25975 IX86_BUILTIN_RINTPS,
25976 IX86_BUILTIN_ROUNDPS_AZ,
25977
25978 IX86_BUILTIN_FLOORPS_SFIX,
25979 IX86_BUILTIN_CEILPS_SFIX,
25980 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25981
25982 IX86_BUILTIN_PTESTZ,
25983 IX86_BUILTIN_PTESTC,
25984 IX86_BUILTIN_PTESTNZC,
25985
25986 IX86_BUILTIN_VEC_INIT_V2SI,
25987 IX86_BUILTIN_VEC_INIT_V4HI,
25988 IX86_BUILTIN_VEC_INIT_V8QI,
25989 IX86_BUILTIN_VEC_EXT_V2DF,
25990 IX86_BUILTIN_VEC_EXT_V2DI,
25991 IX86_BUILTIN_VEC_EXT_V4SF,
25992 IX86_BUILTIN_VEC_EXT_V4SI,
25993 IX86_BUILTIN_VEC_EXT_V8HI,
25994 IX86_BUILTIN_VEC_EXT_V2SI,
25995 IX86_BUILTIN_VEC_EXT_V4HI,
25996 IX86_BUILTIN_VEC_EXT_V16QI,
25997 IX86_BUILTIN_VEC_SET_V2DI,
25998 IX86_BUILTIN_VEC_SET_V4SF,
25999 IX86_BUILTIN_VEC_SET_V4SI,
26000 IX86_BUILTIN_VEC_SET_V8HI,
26001 IX86_BUILTIN_VEC_SET_V4HI,
26002 IX86_BUILTIN_VEC_SET_V16QI,
26003
26004 IX86_BUILTIN_VEC_PACK_SFIX,
26005 IX86_BUILTIN_VEC_PACK_SFIX256,
26006
26007 /* SSE4.2. */
26008 IX86_BUILTIN_CRC32QI,
26009 IX86_BUILTIN_CRC32HI,
26010 IX86_BUILTIN_CRC32SI,
26011 IX86_BUILTIN_CRC32DI,
26012
26013 IX86_BUILTIN_PCMPESTRI128,
26014 IX86_BUILTIN_PCMPESTRM128,
26015 IX86_BUILTIN_PCMPESTRA128,
26016 IX86_BUILTIN_PCMPESTRC128,
26017 IX86_BUILTIN_PCMPESTRO128,
26018 IX86_BUILTIN_PCMPESTRS128,
26019 IX86_BUILTIN_PCMPESTRZ128,
26020 IX86_BUILTIN_PCMPISTRI128,
26021 IX86_BUILTIN_PCMPISTRM128,
26022 IX86_BUILTIN_PCMPISTRA128,
26023 IX86_BUILTIN_PCMPISTRC128,
26024 IX86_BUILTIN_PCMPISTRO128,
26025 IX86_BUILTIN_PCMPISTRS128,
26026 IX86_BUILTIN_PCMPISTRZ128,
26027
26028 IX86_BUILTIN_PCMPGTQ,
26029
26030 /* AES instructions */
26031 IX86_BUILTIN_AESENC128,
26032 IX86_BUILTIN_AESENCLAST128,
26033 IX86_BUILTIN_AESDEC128,
26034 IX86_BUILTIN_AESDECLAST128,
26035 IX86_BUILTIN_AESIMC128,
26036 IX86_BUILTIN_AESKEYGENASSIST128,
26037
26038 /* PCLMUL instruction */
26039 IX86_BUILTIN_PCLMULQDQ128,
26040
26041 /* AVX */
26042 IX86_BUILTIN_ADDPD256,
26043 IX86_BUILTIN_ADDPS256,
26044 IX86_BUILTIN_ADDSUBPD256,
26045 IX86_BUILTIN_ADDSUBPS256,
26046 IX86_BUILTIN_ANDPD256,
26047 IX86_BUILTIN_ANDPS256,
26048 IX86_BUILTIN_ANDNPD256,
26049 IX86_BUILTIN_ANDNPS256,
26050 IX86_BUILTIN_BLENDPD256,
26051 IX86_BUILTIN_BLENDPS256,
26052 IX86_BUILTIN_BLENDVPD256,
26053 IX86_BUILTIN_BLENDVPS256,
26054 IX86_BUILTIN_DIVPD256,
26055 IX86_BUILTIN_DIVPS256,
26056 IX86_BUILTIN_DPPS256,
26057 IX86_BUILTIN_HADDPD256,
26058 IX86_BUILTIN_HADDPS256,
26059 IX86_BUILTIN_HSUBPD256,
26060 IX86_BUILTIN_HSUBPS256,
26061 IX86_BUILTIN_MAXPD256,
26062 IX86_BUILTIN_MAXPS256,
26063 IX86_BUILTIN_MINPD256,
26064 IX86_BUILTIN_MINPS256,
26065 IX86_BUILTIN_MULPD256,
26066 IX86_BUILTIN_MULPS256,
26067 IX86_BUILTIN_ORPD256,
26068 IX86_BUILTIN_ORPS256,
26069 IX86_BUILTIN_SHUFPD256,
26070 IX86_BUILTIN_SHUFPS256,
26071 IX86_BUILTIN_SUBPD256,
26072 IX86_BUILTIN_SUBPS256,
26073 IX86_BUILTIN_XORPD256,
26074 IX86_BUILTIN_XORPS256,
26075 IX86_BUILTIN_CMPSD,
26076 IX86_BUILTIN_CMPSS,
26077 IX86_BUILTIN_CMPPD,
26078 IX86_BUILTIN_CMPPS,
26079 IX86_BUILTIN_CMPPD256,
26080 IX86_BUILTIN_CMPPS256,
26081 IX86_BUILTIN_CVTDQ2PD256,
26082 IX86_BUILTIN_CVTDQ2PS256,
26083 IX86_BUILTIN_CVTPD2PS256,
26084 IX86_BUILTIN_CVTPS2DQ256,
26085 IX86_BUILTIN_CVTPS2PD256,
26086 IX86_BUILTIN_CVTTPD2DQ256,
26087 IX86_BUILTIN_CVTPD2DQ256,
26088 IX86_BUILTIN_CVTTPS2DQ256,
26089 IX86_BUILTIN_EXTRACTF128PD256,
26090 IX86_BUILTIN_EXTRACTF128PS256,
26091 IX86_BUILTIN_EXTRACTF128SI256,
26092 IX86_BUILTIN_VZEROALL,
26093 IX86_BUILTIN_VZEROUPPER,
26094 IX86_BUILTIN_VPERMILVARPD,
26095 IX86_BUILTIN_VPERMILVARPS,
26096 IX86_BUILTIN_VPERMILVARPD256,
26097 IX86_BUILTIN_VPERMILVARPS256,
26098 IX86_BUILTIN_VPERMILPD,
26099 IX86_BUILTIN_VPERMILPS,
26100 IX86_BUILTIN_VPERMILPD256,
26101 IX86_BUILTIN_VPERMILPS256,
26102 IX86_BUILTIN_VPERMIL2PD,
26103 IX86_BUILTIN_VPERMIL2PS,
26104 IX86_BUILTIN_VPERMIL2PD256,
26105 IX86_BUILTIN_VPERMIL2PS256,
26106 IX86_BUILTIN_VPERM2F128PD256,
26107 IX86_BUILTIN_VPERM2F128PS256,
26108 IX86_BUILTIN_VPERM2F128SI256,
26109 IX86_BUILTIN_VBROADCASTSS,
26110 IX86_BUILTIN_VBROADCASTSD256,
26111 IX86_BUILTIN_VBROADCASTSS256,
26112 IX86_BUILTIN_VBROADCASTPD256,
26113 IX86_BUILTIN_VBROADCASTPS256,
26114 IX86_BUILTIN_VINSERTF128PD256,
26115 IX86_BUILTIN_VINSERTF128PS256,
26116 IX86_BUILTIN_VINSERTF128SI256,
26117 IX86_BUILTIN_LOADUPD256,
26118 IX86_BUILTIN_LOADUPS256,
26119 IX86_BUILTIN_STOREUPD256,
26120 IX86_BUILTIN_STOREUPS256,
26121 IX86_BUILTIN_LDDQU256,
26122 IX86_BUILTIN_MOVNTDQ256,
26123 IX86_BUILTIN_MOVNTPD256,
26124 IX86_BUILTIN_MOVNTPS256,
26125 IX86_BUILTIN_LOADDQU256,
26126 IX86_BUILTIN_STOREDQU256,
26127 IX86_BUILTIN_MASKLOADPD,
26128 IX86_BUILTIN_MASKLOADPS,
26129 IX86_BUILTIN_MASKSTOREPD,
26130 IX86_BUILTIN_MASKSTOREPS,
26131 IX86_BUILTIN_MASKLOADPD256,
26132 IX86_BUILTIN_MASKLOADPS256,
26133 IX86_BUILTIN_MASKSTOREPD256,
26134 IX86_BUILTIN_MASKSTOREPS256,
26135 IX86_BUILTIN_MOVSHDUP256,
26136 IX86_BUILTIN_MOVSLDUP256,
26137 IX86_BUILTIN_MOVDDUP256,
26138
26139 IX86_BUILTIN_SQRTPD256,
26140 IX86_BUILTIN_SQRTPS256,
26141 IX86_BUILTIN_SQRTPS_NR256,
26142 IX86_BUILTIN_RSQRTPS256,
26143 IX86_BUILTIN_RSQRTPS_NR256,
26144
26145 IX86_BUILTIN_RCPPS256,
26146
26147 IX86_BUILTIN_ROUNDPD256,
26148 IX86_BUILTIN_ROUNDPS256,
26149
26150 IX86_BUILTIN_FLOORPD256,
26151 IX86_BUILTIN_CEILPD256,
26152 IX86_BUILTIN_TRUNCPD256,
26153 IX86_BUILTIN_RINTPD256,
26154 IX86_BUILTIN_ROUNDPD_AZ256,
26155
26156 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26157 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26158 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26159
26160 IX86_BUILTIN_FLOORPS256,
26161 IX86_BUILTIN_CEILPS256,
26162 IX86_BUILTIN_TRUNCPS256,
26163 IX86_BUILTIN_RINTPS256,
26164 IX86_BUILTIN_ROUNDPS_AZ256,
26165
26166 IX86_BUILTIN_FLOORPS_SFIX256,
26167 IX86_BUILTIN_CEILPS_SFIX256,
26168 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26169
26170 IX86_BUILTIN_UNPCKHPD256,
26171 IX86_BUILTIN_UNPCKLPD256,
26172 IX86_BUILTIN_UNPCKHPS256,
26173 IX86_BUILTIN_UNPCKLPS256,
26174
26175 IX86_BUILTIN_SI256_SI,
26176 IX86_BUILTIN_PS256_PS,
26177 IX86_BUILTIN_PD256_PD,
26178 IX86_BUILTIN_SI_SI256,
26179 IX86_BUILTIN_PS_PS256,
26180 IX86_BUILTIN_PD_PD256,
26181
26182 IX86_BUILTIN_VTESTZPD,
26183 IX86_BUILTIN_VTESTCPD,
26184 IX86_BUILTIN_VTESTNZCPD,
26185 IX86_BUILTIN_VTESTZPS,
26186 IX86_BUILTIN_VTESTCPS,
26187 IX86_BUILTIN_VTESTNZCPS,
26188 IX86_BUILTIN_VTESTZPD256,
26189 IX86_BUILTIN_VTESTCPD256,
26190 IX86_BUILTIN_VTESTNZCPD256,
26191 IX86_BUILTIN_VTESTZPS256,
26192 IX86_BUILTIN_VTESTCPS256,
26193 IX86_BUILTIN_VTESTNZCPS256,
26194 IX86_BUILTIN_PTESTZ256,
26195 IX86_BUILTIN_PTESTC256,
26196 IX86_BUILTIN_PTESTNZC256,
26197
26198 IX86_BUILTIN_MOVMSKPD256,
26199 IX86_BUILTIN_MOVMSKPS256,
26200
26201 /* AVX2 */
26202 IX86_BUILTIN_MPSADBW256,
26203 IX86_BUILTIN_PABSB256,
26204 IX86_BUILTIN_PABSW256,
26205 IX86_BUILTIN_PABSD256,
26206 IX86_BUILTIN_PACKSSDW256,
26207 IX86_BUILTIN_PACKSSWB256,
26208 IX86_BUILTIN_PACKUSDW256,
26209 IX86_BUILTIN_PACKUSWB256,
26210 IX86_BUILTIN_PADDB256,
26211 IX86_BUILTIN_PADDW256,
26212 IX86_BUILTIN_PADDD256,
26213 IX86_BUILTIN_PADDQ256,
26214 IX86_BUILTIN_PADDSB256,
26215 IX86_BUILTIN_PADDSW256,
26216 IX86_BUILTIN_PADDUSB256,
26217 IX86_BUILTIN_PADDUSW256,
26218 IX86_BUILTIN_PALIGNR256,
26219 IX86_BUILTIN_AND256I,
26220 IX86_BUILTIN_ANDNOT256I,
26221 IX86_BUILTIN_PAVGB256,
26222 IX86_BUILTIN_PAVGW256,
26223 IX86_BUILTIN_PBLENDVB256,
26224 IX86_BUILTIN_PBLENDVW256,
26225 IX86_BUILTIN_PCMPEQB256,
26226 IX86_BUILTIN_PCMPEQW256,
26227 IX86_BUILTIN_PCMPEQD256,
26228 IX86_BUILTIN_PCMPEQQ256,
26229 IX86_BUILTIN_PCMPGTB256,
26230 IX86_BUILTIN_PCMPGTW256,
26231 IX86_BUILTIN_PCMPGTD256,
26232 IX86_BUILTIN_PCMPGTQ256,
26233 IX86_BUILTIN_PHADDW256,
26234 IX86_BUILTIN_PHADDD256,
26235 IX86_BUILTIN_PHADDSW256,
26236 IX86_BUILTIN_PHSUBW256,
26237 IX86_BUILTIN_PHSUBD256,
26238 IX86_BUILTIN_PHSUBSW256,
26239 IX86_BUILTIN_PMADDUBSW256,
26240 IX86_BUILTIN_PMADDWD256,
26241 IX86_BUILTIN_PMAXSB256,
26242 IX86_BUILTIN_PMAXSW256,
26243 IX86_BUILTIN_PMAXSD256,
26244 IX86_BUILTIN_PMAXUB256,
26245 IX86_BUILTIN_PMAXUW256,
26246 IX86_BUILTIN_PMAXUD256,
26247 IX86_BUILTIN_PMINSB256,
26248 IX86_BUILTIN_PMINSW256,
26249 IX86_BUILTIN_PMINSD256,
26250 IX86_BUILTIN_PMINUB256,
26251 IX86_BUILTIN_PMINUW256,
26252 IX86_BUILTIN_PMINUD256,
26253 IX86_BUILTIN_PMOVMSKB256,
26254 IX86_BUILTIN_PMOVSXBW256,
26255 IX86_BUILTIN_PMOVSXBD256,
26256 IX86_BUILTIN_PMOVSXBQ256,
26257 IX86_BUILTIN_PMOVSXWD256,
26258 IX86_BUILTIN_PMOVSXWQ256,
26259 IX86_BUILTIN_PMOVSXDQ256,
26260 IX86_BUILTIN_PMOVZXBW256,
26261 IX86_BUILTIN_PMOVZXBD256,
26262 IX86_BUILTIN_PMOVZXBQ256,
26263 IX86_BUILTIN_PMOVZXWD256,
26264 IX86_BUILTIN_PMOVZXWQ256,
26265 IX86_BUILTIN_PMOVZXDQ256,
26266 IX86_BUILTIN_PMULDQ256,
26267 IX86_BUILTIN_PMULHRSW256,
26268 IX86_BUILTIN_PMULHUW256,
26269 IX86_BUILTIN_PMULHW256,
26270 IX86_BUILTIN_PMULLW256,
26271 IX86_BUILTIN_PMULLD256,
26272 IX86_BUILTIN_PMULUDQ256,
26273 IX86_BUILTIN_POR256,
26274 IX86_BUILTIN_PSADBW256,
26275 IX86_BUILTIN_PSHUFB256,
26276 IX86_BUILTIN_PSHUFD256,
26277 IX86_BUILTIN_PSHUFHW256,
26278 IX86_BUILTIN_PSHUFLW256,
26279 IX86_BUILTIN_PSIGNB256,
26280 IX86_BUILTIN_PSIGNW256,
26281 IX86_BUILTIN_PSIGND256,
26282 IX86_BUILTIN_PSLLDQI256,
26283 IX86_BUILTIN_PSLLWI256,
26284 IX86_BUILTIN_PSLLW256,
26285 IX86_BUILTIN_PSLLDI256,
26286 IX86_BUILTIN_PSLLD256,
26287 IX86_BUILTIN_PSLLQI256,
26288 IX86_BUILTIN_PSLLQ256,
26289 IX86_BUILTIN_PSRAWI256,
26290 IX86_BUILTIN_PSRAW256,
26291 IX86_BUILTIN_PSRADI256,
26292 IX86_BUILTIN_PSRAD256,
26293 IX86_BUILTIN_PSRLDQI256,
26294 IX86_BUILTIN_PSRLWI256,
26295 IX86_BUILTIN_PSRLW256,
26296 IX86_BUILTIN_PSRLDI256,
26297 IX86_BUILTIN_PSRLD256,
26298 IX86_BUILTIN_PSRLQI256,
26299 IX86_BUILTIN_PSRLQ256,
26300 IX86_BUILTIN_PSUBB256,
26301 IX86_BUILTIN_PSUBW256,
26302 IX86_BUILTIN_PSUBD256,
26303 IX86_BUILTIN_PSUBQ256,
26304 IX86_BUILTIN_PSUBSB256,
26305 IX86_BUILTIN_PSUBSW256,
26306 IX86_BUILTIN_PSUBUSB256,
26307 IX86_BUILTIN_PSUBUSW256,
26308 IX86_BUILTIN_PUNPCKHBW256,
26309 IX86_BUILTIN_PUNPCKHWD256,
26310 IX86_BUILTIN_PUNPCKHDQ256,
26311 IX86_BUILTIN_PUNPCKHQDQ256,
26312 IX86_BUILTIN_PUNPCKLBW256,
26313 IX86_BUILTIN_PUNPCKLWD256,
26314 IX86_BUILTIN_PUNPCKLDQ256,
26315 IX86_BUILTIN_PUNPCKLQDQ256,
26316 IX86_BUILTIN_PXOR256,
26317 IX86_BUILTIN_MOVNTDQA256,
26318 IX86_BUILTIN_VBROADCASTSS_PS,
26319 IX86_BUILTIN_VBROADCASTSS_PS256,
26320 IX86_BUILTIN_VBROADCASTSD_PD256,
26321 IX86_BUILTIN_VBROADCASTSI256,
26322 IX86_BUILTIN_PBLENDD256,
26323 IX86_BUILTIN_PBLENDD128,
26324 IX86_BUILTIN_PBROADCASTB256,
26325 IX86_BUILTIN_PBROADCASTW256,
26326 IX86_BUILTIN_PBROADCASTD256,
26327 IX86_BUILTIN_PBROADCASTQ256,
26328 IX86_BUILTIN_PBROADCASTB128,
26329 IX86_BUILTIN_PBROADCASTW128,
26330 IX86_BUILTIN_PBROADCASTD128,
26331 IX86_BUILTIN_PBROADCASTQ128,
26332 IX86_BUILTIN_VPERMVARSI256,
26333 IX86_BUILTIN_VPERMDF256,
26334 IX86_BUILTIN_VPERMVARSF256,
26335 IX86_BUILTIN_VPERMDI256,
26336 IX86_BUILTIN_VPERMTI256,
26337 IX86_BUILTIN_VEXTRACT128I256,
26338 IX86_BUILTIN_VINSERT128I256,
26339 IX86_BUILTIN_MASKLOADD,
26340 IX86_BUILTIN_MASKLOADQ,
26341 IX86_BUILTIN_MASKLOADD256,
26342 IX86_BUILTIN_MASKLOADQ256,
26343 IX86_BUILTIN_MASKSTORED,
26344 IX86_BUILTIN_MASKSTOREQ,
26345 IX86_BUILTIN_MASKSTORED256,
26346 IX86_BUILTIN_MASKSTOREQ256,
26347 IX86_BUILTIN_PSLLVV4DI,
26348 IX86_BUILTIN_PSLLVV2DI,
26349 IX86_BUILTIN_PSLLVV8SI,
26350 IX86_BUILTIN_PSLLVV4SI,
26351 IX86_BUILTIN_PSRAVV8SI,
26352 IX86_BUILTIN_PSRAVV4SI,
26353 IX86_BUILTIN_PSRLVV4DI,
26354 IX86_BUILTIN_PSRLVV2DI,
26355 IX86_BUILTIN_PSRLVV8SI,
26356 IX86_BUILTIN_PSRLVV4SI,
26357
26358 IX86_BUILTIN_GATHERSIV2DF,
26359 IX86_BUILTIN_GATHERSIV4DF,
26360 IX86_BUILTIN_GATHERDIV2DF,
26361 IX86_BUILTIN_GATHERDIV4DF,
26362 IX86_BUILTIN_GATHERSIV4SF,
26363 IX86_BUILTIN_GATHERSIV8SF,
26364 IX86_BUILTIN_GATHERDIV4SF,
26365 IX86_BUILTIN_GATHERDIV8SF,
26366 IX86_BUILTIN_GATHERSIV2DI,
26367 IX86_BUILTIN_GATHERSIV4DI,
26368 IX86_BUILTIN_GATHERDIV2DI,
26369 IX86_BUILTIN_GATHERDIV4DI,
26370 IX86_BUILTIN_GATHERSIV4SI,
26371 IX86_BUILTIN_GATHERSIV8SI,
26372 IX86_BUILTIN_GATHERDIV4SI,
26373 IX86_BUILTIN_GATHERDIV8SI,
26374
26375 /* Alternate 4-element gather for the vectorizer, where
26376    all operands are 32 bytes wide.  */
26377 IX86_BUILTIN_GATHERALTSIV4DF,
26378 IX86_BUILTIN_GATHERALTDIV8SF,
26379 IX86_BUILTIN_GATHERALTSIV4DI,
26380 IX86_BUILTIN_GATHERALTDIV8SI,
26381
26382 /* TFmode support builtins. */
26383 IX86_BUILTIN_INFQ,
26384 IX86_BUILTIN_HUGE_VALQ,
26385 IX86_BUILTIN_FABSQ,
26386 IX86_BUILTIN_COPYSIGNQ,
26387
26388 /* Vectorizer support builtins. */
26389 IX86_BUILTIN_CPYSGNPS,
26390 IX86_BUILTIN_CPYSGNPD,
26391 IX86_BUILTIN_CPYSGNPS256,
26392 IX86_BUILTIN_CPYSGNPD256,
26393
26394 /* FMA4 instructions. */
26395 IX86_BUILTIN_VFMADDSS,
26396 IX86_BUILTIN_VFMADDSD,
26397 IX86_BUILTIN_VFMADDPS,
26398 IX86_BUILTIN_VFMADDPD,
26399 IX86_BUILTIN_VFMADDPS256,
26400 IX86_BUILTIN_VFMADDPD256,
26401 IX86_BUILTIN_VFMADDSUBPS,
26402 IX86_BUILTIN_VFMADDSUBPD,
26403 IX86_BUILTIN_VFMADDSUBPS256,
26404 IX86_BUILTIN_VFMADDSUBPD256,
26405
26406 /* FMA3 instructions. */
26407 IX86_BUILTIN_VFMADDSS3,
26408 IX86_BUILTIN_VFMADDSD3,
26409
26410 /* XOP instructions. */
26411 IX86_BUILTIN_VPCMOV,
26412 IX86_BUILTIN_VPCMOV_V2DI,
26413 IX86_BUILTIN_VPCMOV_V4SI,
26414 IX86_BUILTIN_VPCMOV_V8HI,
26415 IX86_BUILTIN_VPCMOV_V16QI,
26416 IX86_BUILTIN_VPCMOV_V4SF,
26417 IX86_BUILTIN_VPCMOV_V2DF,
26418 IX86_BUILTIN_VPCMOV256,
26419 IX86_BUILTIN_VPCMOV_V4DI256,
26420 IX86_BUILTIN_VPCMOV_V8SI256,
26421 IX86_BUILTIN_VPCMOV_V16HI256,
26422 IX86_BUILTIN_VPCMOV_V32QI256,
26423 IX86_BUILTIN_VPCMOV_V8SF256,
26424 IX86_BUILTIN_VPCMOV_V4DF256,
26425
26426 IX86_BUILTIN_VPPERM,
26427
26428 IX86_BUILTIN_VPMACSSWW,
26429 IX86_BUILTIN_VPMACSWW,
26430 IX86_BUILTIN_VPMACSSWD,
26431 IX86_BUILTIN_VPMACSWD,
26432 IX86_BUILTIN_VPMACSSDD,
26433 IX86_BUILTIN_VPMACSDD,
26434 IX86_BUILTIN_VPMACSSDQL,
26435 IX86_BUILTIN_VPMACSSDQH,
26436 IX86_BUILTIN_VPMACSDQL,
26437 IX86_BUILTIN_VPMACSDQH,
26438 IX86_BUILTIN_VPMADCSSWD,
26439 IX86_BUILTIN_VPMADCSWD,
26440
26441 IX86_BUILTIN_VPHADDBW,
26442 IX86_BUILTIN_VPHADDBD,
26443 IX86_BUILTIN_VPHADDBQ,
26444 IX86_BUILTIN_VPHADDWD,
26445 IX86_BUILTIN_VPHADDWQ,
26446 IX86_BUILTIN_VPHADDDQ,
26447 IX86_BUILTIN_VPHADDUBW,
26448 IX86_BUILTIN_VPHADDUBD,
26449 IX86_BUILTIN_VPHADDUBQ,
26450 IX86_BUILTIN_VPHADDUWD,
26451 IX86_BUILTIN_VPHADDUWQ,
26452 IX86_BUILTIN_VPHADDUDQ,
26453 IX86_BUILTIN_VPHSUBBW,
26454 IX86_BUILTIN_VPHSUBWD,
26455 IX86_BUILTIN_VPHSUBDQ,
26456
26457 IX86_BUILTIN_VPROTB,
26458 IX86_BUILTIN_VPROTW,
26459 IX86_BUILTIN_VPROTD,
26460 IX86_BUILTIN_VPROTQ,
26461 IX86_BUILTIN_VPROTB_IMM,
26462 IX86_BUILTIN_VPROTW_IMM,
26463 IX86_BUILTIN_VPROTD_IMM,
26464 IX86_BUILTIN_VPROTQ_IMM,
26465
26466 IX86_BUILTIN_VPSHLB,
26467 IX86_BUILTIN_VPSHLW,
26468 IX86_BUILTIN_VPSHLD,
26469 IX86_BUILTIN_VPSHLQ,
26470 IX86_BUILTIN_VPSHAB,
26471 IX86_BUILTIN_VPSHAW,
26472 IX86_BUILTIN_VPSHAD,
26473 IX86_BUILTIN_VPSHAQ,
26474
26475 IX86_BUILTIN_VFRCZSS,
26476 IX86_BUILTIN_VFRCZSD,
26477 IX86_BUILTIN_VFRCZPS,
26478 IX86_BUILTIN_VFRCZPD,
26479 IX86_BUILTIN_VFRCZPS256,
26480 IX86_BUILTIN_VFRCZPD256,
26481
26482 IX86_BUILTIN_VPCOMEQUB,
26483 IX86_BUILTIN_VPCOMNEUB,
26484 IX86_BUILTIN_VPCOMLTUB,
26485 IX86_BUILTIN_VPCOMLEUB,
26486 IX86_BUILTIN_VPCOMGTUB,
26487 IX86_BUILTIN_VPCOMGEUB,
26488 IX86_BUILTIN_VPCOMFALSEUB,
26489 IX86_BUILTIN_VPCOMTRUEUB,
26490
26491 IX86_BUILTIN_VPCOMEQUW,
26492 IX86_BUILTIN_VPCOMNEUW,
26493 IX86_BUILTIN_VPCOMLTUW,
26494 IX86_BUILTIN_VPCOMLEUW,
26495 IX86_BUILTIN_VPCOMGTUW,
26496 IX86_BUILTIN_VPCOMGEUW,
26497 IX86_BUILTIN_VPCOMFALSEUW,
26498 IX86_BUILTIN_VPCOMTRUEUW,
26499
26500 IX86_BUILTIN_VPCOMEQUD,
26501 IX86_BUILTIN_VPCOMNEUD,
26502 IX86_BUILTIN_VPCOMLTUD,
26503 IX86_BUILTIN_VPCOMLEUD,
26504 IX86_BUILTIN_VPCOMGTUD,
26505 IX86_BUILTIN_VPCOMGEUD,
26506 IX86_BUILTIN_VPCOMFALSEUD,
26507 IX86_BUILTIN_VPCOMTRUEUD,
26508
26509 IX86_BUILTIN_VPCOMEQUQ,
26510 IX86_BUILTIN_VPCOMNEUQ,
26511 IX86_BUILTIN_VPCOMLTUQ,
26512 IX86_BUILTIN_VPCOMLEUQ,
26513 IX86_BUILTIN_VPCOMGTUQ,
26514 IX86_BUILTIN_VPCOMGEUQ,
26515 IX86_BUILTIN_VPCOMFALSEUQ,
26516 IX86_BUILTIN_VPCOMTRUEUQ,
26517
26518 IX86_BUILTIN_VPCOMEQB,
26519 IX86_BUILTIN_VPCOMNEB,
26520 IX86_BUILTIN_VPCOMLTB,
26521 IX86_BUILTIN_VPCOMLEB,
26522 IX86_BUILTIN_VPCOMGTB,
26523 IX86_BUILTIN_VPCOMGEB,
26524 IX86_BUILTIN_VPCOMFALSEB,
26525 IX86_BUILTIN_VPCOMTRUEB,
26526
26527 IX86_BUILTIN_VPCOMEQW,
26528 IX86_BUILTIN_VPCOMNEW,
26529 IX86_BUILTIN_VPCOMLTW,
26530 IX86_BUILTIN_VPCOMLEW,
26531 IX86_BUILTIN_VPCOMGTW,
26532 IX86_BUILTIN_VPCOMGEW,
26533 IX86_BUILTIN_VPCOMFALSEW,
26534 IX86_BUILTIN_VPCOMTRUEW,
26535
26536 IX86_BUILTIN_VPCOMEQD,
26537 IX86_BUILTIN_VPCOMNED,
26538 IX86_BUILTIN_VPCOMLTD,
26539 IX86_BUILTIN_VPCOMLED,
26540 IX86_BUILTIN_VPCOMGTD,
26541 IX86_BUILTIN_VPCOMGED,
26542 IX86_BUILTIN_VPCOMFALSED,
26543 IX86_BUILTIN_VPCOMTRUED,
26544
26545 IX86_BUILTIN_VPCOMEQQ,
26546 IX86_BUILTIN_VPCOMNEQ,
26547 IX86_BUILTIN_VPCOMLTQ,
26548 IX86_BUILTIN_VPCOMLEQ,
26549 IX86_BUILTIN_VPCOMGTQ,
26550 IX86_BUILTIN_VPCOMGEQ,
26551 IX86_BUILTIN_VPCOMFALSEQ,
26552 IX86_BUILTIN_VPCOMTRUEQ,
26553
26554 /* LWP instructions. */
26555 IX86_BUILTIN_LLWPCB,
26556 IX86_BUILTIN_SLWPCB,
26557 IX86_BUILTIN_LWPVAL32,
26558 IX86_BUILTIN_LWPVAL64,
26559 IX86_BUILTIN_LWPINS32,
26560 IX86_BUILTIN_LWPINS64,
26561
26562 IX86_BUILTIN_CLZS,
26563
26564 /* RTM */
26565 IX86_BUILTIN_XBEGIN,
26566 IX86_BUILTIN_XEND,
26567 IX86_BUILTIN_XABORT,
26568 IX86_BUILTIN_XTEST,
26569
26570 /* BMI instructions. */
26571 IX86_BUILTIN_BEXTR32,
26572 IX86_BUILTIN_BEXTR64,
26573 IX86_BUILTIN_CTZS,
26574
26575 /* TBM instructions. */
26576 IX86_BUILTIN_BEXTRI32,
26577 IX86_BUILTIN_BEXTRI64,
26578
26579 /* BMI2 instructions. */
26580 IX86_BUILTIN_BZHI32,
26581 IX86_BUILTIN_BZHI64,
26582 IX86_BUILTIN_PDEP32,
26583 IX86_BUILTIN_PDEP64,
26584 IX86_BUILTIN_PEXT32,
26585 IX86_BUILTIN_PEXT64,
26586
26587 /* ADX instructions. */
26588 IX86_BUILTIN_ADDCARRYX32,
26589 IX86_BUILTIN_ADDCARRYX64,
26590
26591 /* FSGSBASE instructions. */
26592 IX86_BUILTIN_RDFSBASE32,
26593 IX86_BUILTIN_RDFSBASE64,
26594 IX86_BUILTIN_RDGSBASE32,
26595 IX86_BUILTIN_RDGSBASE64,
26596 IX86_BUILTIN_WRFSBASE32,
26597 IX86_BUILTIN_WRFSBASE64,
26598 IX86_BUILTIN_WRGSBASE32,
26599 IX86_BUILTIN_WRGSBASE64,
26600
26601 /* RDRND instructions. */
26602 IX86_BUILTIN_RDRAND16_STEP,
26603 IX86_BUILTIN_RDRAND32_STEP,
26604 IX86_BUILTIN_RDRAND64_STEP,
26605
26606 /* RDSEED instructions. */
26607 IX86_BUILTIN_RDSEED16_STEP,
26608 IX86_BUILTIN_RDSEED32_STEP,
26609 IX86_BUILTIN_RDSEED64_STEP,
26610
26611 /* F16C instructions. */
26612 IX86_BUILTIN_CVTPH2PS,
26613 IX86_BUILTIN_CVTPH2PS256,
26614 IX86_BUILTIN_CVTPS2PH,
26615 IX86_BUILTIN_CVTPS2PH256,
26616
26617 /* CFString built-in for darwin */
26618 IX86_BUILTIN_CFSTRING,
26619
26620 /* Builtins to get CPU type and supported features. */
26621 IX86_BUILTIN_CPU_INIT,
26622 IX86_BUILTIN_CPU_IS,
26623 IX86_BUILTIN_CPU_SUPPORTS,
26624
26625 IX86_BUILTIN_MAX
26626 };
26627
26628 /* Table for the ix86 builtin decls. */
26629 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26630
26631 /* Table of all the builtin functions that are possible with different ISAs
26632    but are waiting to be built until a function is declared to use that
26633    ISA.  */
26634 struct builtin_isa {
26635 const char *name; /* function name */
26636 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26637 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26638 bool const_p; /* true if the declaration is constant */
26639 bool set_and_not_built_p;
26640 };
26641
26642 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26643
26644
26645 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
26646    of isa_flags to use in the ix86_builtins_isa array.  Store the
26647    function decl in the ix86_builtins array.  Return the function decl or
26648    NULL_TREE if the builtin was not added.
26649
26650    If the front end has a special hook for builtin functions, delay adding
26651    builtin functions that aren't in the current ISA until the ISA is changed
26652    with function-specific optimization.  Doing so can save about 300K for the
26653    default compiler.  When the builtin is expanded, check at that time whether
26654    it is valid.
26655
26656    If the front end doesn't have a special hook, record all builtins, even
26657    those not in the current ISA, in case the user uses function-specific
26658    options for a different ISA; this avoids scope errors when a builtin is
26659    added in the middle of a function scope.  */
26660
26661 static inline tree
26662 def_builtin (HOST_WIDE_INT mask, const char *name,
26663 enum ix86_builtin_func_type tcode,
26664 enum ix86_builtins code)
26665 {
26666 tree decl = NULL_TREE;
26667
26668 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26669 {
26670 ix86_builtins_isa[(int) code].isa = mask;
26671
26672 mask &= ~OPTION_MASK_ISA_64BIT;
26673 if (mask == 0
26674 || (mask & ix86_isa_flags) != 0
26675 || (lang_hooks.builtin_function
26676 == lang_hooks.builtin_function_ext_scope))
26677
26678 {
26679 tree type = ix86_get_builtin_func_type (tcode);
26680 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26681 NULL, NULL_TREE);
26682 ix86_builtins[(int) code] = decl;
26683 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26684 }
26685 else
26686 {
26687 ix86_builtins[(int) code] = NULL_TREE;
26688 ix86_builtins_isa[(int) code].tcode = tcode;
26689 ix86_builtins_isa[(int) code].name = name;
26690 ix86_builtins_isa[(int) code].const_p = false;
26691 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26692 }
26693 }
26694
26695 return decl;
26696 }
26697
26698 /* Like def_builtin, but also marks the function decl "const". */
26699
26700 static inline tree
26701 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26702 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26703 {
26704 tree decl = def_builtin (mask, name, tcode, code);
26705 if (decl)
26706 TREE_READONLY (decl) = 1;
26707 else
26708 ix86_builtins_isa[(int) code].const_p = true;
26709
26710 return decl;
26711 }
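/* For illustration, a registration would look roughly like

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   (in practice most builtins in this file are registered by looping over
   the bdesc_* tables below).  The decl is created immediately when the
   requested ISA is already enabled; otherwise it is only recorded in
   ix86_builtins_isa and materialized later by ix86_add_new_builtins.  */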
26712
26713 /* Add any new builtin functions for a given ISA that may not have been
26714    declared.  This saves a bit of space compared to adding all of the
26715    declarations to the tree up front, even when they are never used.  */
26716
26717 static void
26718 ix86_add_new_builtins (HOST_WIDE_INT isa)
26719 {
26720 int i;
26721
26722 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26723 {
26724 if ((ix86_builtins_isa[i].isa & isa) != 0
26725 && ix86_builtins_isa[i].set_and_not_built_p)
26726 {
26727 tree decl, type;
26728
26729 /* Don't define the builtin again. */
26730 ix86_builtins_isa[i].set_and_not_built_p = false;
26731
26732 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26733 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26734 type, i, BUILT_IN_MD, NULL,
26735 NULL_TREE);
26736
26737 ix86_builtins[i] = decl;
26738 if (ix86_builtins_isa[i].const_p)
26739 TREE_READONLY (decl) = 1;
26740 }
26741 }
26742 }
26743
26744 /* Bits for builtin_description.flag. */
26745
26746 /* Set when we don't support the comparison natively, and should
26747 swap_comparison in order to support it. */
26748 #define BUILTIN_DESC_SWAP_OPERANDS 1
26749
26750 struct builtin_description
26751 {
26752 const HOST_WIDE_INT mask;
26753 const enum insn_code icode;
26754 const char *const name;
26755 const enum ix86_builtins code;
26756 const enum rtx_code comparison;
26757 const int flag;
26758 };
26759
26760 static const struct builtin_description bdesc_comi[] =
26761 {
26762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26767 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26768 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26769 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26770 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26773 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26774 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26775 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26778 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26786 };
26787
26788 static const struct builtin_description bdesc_pcmpestr[] =
26789 {
26790 /* SSE4.2 */
26791 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26792 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26793 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26794 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26795 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26796 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26798 };
26799
26800 static const struct builtin_description bdesc_pcmpistr[] =
26801 {
26802 /* SSE4.2 */
26803 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26804 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26805 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26806 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26807 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26808 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26810 };
26811
26812 /* Special builtins with a variable number of arguments.  */
26813 static const struct builtin_description bdesc_special_args[] =
26814 {
26815 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26816 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26817 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26818
26819 /* MMX */
26820 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26821
26822 /* 3DNow! */
26823 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26824
26825 /* FXSR, XSAVE and XSAVEOPT */
26826 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26827 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26828 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26829 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26830 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26831
26832 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26833 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26834 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26835 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26836 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26837
26838 /* SSE */
26839 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26840 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26841 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26842
26843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26844 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26847
26848 /* SSE or 3DNow!A */
26849 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26850 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26851
26852 /* SSE2 */
26853 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26854 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26860 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26862 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26863
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26866
26867 /* SSE3 */
26868 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26869
26870 /* SSE4.1 */
26871 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26872
26873 /* SSE4A */
26874 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26875 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26876
26877 /* AVX */
26878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26879 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26880
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26882 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26883 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26886
26887 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26888 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26894
26895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26896 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26898
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26900 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
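/* Illustrative sketch only, nothing in this file depends on it: the AVX
   maskload/maskstore rows above are the hooks the usual avxintrin.h
   wrappers reach for (wrapper names assumed here for illustration), e.g.

       __m256d v = _mm256_maskload_pd (p, mask);    -- __builtin_ia32_maskloadpd256
       _mm256_maskstore_pd (p, mask, v);            -- __builtin_ia32_maskstorepd256

   built with -mavx; only lanes whose mask element has its top bit set
   participate in the memory access.  */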
26907
26908 /* AVX2 */
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26918
26919 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26920 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26921 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26922 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26923 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26924 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26925
26926 /* FSGSBASE */
26927 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26928 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26929 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26930 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26931 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26932 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26933 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26934 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26935
26936 /* RTM */
26937 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26938 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26939 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26940 };
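/* A reading of the table just closed (illustration only, not used below):
   each row pairs an ISA mask, an insn pattern and a prototype code with
   one GNU builtin, so the builtin becomes callable exactly when the
   matching -m option is in effect.  The MOVNTDQA row, for instance, is
   what a user-level snippet like

       #include <smmintrin.h>
       __m128i load_nt (__m128i *p)
       {
         return _mm_stream_load_si128 (p);      -- __builtin_ia32_movntdqa
       }

   compiled with -msse4.1 ultimately lands on; the wrapper name is the
   usual smmintrin.h one and is assumed here only for illustration.  */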
26941
26942 /* Builtins with variable number of arguments. */
26943 static const struct builtin_description bdesc_args[] =
26944 {
26945 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26946 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26947 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26948 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26949 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26950 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26951 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26952
26953 /* MMX */
26954 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26959 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26960
26961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26962 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26966 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26969
26970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26972
26973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26977
26978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26984
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26991
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26995
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26997
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27004
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27011
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
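27016
27016 /* A note on the two shift spellings above (an assumed reading of the
27016    prototype codes, for illustration only): the *_SI_COUNT rows take the
27016    shift count as a plain integer, e.g. __builtin_ia32_psllwi (v, 3),
27016    while the *_V4HI_COUNT / *_V2SI_COUNT / *_V1DI_COUNT rows take it in
27016    an MMX register operand, e.g. __builtin_ia32_psllw (v, c); both
27016    spellings point at the same ashl/lshr/ashr patterns.  */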
27016
27017 /* 3DNow! */
27018 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27019 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27020 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27021 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27022
27023 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27024 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27028 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27031 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27032 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27035 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27036 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27037 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27038
27039 /* 3DNow!A */
27040 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27041 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27042 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27043 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27044 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27045 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27046
27047 /* SSE */
27048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27049 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27050 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27052 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27053 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27056 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27058 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27059 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27060
27061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27062
27063 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27064 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27065 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27071
27072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27077 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27094
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27099
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27106
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27115 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27116
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27118
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27122
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27125
27126 /* SSE MMX or 3DNow!A */
27127 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27128 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27129 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27130
27131 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27132 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27134 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27135
27136 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27137 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27138
27139 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27140
27141 /* SSE2 */
27142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27143
27144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27148 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27149
27150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27155
27156 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27157
27158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27160 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27161 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27162
27163 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27166
27167 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27168 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27169 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27170 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27175
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27196
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27201
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27208
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27212
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27214
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27223
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27232
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27235
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27240
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27243
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27250
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27255
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27264
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27268
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27271
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27274
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27276
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27278 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27281
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27289
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27297
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27302
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27306
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27308
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27310
27311 /* SSE2 MMX */
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27314
27315 /* SSE3 */
27316 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27317 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27318
27319 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27320 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27321 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27322 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27323 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27324 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27325
27326 /* SSSE3 */
27327 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27328 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27330 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27333
27334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27339 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27355 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27356 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27358
27359 /* SSSE3 */
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27362
27363 /* SSE4.1 */
27364 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27365 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27374
27375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27376 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27388
27389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27394 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27401
27402 /* SSE4.1 */
27403 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27404 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27406 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27407
27408 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27409 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27412
27413 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27414 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27415
27416 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27417 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27418
27419 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27420 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27422 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27423
27424 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27425 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27426
27427 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27428 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27429
27430 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27431 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27432 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27433
27434 /* SSE4.2 */
27435 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27436 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27437 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27438 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27439 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27440
27441 /* SSE4A */
27442 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27443 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27444 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27445 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27446
27447 /* AES */
27448 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27449 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27450
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27452 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27453 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27454 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27455
27456 /* PCLMUL */
27457 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27458
27459 /* AVX */
27460 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27461 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27464 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27465 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27468 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27474 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27479 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27482 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27485 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27486
27487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27488 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27490 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27491
27492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27526
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27530
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27536
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27538
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27541
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27546
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27549
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27552
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27557
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27560
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27563
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27568
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27575
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27591
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27594
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27597
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27599
27600 /* AVX2 */
27601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27602 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27603 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27604 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27609 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27610 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27611 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27612 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27615 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27747
27748 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27749
27750 /* BMI */
27751 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27752 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27753 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27754
27755 /* TBM */
27756 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27757 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27758
27759 /* F16C */
27760 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27761 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27762 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27763 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27764
27765 /* BMI2 */
27766 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27767 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27768 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27769 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27770 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27771 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27772 };
27773
27774 /* FMA4 and XOP. */
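/* The MULTI_ARG_* names below are local shorthands for the V*_FTYPE_*
   function type codes used by the bdesc_multi_arg table that follows.  */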
27775 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27776 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27777 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27778 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27779 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27780 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27781 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27782 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27783 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27784 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27785 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27786 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27787 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27788 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27789 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27790 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27791 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27792 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27793 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27794 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27795 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27796 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27797 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27798 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27799 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27800 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27801 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27802 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27803 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27804 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27805 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27806 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27807 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27808 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27809 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27810 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27811 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27812 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27813 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27814 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27815 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27816 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27817 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27818 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27819 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27820 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27821 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27822 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27823 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27824 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27825 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27826 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27827
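/* Each entry gives the required ISA mask, the insn code, the builtin's
   name, its IX86_BUILTIN_* code, an rtx comparison code (UNKNOWN when
   unused) and the MULTI_ARG_* signature of the builtin.  */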
27828 static const struct builtin_description bdesc_multi_arg[] =
27829 {
27830 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27831 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27832 UNKNOWN, (int)MULTI_ARG_3_SF },
27833 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27834 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27835 UNKNOWN, (int)MULTI_ARG_3_DF },
27836
27837 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27838 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27839 UNKNOWN, (int)MULTI_ARG_3_SF },
27840 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27841 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27842 UNKNOWN, (int)MULTI_ARG_3_DF },
27843
27844 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27845 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27846 UNKNOWN, (int)MULTI_ARG_3_SF },
27847 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27848 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27849 UNKNOWN, (int)MULTI_ARG_3_DF },
27850 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27851 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27852 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27853 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27854 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27855 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27856
27857 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27858 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27859 UNKNOWN, (int)MULTI_ARG_3_SF },
27860 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27861 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27862 UNKNOWN, (int)MULTI_ARG_3_DF },
27863 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27864 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27865 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27866 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27867 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27868 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27869
27870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27877
27878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27885
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27887
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27900
27901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27917
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27924
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27940
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27948
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27956
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27964
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27972
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27980
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27988
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27996
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28004
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28013
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28022
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28027
28028 };
28029 \f
28030 /* TM vector builtins. */
28031
28032 /* Reuse the existing x86-specific `struct builtin_description' because
28033 we're lazy. Add casts to make them fit. */
28034 static const struct builtin_description bdesc_tm[] =
28035 {
28036 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28037 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28038 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28039 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28040 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28041 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28042 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28043
28044 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28045 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28046 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28047 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28048 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28049 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28050 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28051
28052 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28053 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28059
28060 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28061 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28062 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28063 };
28064
28065 /* TM callbacks. */
28066
28067 /* Return the builtin decl needed to load a vector of TYPE. */
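/* For example, a 128-bit vector type (e.g. V4SF) maps to the decl for
   BUILT_IN_TM_LOAD_M128; non-vector types and unhandled sizes yield
   NULL_TREE.  */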
28068
28069 static tree
28070 ix86_builtin_tm_load (tree type)
28071 {
28072 if (TREE_CODE (type) == VECTOR_TYPE)
28073 {
28074 switch (tree_low_cst (TYPE_SIZE (type), 1))
28075 {
28076 case 64:
28077 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28078 case 128:
28079 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28080 case 256:
28081 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28082 }
28083 }
28084 return NULL_TREE;
28085 }
28086
28087 /* Return the builtin decl needed to store a vector of TYPE. */
28088
28089 static tree
28090 ix86_builtin_tm_store (tree type)
28091 {
28092 if (TREE_CODE (type) == VECTOR_TYPE)
28093 {
28094 switch (tree_low_cst (TYPE_SIZE (type), 1))
28095 {
28096 case 64:
28097 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28098 case 128:
28099 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28100 case 256:
28101 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28102 }
28103 }
28104 return NULL_TREE;
28105 }
28106 \f
28107 /* Initialize the transactional memory vector load/store builtins. */
28108
28109 static void
28110 ix86_init_tm_builtins (void)
28111 {
28112 enum ix86_builtin_func_type ftype;
28113 const struct builtin_description *d;
28114 size_t i;
28115 tree decl;
28116 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28117 tree attrs_log, attrs_type_log;
28118
28119 if (!flag_tm)
28120 return;
28121
28122 /* If there are no builtins defined, we must be compiling in a
28123 language without trans-mem support. */
28124 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28125 return;
28126
28127 /* Use whatever attributes a normal TM load has. */
28128 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28129 attrs_load = DECL_ATTRIBUTES (decl);
28130 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28131 /* Use whatever attributes a normal TM store has. */
28132 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28133 attrs_store = DECL_ATTRIBUTES (decl);
28134 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28135 /* Use whatever attributes a normal TM log has. */
28136 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28137 attrs_log = DECL_ATTRIBUTES (decl);
28138 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28139
28140 for (i = 0, d = bdesc_tm;
28141 i < ARRAY_SIZE (bdesc_tm);
28142 i++, d++)
28143 {
28144 if ((d->mask & ix86_isa_flags) != 0
28145 || (lang_hooks.builtin_function
28146 == lang_hooks.builtin_function_ext_scope))
28147 {
28148 tree type, attrs, attrs_type;
28149 enum built_in_function code = (enum built_in_function) d->code;
28150
28151 ftype = (enum ix86_builtin_func_type) d->flag;
28152 type = ix86_get_builtin_func_type (ftype);
28153
28154 if (BUILTIN_TM_LOAD_P (code))
28155 {
28156 attrs = attrs_load;
28157 attrs_type = attrs_type_load;
28158 }
28159 else if (BUILTIN_TM_STORE_P (code))
28160 {
28161 attrs = attrs_store;
28162 attrs_type = attrs_type_store;
28163 }
28164 else
28165 {
28166 attrs = attrs_log;
28167 attrs_type = attrs_type_log;
28168 }
28169 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28170 /* The builtin without the prefix for
28171 calling it directly. */
28172 d->name + strlen ("__builtin_"),
28173 attrs);
28174 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28175 set the TYPE_ATTRIBUTES. */
28176 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28177
28178 set_builtin_decl (code, decl, false);
28179 }
28180 }
28181 }
28182
28183 /* Set up all the MMX/SSE builtins, including builtins for instructions that
28184    are not in the current target ISA, so that the user can compile particular
28185    modules with target-specific options that differ from the command-line
28186    options. */
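/* Illustrative sketch (not part of this file's logic; the function name is
   hypothetical and <immintrin.h> is assumed): because the builtins are
   registered unconditionally, a unit compiled with only -msse2 can still do

     __attribute__ ((target ("aes")))
     __m128i do_aesenc (__m128i a, __m128i k)
     {
       return (__m128i) __builtin_ia32_aesenc128 ((__v2di) a, (__v2di) k);
     }

   since the ISA requirement is checked later, when the builtin call is
   expanded.  */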
28187 static void
28188 ix86_init_mmx_sse_builtins (void)
28189 {
28190 const struct builtin_description * d;
28191 enum ix86_builtin_func_type ftype;
28192 size_t i;
28193
28194 /* Add all special builtins with variable number of operands. */
28195 for (i = 0, d = bdesc_special_args;
28196 i < ARRAY_SIZE (bdesc_special_args);
28197 i++, d++)
28198 {
28199 if (d->name == 0)
28200 continue;
28201
28202 ftype = (enum ix86_builtin_func_type) d->flag;
28203 def_builtin (d->mask, d->name, ftype, d->code);
28204 }
28205
28206 /* Add all builtins with variable number of operands. */
28207 for (i = 0, d = bdesc_args;
28208 i < ARRAY_SIZE (bdesc_args);
28209 i++, d++)
28210 {
28211 if (d->name == 0)
28212 continue;
28213
28214 ftype = (enum ix86_builtin_func_type) d->flag;
28215 def_builtin_const (d->mask, d->name, ftype, d->code);
28216 }
28217
28218 /* pcmpestr[im] insns. */
28219 for (i = 0, d = bdesc_pcmpestr;
28220 i < ARRAY_SIZE (bdesc_pcmpestr);
28221 i++, d++)
28222 {
28223 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28224 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28225 else
28226 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28227 def_builtin_const (d->mask, d->name, ftype, d->code);
28228 }
28229
28230 /* pcmpistr[im] insns. */
28231 for (i = 0, d = bdesc_pcmpistr;
28232 i < ARRAY_SIZE (bdesc_pcmpistr);
28233 i++, d++)
28234 {
28235 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28236 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28237 else
28238 ftype = INT_FTYPE_V16QI_V16QI_INT;
28239 def_builtin_const (d->mask, d->name, ftype, d->code);
28240 }
28241
28242 /* comi/ucomi insns. */
28243 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28244 {
28245 if (d->mask == OPTION_MASK_ISA_SSE2)
28246 ftype = INT_FTYPE_V2DF_V2DF;
28247 else
28248 ftype = INT_FTYPE_V4SF_V4SF;
28249 def_builtin_const (d->mask, d->name, ftype, d->code);
28250 }
28251
28252 /* SSE */
28253 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28254 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28255 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28256 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28257
28258 /* SSE or 3DNow!A */
28259 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28260 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28261 IX86_BUILTIN_MASKMOVQ);
28262
28263 /* SSE2 */
28264 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28265 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28266
28267 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28268 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28269 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28270 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28271
28272 /* SSE3. */
28273 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28274 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28275 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28276 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28277
28278 /* AES */
28279 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28280 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28281 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28282 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28283 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28284 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28285 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28286 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28287 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28288 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28289 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28290 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28291
28292 /* PCLMUL */
28293 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28294 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28295
28296 /* RDRND */
28297 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28298 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28299 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28300 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28301 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28302 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28303 IX86_BUILTIN_RDRAND64_STEP);
28304
28305 /* AVX2 */
28306 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28307 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28308 IX86_BUILTIN_GATHERSIV2DF);
28309
28310 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28311 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28312 IX86_BUILTIN_GATHERSIV4DF);
28313
28314 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28315 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28316 IX86_BUILTIN_GATHERDIV2DF);
28317
28318 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28319 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28320 IX86_BUILTIN_GATHERDIV4DF);
28321
28322 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28323 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28324 IX86_BUILTIN_GATHERSIV4SF);
28325
28326 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28327 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28328 IX86_BUILTIN_GATHERSIV8SF);
28329
28330 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28331 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28332 IX86_BUILTIN_GATHERDIV4SF);
28333
28334 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28335 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28336 IX86_BUILTIN_GATHERDIV8SF);
28337
28338 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28339 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28340 IX86_BUILTIN_GATHERSIV2DI);
28341
28342 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28343 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28344 IX86_BUILTIN_GATHERSIV4DI);
28345
28346 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28347 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28348 IX86_BUILTIN_GATHERDIV2DI);
28349
28350 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28351 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28352 IX86_BUILTIN_GATHERDIV4DI);
28353
28354 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28355 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28356 IX86_BUILTIN_GATHERSIV4SI);
28357
28358 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28359 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28360 IX86_BUILTIN_GATHERSIV8SI);
28361
28362 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28363 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28364 IX86_BUILTIN_GATHERDIV4SI);
28365
28366 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28367 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28368 IX86_BUILTIN_GATHERDIV8SI);
28369
28370 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28371 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28372 IX86_BUILTIN_GATHERALTSIV4DF);
28373
28374 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28375 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28376 IX86_BUILTIN_GATHERALTDIV8SF);
28377
28378 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28379 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28380 IX86_BUILTIN_GATHERALTSIV4DI);
28381
28382 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28383 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28384 IX86_BUILTIN_GATHERALTDIV8SI);
28385
28386 /* RTM. */
28387 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28388 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28389
28390 /* MMX access to the vec_init patterns. */
28391 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28392 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28393
28394 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28395 V4HI_FTYPE_HI_HI_HI_HI,
28396 IX86_BUILTIN_VEC_INIT_V4HI);
28397
28398 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28399 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28400 IX86_BUILTIN_VEC_INIT_V8QI);
28401
28402 /* Access to the vec_extract patterns. */
28403 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28404 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28405 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28406 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28407 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28408 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28409 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28410 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28411 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28412 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28413
28414 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28415 "__builtin_ia32_vec_ext_v4hi",
28416 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28417
28418 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28419 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28420
28421 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28422 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28423
28424 /* Access to the vec_set patterns. */
28425 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28426 "__builtin_ia32_vec_set_v2di",
28427 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28428
28429 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28430 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28431
28432 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28433 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28434
28435 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28436 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28437
28438 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28439 "__builtin_ia32_vec_set_v4hi",
28440 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28441
28442 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28443 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28444
28445 /* RDSEED */
28446 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28447 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28448 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28449 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28450 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28451 "__builtin_ia32_rdseed_di_step",
28452 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28453
28454 /* ADCX */
28455 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28456 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28457 def_builtin (OPTION_MASK_ISA_64BIT,
28458 "__builtin_ia32_addcarryx_u64",
28459 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28460 IX86_BUILTIN_ADDCARRYX64);
28461
28462 /* Add all multi-argument (FMA4/FMA/XOP) builtins. */
28463 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28464 {
28465 if (d->name == 0)
28466 continue;
28467
28468 ftype = (enum ix86_builtin_func_type) d->flag;
28469 def_builtin_const (d->mask, d->name, ftype, d->code);
28470 }
28471 }
28472
28473 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28474 to return a pointer to VERSION_DECL if the outcome of the expression
28475 formed by PREDICATE_CHAIN is true. This function will be called during
28476 version dispatch to decide which function version to execute. It returns
28477 the basic block at the end, to which more conditions can be added. */
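/* Rough sketch of the code built in NEW_BB for a two-element
   PREDICATE_CHAIN { P1 (A1), P2 (A2) }; the temporaries are named only
   for illustration:

     t1 = P1 (A1);
     t2 = P2 (A2);
     t1 = MIN_EXPR <t2, t1>;          /- zero iff any predicate failed -/
     if (t1 > 0)
       return (void *) &VERSION_DECL;
     /- otherwise fall through to the next version's condition -/       */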
28478
28479 static basic_block
28480 add_condition_to_bb (tree function_decl, tree version_decl,
28481 tree predicate_chain, basic_block new_bb)
28482 {
28483 gimple return_stmt;
28484 tree convert_expr, result_var;
28485 gimple convert_stmt;
28486 gimple call_cond_stmt;
28487 gimple if_else_stmt;
28488
28489 basic_block bb1, bb2, bb3;
28490 edge e12, e23;
28491
28492 tree cond_var, and_expr_var = NULL_TREE;
28493 gimple_seq gseq;
28494
28495 tree predicate_decl, predicate_arg;
28496
28497 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28498
28499 gcc_assert (new_bb != NULL);
28500 gseq = bb_seq (new_bb);
28501
28502
28503 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28504 build_fold_addr_expr (version_decl));
28505 result_var = create_tmp_var (ptr_type_node, NULL);
28506 convert_stmt = gimple_build_assign (result_var, convert_expr);
28507 return_stmt = gimple_build_return (result_var);
28508
28509 if (predicate_chain == NULL_TREE)
28510 {
28511 gimple_seq_add_stmt (&gseq, convert_stmt);
28512 gimple_seq_add_stmt (&gseq, return_stmt);
28513 set_bb_seq (new_bb, gseq);
28514 gimple_set_bb (convert_stmt, new_bb);
28515 gimple_set_bb (return_stmt, new_bb);
28516 pop_cfun ();
28517 return new_bb;
28518 }
28519
28520 while (predicate_chain != NULL)
28521 {
28522 cond_var = create_tmp_var (integer_type_node, NULL);
28523 predicate_decl = TREE_PURPOSE (predicate_chain);
28524 predicate_arg = TREE_VALUE (predicate_chain);
28525 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28526 gimple_call_set_lhs (call_cond_stmt, cond_var);
28527
28528 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28529 gimple_set_bb (call_cond_stmt, new_bb);
28530 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28531
28532 predicate_chain = TREE_CHAIN (predicate_chain);
28533
28534 if (and_expr_var == NULL)
28535 and_expr_var = cond_var;
28536 else
28537 {
28538 gimple assign_stmt;
28539 /* Use MIN_EXPR to check whether any predicate result is zero:
28540    and_expr_var = MIN_EXPR <cond_var, and_expr_var>.  */
28541 assign_stmt = gimple_build_assign (and_expr_var,
28542 build2 (MIN_EXPR, integer_type_node,
28543 cond_var, and_expr_var));
28544
28545 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28546 gimple_set_bb (assign_stmt, new_bb);
28547 gimple_seq_add_stmt (&gseq, assign_stmt);
28548 }
28549 }
28550
28551 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28552 integer_zero_node,
28553 NULL_TREE, NULL_TREE);
28554 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28555 gimple_set_bb (if_else_stmt, new_bb);
28556 gimple_seq_add_stmt (&gseq, if_else_stmt);
28557
28558 gimple_seq_add_stmt (&gseq, convert_stmt);
28559 gimple_seq_add_stmt (&gseq, return_stmt);
28560 set_bb_seq (new_bb, gseq);
28561
28562 bb1 = new_bb;
28563 e12 = split_block (bb1, if_else_stmt);
28564 bb2 = e12->dest;
28565 e12->flags &= ~EDGE_FALLTHRU;
28566 e12->flags |= EDGE_TRUE_VALUE;
28567
28568 e23 = split_block (bb2, return_stmt);
28569
28570 gimple_set_bb (convert_stmt, bb2);
28571 gimple_set_bb (return_stmt, bb2);
28572
28573 bb3 = e23->dest;
28574 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28575
28576 remove_edge (e23);
28577 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28578
28579 pop_cfun ();
28580
28581 return bb3;
28582 }
28583
28584 /* This parses the arguments of the "target" attribute on DECL and determines
28585    the right builtin to use to match the platform specification.
28586 It returns the priority value for this version decl. If PREDICATE_LIST
28587 is not NULL, it stores the list of cpu features that need to be checked
28588 before dispatching this function. */
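/* For example (illustrative only), a version declared with
   __attribute__ ((target ("arch=corei7,avx"))) yields priority P_AVX
   (the larger of P_PROC_SSE4_2 for corei7 and P_AVX for avx) and, when
   PREDICATE_LIST is non-NULL, a chain roughly equivalent to
   { __builtin_cpu_is ("corei7"), __builtin_cpu_supports ("avx") }.  */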
28589
28590 static unsigned int
28591 get_builtin_code_for_version (tree decl, tree *predicate_list)
28592 {
28593 tree attrs;
28594 struct cl_target_option cur_target;
28595 tree target_node;
28596 struct cl_target_option *new_target;
28597 const char *arg_str = NULL;
28598 const char *attrs_str = NULL;
28599 char *tok_str = NULL;
28600 char *token;
28601
28602 /* Priority of i386 features, greater value is higher priority. This is
28603 used to decide the order in which function dispatch must happen. For
28604 instance, a version specialized for SSE4.2 should be checked for dispatch
28605 before a version for SSE3, as SSE4.2 implies SSE3. */
28606 enum feature_priority
28607 {
28608 P_ZERO = 0,
28609 P_MMX,
28610 P_SSE,
28611 P_SSE2,
28612 P_SSE3,
28613 P_SSSE3,
28614 P_PROC_SSSE3,
28615 P_SSE4_a,
28616 P_PROC_SSE4_a,
28617 P_SSE4_1,
28618 P_SSE4_2,
28619 P_PROC_SSE4_2,
28620 P_POPCNT,
28621 P_AVX,
28622 P_AVX2,
28623 P_FMA,
28624 P_PROC_FMA
28625 };
28626
28627 enum feature_priority priority = P_ZERO;
28628
28629 /* These are the target attribute strings for which a dispatcher is
28630 available, from fold_builtin_cpu. */
28631
28632 static struct _feature_list
28633 {
28634 const char *const name;
28635 const enum feature_priority priority;
28636 }
28637 const feature_list[] =
28638 {
28639 {"mmx", P_MMX},
28640 {"sse", P_SSE},
28641 {"sse2", P_SSE2},
28642 {"sse3", P_SSE3},
28643 {"ssse3", P_SSSE3},
28644 {"sse4.1", P_SSE4_1},
28645 {"sse4.2", P_SSE4_2},
28646 {"popcnt", P_POPCNT},
28647 {"avx", P_AVX},
28648 {"avx2", P_AVX2}
28649 };
28650
28651
28652 static unsigned int NUM_FEATURES
28653 = sizeof (feature_list) / sizeof (struct _feature_list);
28654
28655 unsigned int i;
28656
28657 tree predicate_chain = NULL_TREE;
28658 tree predicate_decl, predicate_arg;
28659
28660 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28661 gcc_assert (attrs != NULL);
28662
28663 attrs = TREE_VALUE (TREE_VALUE (attrs));
28664
28665 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28666 attrs_str = TREE_STRING_POINTER (attrs);
28667
28668
28669 /* Handle arch= if specified. For priority, set it to be 1 more than
28670 the best instruction set the processor can handle. For instance, if
28671 there is a version for atom and a version for ssse3 (the highest ISA
28672 priority for atom), the atom version must be checked for dispatch
28673 before the ssse3 version. */
28674 if (strstr (attrs_str, "arch=") != NULL)
28675 {
28676 cl_target_option_save (&cur_target, &global_options);
28677 target_node = ix86_valid_target_attribute_tree (attrs);
28678
28679 gcc_assert (target_node);
28680 new_target = TREE_TARGET_OPTION (target_node);
28681 gcc_assert (new_target);
28682
28683 if (new_target->arch_specified && new_target->arch > 0)
28684 {
28685 switch (new_target->arch)
28686 {
28687 case PROCESSOR_CORE2:
28688 arg_str = "core2";
28689 priority = P_PROC_SSSE3;
28690 break;
28691 case PROCESSOR_COREI7:
28692 arg_str = "corei7";
28693 priority = P_PROC_SSE4_2;
28694 break;
28695 case PROCESSOR_ATOM:
28696 arg_str = "atom";
28697 priority = P_PROC_SSSE3;
28698 break;
28699 case PROCESSOR_AMDFAM10:
28700 arg_str = "amdfam10h";
28701 priority = P_PROC_SSE4_a;
28702 break;
28703 case PROCESSOR_BDVER1:
28704 arg_str = "bdver1";
28705 priority = P_PROC_FMA;
28706 break;
28707 case PROCESSOR_BDVER2:
28708 arg_str = "bdver2";
28709 priority = P_PROC_FMA;
28710 break;
28711 }
28712 }
28713
28714 cl_target_option_restore (&global_options, &cur_target);
28715
28716 if (predicate_list && arg_str == NULL)
28717 {
28718 error_at (DECL_SOURCE_LOCATION (decl),
28719 "No dispatcher found for the versioning attributes");
28720 return 0;
28721 }
28722
28723 if (predicate_list)
28724 {
28725 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28726 /* For a C string literal the length includes the trailing NULL. */
28727 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28728 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28729 predicate_chain);
28730 }
28731 }
28732
28733 /* Process feature name. */
28734 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28735 strcpy (tok_str, attrs_str);
28736 token = strtok (tok_str, ",");
28737 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28738
28739 while (token != NULL)
28740 {
28741 /* Do not process "arch="; it was handled above. */
28742 if (strncmp (token, "arch=", 5) == 0)
28743 {
28744 token = strtok (NULL, ",");
28745 continue;
28746 }
28747 for (i = 0; i < NUM_FEATURES; ++i)
28748 {
28749 if (strcmp (token, feature_list[i].name) == 0)
28750 {
28751 if (predicate_list)
28752 {
28753 predicate_arg = build_string_literal (
28754 strlen (feature_list[i].name) + 1,
28755 feature_list[i].name);
28756 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28757 predicate_chain);
28758 }
28759 /* Find the maximum priority feature. */
28760 if (feature_list[i].priority > priority)
28761 priority = feature_list[i].priority;
28762
28763 break;
28764 }
28765 }
28766 if (predicate_list && i == NUM_FEATURES)
28767 {
28768 error_at (DECL_SOURCE_LOCATION (decl),
28769 "No dispatcher found for %s", token);
28770 return 0;
28771 }
28772 token = strtok (NULL, ",");
28773 }
28774 free (tok_str);
28775
28776 if (predicate_list && predicate_chain == NULL_TREE)
28777 {
28778 error_at (DECL_SOURCE_LOCATION (decl),
28779 "No dispatcher found for the versioning attributes : %s",
28780 attrs_str);
28781 return 0;
28782 }
28783 else if (predicate_list)
28784 {
28785 predicate_chain = nreverse (predicate_chain);
28786 *predicate_list = predicate_chain;
28787 }
28788
28789 return priority;
28790 }
28791
28792 /* This compares the priority of target features in function DECL1
28793 and DECL2. It returns positive value if DECL1 is higher priority,
28794 negative value if DECL2 is higher priority and 0 if they are the
28795 same. */
28796
28797 static int
28798 ix86_compare_version_priority (tree decl1, tree decl2)
28799 {
28800 unsigned int priority1 = 0;
28801 unsigned int priority2 = 0;
28802
28803 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28804 priority1 = get_builtin_code_for_version (decl1, NULL);
28805
28806 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28807 priority2 = get_builtin_code_for_version (decl2, NULL);
28808
28809 return (int)priority1 - (int)priority2;
28810 }
28811
28812 /* V1 and V2 point to function versions with different priorities
28813 based on the target ISA. This function compares their priorities. */
28814
28815 static int
28816 feature_compare (const void *v1, const void *v2)
28817 {
28818 typedef struct _function_version_info
28819 {
28820 tree version_decl;
28821 tree predicate_chain;
28822 unsigned int dispatch_priority;
28823 } function_version_info;
28824
28825 const function_version_info c1 = *(const function_version_info *)v1;
28826 const function_version_info c2 = *(const function_version_info *)v2;
28827 return (c2.dispatch_priority - c1.dispatch_priority);
28828 }
28829
28830 /* This function generates the dispatch function for
28831 multi-versioned functions. DISPATCH_DECL is the function which will
28832 contain the dispatch logic. FNDECLS are the function choices for
28833 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28834 in DISPATCH_DECL in which the dispatch code is generated. */
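/* Sketch of the resolver body that gets built (pseudo-code; "foo" and the
   mangled version names are hypothetical):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7")) return &foo.arch_corei7;
     if (__builtin_cpu_supports ("sse3")) return &foo.sse3;
     return &foo;    /- the default version is dispatched last -/  */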
28835
28836 static int
28837 dispatch_function_versions (tree dispatch_decl,
28838 void *fndecls_p,
28839 basic_block *empty_bb)
28840 {
28841 tree default_decl;
28842 gimple ifunc_cpu_init_stmt;
28843 gimple_seq gseq;
28844 int ix;
28845 tree ele;
28846 vec<tree> *fndecls;
28847 unsigned int num_versions = 0;
28848 unsigned int actual_versions = 0;
28849 unsigned int i;
28850
28851 struct _function_version_info
28852 {
28853 tree version_decl;
28854 tree predicate_chain;
28855 unsigned int dispatch_priority;
28856 }*function_version_info;
28857 } *function_version_info;
28858 gcc_assert (dispatch_decl != NULL
28859 && fndecls_p != NULL
28860 && empty_bb != NULL);
28861
28862 /* fndecls_p is actually a vector. */
28863 fndecls = static_cast<vec<tree> *> (fndecls_p);
28864
28865 /* At least one more version other than the default. */
28866 num_versions = fndecls->length ();
28867 gcc_assert (num_versions >= 2);
28868
28869 function_version_info = (struct _function_version_info *)
28870 XNEWVEC (struct _function_version_info, (num_versions - 1));
28871
28872 /* The first version in the vector is the default decl. */
28873 default_decl = (*fndecls)[0];
28874
28875 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28876
28877 gseq = bb_seq (*empty_bb);
28878 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28879 constructors, so explicitly call __builtin_cpu_init here. */
28880 ifunc_cpu_init_stmt = gimple_build_call_vec (
28881 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28882 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28883 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28884 set_bb_seq (*empty_bb, gseq);
28885
28886 pop_cfun ();
28887
28888
28889 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28890 {
28891 tree version_decl = ele;
28892 tree predicate_chain = NULL_TREE;
28893 unsigned int priority;
28894 /* Get attribute string, parse it and find the right predicate decl.
28895 The predicate function could be a lengthy combination of many
28896 features, like arch-type and various isa-variants. */
28897 priority = get_builtin_code_for_version (version_decl,
28898 &predicate_chain);
28899
28900 if (predicate_chain == NULL_TREE)
28901 continue;
28902
28903 actual_versions++;
28904 function_version_info [ix - 1].version_decl = version_decl;
28905 function_version_info [ix - 1].predicate_chain = predicate_chain;
28906 function_version_info [ix - 1].dispatch_priority = priority;
28907 }
28908
28909 /* Sort the versions according to descending order of dispatch priority. The
28910 priority is based on the ISA. This is not a perfect solution. There
28911 could still be ambiguity. If more than one function version is suitable
28912 to execute, which one should be dispatched? In the future, allow the user
28913 to specify a dispatch priority next to the version. */
28914 qsort (function_version_info, actual_versions,
28915 sizeof (struct _function_version_info), feature_compare);
28916
28917 for (i = 0; i < actual_versions; ++i)
28918 *empty_bb = add_condition_to_bb (dispatch_decl,
28919 function_version_info[i].version_decl,
28920 function_version_info[i].predicate_chain,
28921 *empty_bb);
28922
28923 /* Dispatch the default version at the end. */
28924 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28925 NULL, *empty_bb);
28926
28927 free (function_version_info);
28928 return 0;
28929 }
28930
28931 /* Comparator function used by qsort to sort the attribute
28932    specification strings of the "target" attribute. */
28933
28934 static int
28935 attr_strcmp (const void *v1, const void *v2)
28936 {
28937 const char *c1 = *(char *const*)v1;
28938 const char *c2 = *(char *const*)v2;
28939 return strcmp (c1, c2);
28940 }
28941
28942 /* STR is the argument to target attribute. This function tokenizes
28943 the comma separated arguments, sorts them and returns a string which
28944 is a unique identifier for the comma separated arguments. It also
28945 replaces non-identifier characters "=,-" with "_". */
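/* For example, "avx,arch=corei7" becomes "arch_corei7_avx": '=' and '-' are
   replaced by '_', the comma-separated pieces are sorted with strcmp, and
   they are rejoined with '_'.  Other characters, such as the '.' in
   "sse4.2", are kept as-is.  */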
28946
28947 static char *
28948 sorted_attr_string (const char *str)
28949 {
28950 char **args = NULL;
28951 char *attr_str, *ret_str;
28952 char *attr = NULL;
28953 unsigned int argnum = 1;
28954 unsigned int i;
28955
28956 for (i = 0; i < strlen (str); i++)
28957 if (str[i] == ',')
28958 argnum++;
28959
28960 attr_str = (char *)xmalloc (strlen (str) + 1);
28961 strcpy (attr_str, str);
28962
28963 /* Replace "=,-" with "_". */
28964 for (i = 0; i < strlen (attr_str); i++)
28965 if (attr_str[i] == '=' || attr_str[i]== '-')
28966 attr_str[i] = '_';
28967
28968 if (argnum == 1)
28969 return attr_str;
28970
28971 args = XNEWVEC (char *, argnum);
28972
28973 i = 0;
28974 attr = strtok (attr_str, ",");
28975 while (attr != NULL)
28976 {
28977 args[i] = attr;
28978 i++;
28979 attr = strtok (NULL, ",");
28980 }
28981
28982 qsort (args, argnum, sizeof (char*), attr_strcmp);
28983
28984 ret_str = (char *)xmalloc (strlen (str) + 1);
28985 strcpy (ret_str, args[0]);
28986 for (i = 1; i < argnum; i++)
28987 {
28988 strcat (ret_str, "_");
28989 strcat (ret_str, args[i]);
28990 }
28991
28992 free (args);
28993 free (attr_str);
28994 return ret_str;
28995 }
28996
28997 /* This function changes the assembler name for functions that are
28998 versions. If DECL is a function version and has a "target"
28999 attribute, it appends the attribute string to its assembler name. */
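/* For example, a version of "foo" declared with
   __attribute__ ((target ("avx,popcnt"))) gets the assembler name
   "foo.avx_popcnt": the sorted attribute string appended after a '.'.  */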
29000
29001 static tree
29002 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29003 {
29004 tree version_attr;
29005 const char *orig_name, *version_string, *attr_str;
29006 char *assembler_name;
29007
29008 if (DECL_DECLARED_INLINE_P (decl)
29009 && lookup_attribute ("gnu_inline",
29010 DECL_ATTRIBUTES (decl)))
29011 error_at (DECL_SOURCE_LOCATION (decl),
29012 "Function versions cannot be marked as gnu_inline,"
29013 " bodies have to be generated");
29014
29015 if (DECL_VIRTUAL_P (decl)
29016 || DECL_VINDEX (decl))
29017 error_at (DECL_SOURCE_LOCATION (decl),
29018 "Virtual function versioning not supported\n");
29019
29020 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29021
29022 /* The target attribute is NULL_TREE for the default function version. */
29023 if (version_attr == NULL_TREE)
29024 return id;
29025
29026 orig_name = IDENTIFIER_POINTER (id);
29027 version_string
29028 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29029
29030 attr_str = sorted_attr_string (version_string);
29031 assembler_name = (char *) xmalloc (strlen (orig_name)
29032 + strlen (attr_str) + 2);
29033
29034 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29035
29036 /* Allow assembler name to be modified if already set. */
29037 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29038 SET_DECL_RTL (decl, NULL);
29039
29040 return get_identifier (assembler_name);
29041 }
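
/* For example (illustrative): a version of foo declared with
   __attribute__ ((target ("popcnt,arch=core2"))) gets the assembler name
   "foo.arch_core2_popcnt", i.e. the original name, a '.', and the sorted
   attribute string.  */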
29042
29043 /* This function returns true if FN1 and FN2 are versions of the same function,
29044 that is, the target strings of the function decls are different. This assumes
29045 that FN1 and FN2 have the same signature. */
29046
29047 static bool
29048 ix86_function_versions (tree fn1, tree fn2)
29049 {
29050 tree attr1, attr2;
29051 const char *attr_str1, *attr_str2;
29052 char *target1, *target2;
29053 bool result;
29054
29055 if (TREE_CODE (fn1) != FUNCTION_DECL
29056 || TREE_CODE (fn2) != FUNCTION_DECL)
29057 return false;
29058
29059 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29060 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29061
29062 /* At least one function decl should have the target attribute specified. */
29063 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29064 return false;
29065
29066 /* If one function does not have a target attribute, these are versions. */
29067 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29068 return true;
29069
29070 attr_str1 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr1)));
29071 attr_str2 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr2)));
29072
29073 target1 = sorted_attr_string (attr_str1);
29074 target2 = sorted_attr_string (attr_str2);
29075
29076 /* The sorted target strings must be different for fn1 and fn2
29077 to be versions. */
29078 if (strcmp (target1, target2) == 0)
29079 result = false;
29080 else
29081 result = true;
29082
29083 free (target1);
29084 free (target2);
29085
29086 return result;
29087 }
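
/* For example (illustrative): given two declarations of foo carrying
   target ("avx") and target ("popcnt,avx"), the sorted target strings
   "avx" and "avx_popcnt" differ, so the two declarations are treated as
   versions of the same function.  */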
29088
29089 /* This target supports function multiversioning. */
29090
29091 static bool
29092 ix86_supports_function_versions (void)
29093 {
29094 return true;
29095 }
29096
29097 static tree
29098 ix86_mangle_decl_assembler_name (tree decl, tree id)
29099 {
29100 /* For function versions, add the target suffix to the assembler name. */
29101 if (TREE_CODE (decl) == FUNCTION_DECL
29102 && DECL_FUNCTION_VERSIONED (decl))
29103 id = ix86_mangle_function_version_assembler_name (decl, id);
29104 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29105 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29106 #endif
29107
29108 return id;
29109 }
29110
29111 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29112 is true, append the full path name of the source file. */
29113
29114 static char *
29115 make_name (tree decl, const char *suffix, bool make_unique)
29116 {
29117 char *global_var_name;
29118 int name_len;
29119 const char *name;
29120 const char *unique_name = NULL;
29121
29122 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29123
29124 /* Get a unique name that can be used globally without any chances
29125 of collision at link time. */
29126 if (make_unique)
29127 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29128
29129 name_len = strlen (name) + strlen (suffix) + 2;
29130
29131 if (make_unique)
29132 name_len += strlen (unique_name) + 1;
29133 global_var_name = XNEWVEC (char, name_len);
29134
29135 /* Use '.' to concatenate names as it is demangler friendly. */
29136 if (make_unique)
29137 snprintf (global_var_name, name_len, "%s.%s.%s", name,
29138 unique_name, suffix);
29139 else
29140 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29141
29142 return global_var_name;
29143 }
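
/* For example (illustrative): make_name (foo_decl, "resolver", false)
   returns "foo.resolver"; with MAKE_UNIQUE true, a file-unique name from
   get_file_function_name is spliced in, giving roughly
   "foo.<file-unique-name>.resolver".  */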
29144
29145 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29146
29147 /* Make a dispatcher declaration for the multi-versioned function DECL.
29148 Calls to the DECL function will be replaced with calls to the dispatcher
29149 by the front end. Return the decl created. */
29150
29151 static tree
29152 make_dispatcher_decl (const tree decl)
29153 {
29154 tree func_decl;
29155 char *func_name, *resolver_name;
29156 tree fn_type, func_type;
29157 bool is_uniq = false;
29158
29159 if (TREE_PUBLIC (decl) == 0)
29160 is_uniq = true;
29161
29162 func_name = make_name (decl, "ifunc", is_uniq);
29163 resolver_name = make_name (decl, "resolver", is_uniq);
29164 gcc_assert (resolver_name);
29165
29166 fn_type = TREE_TYPE (decl);
29167 func_type = build_function_type (TREE_TYPE (fn_type),
29168 TYPE_ARG_TYPES (fn_type));
29169
29170 func_decl = build_fn_decl (func_name, func_type);
29171 TREE_USED (func_decl) = 1;
29172 DECL_CONTEXT (func_decl) = NULL_TREE;
29173 DECL_INITIAL (func_decl) = error_mark_node;
29174 DECL_ARTIFICIAL (func_decl) = 1;
29175 /* Mark this function as external; the resolver will flip it again if
29176 it gets generated. */
29177 DECL_EXTERNAL (func_decl) = 1;
29178 /* IFUNCs have to be externally visible. */
29179 TREE_PUBLIC (func_decl) = 1;
29180
29181 return func_decl;
29182 }
29183
29184 #endif
29185
29186 /* Returns true if DECL is multi-versioned and is the default version,
29187 that is, it is not tagged with a target-specific attribute. */
29188
29189 static bool
29190 is_function_default_version (const tree decl)
29191 {
29192 return (TREE_CODE (decl) == FUNCTION_DECL
29193 && DECL_FUNCTION_VERSIONED (decl)
29194 && lookup_attribute ("target", DECL_ATTRIBUTES (decl)) == NULL_TREE);
29195 }
29196
29197 /* Make a dispatcher declaration for the multi-versioned function DECL.
29198 Calls to the DECL function will be replaced with calls to the dispatcher
29199 by the front end. Returns the decl of the dispatcher function. */
29200
29201 static tree
29202 ix86_get_function_versions_dispatcher (void *decl)
29203 {
29204 tree fn = (tree) decl;
29205 struct cgraph_node *node = NULL;
29206 struct cgraph_node *default_node = NULL;
29207 struct cgraph_function_version_info *node_v = NULL;
29208 struct cgraph_function_version_info *first_v = NULL;
29209
29210 tree dispatch_decl = NULL;
29211
29212 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29213 struct cgraph_function_version_info *it_v = NULL;
29214 struct cgraph_node *dispatcher_node = NULL;
29215 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29216 #endif
29217
29218 struct cgraph_function_version_info *default_version_info = NULL;
29219
29220 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29221
29222 node = cgraph_get_node (fn);
29223 gcc_assert (node != NULL);
29224
29225 node_v = get_cgraph_node_version (node);
29226 gcc_assert (node_v != NULL);
29227
29228 if (node_v->dispatcher_resolver != NULL)
29229 return node_v->dispatcher_resolver;
29230
29231 /* Find the default version and make it the first node. */
29232 first_v = node_v;
29233 /* Go to the beginning of the chain. */
29234 while (first_v->prev != NULL)
29235 first_v = first_v->prev;
29236 default_version_info = first_v;
29237 while (default_version_info != NULL)
29238 {
29239 if (is_function_default_version
29240 (default_version_info->this_node->symbol.decl))
29241 break;
29242 default_version_info = default_version_info->next;
29243 }
29244
29245 /* If there is no default node, just return NULL. */
29246 if (default_version_info == NULL)
29247 return NULL;
29248
29249 /* Make default info the first node. */
29250 if (first_v != default_version_info)
29251 {
29252 default_version_info->prev->next = default_version_info->next;
29253 if (default_version_info->next)
29254 default_version_info->next->prev = default_version_info->prev;
29255 first_v->prev = default_version_info;
29256 default_version_info->next = first_v;
29257 default_version_info->prev = NULL;
29258 }
29259
29260 default_node = default_version_info->this_node;
29261
29262 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29263 /* Right now, the dispatching is done via ifunc. */
29264 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29265
29266 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29267 gcc_assert (dispatcher_node != NULL);
29268 dispatcher_node->dispatcher_function = 1;
29269 dispatcher_version_info
29270 = insert_new_cgraph_node_version (dispatcher_node);
29271 dispatcher_version_info->next = default_version_info;
29272 dispatcher_node->local.finalized = 1;
29273
29274 /* Set the dispatcher for all the versions. */
29275 it_v = default_version_info;
29276 while (it_v != NULL)
29277 {
29278 it_v->dispatcher_resolver = dispatch_decl;
29279 it_v = it_v->next;
29280 }
29281 #else
29282 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29283 "multiversioning needs ifunc which is not supported "
29284 "in this configuration");
29285 #endif
29286 return dispatch_decl;
29287 }
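
/* Illustrative shape of the version chain after the reordering and
   dispatcher creation above, for versions recorded in arbitrary order:

     dispatcher_version_info -> default -> version_1 -> version_2 -> ...

   i.e. the default version always ends up first among the function
   versions, and every version's dispatcher_resolver points at
   dispatch_decl.  */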
29288
29289 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29290 it to CHAIN. */
29291
29292 static tree
29293 make_attribute (const char *name, const char *arg_name, tree chain)
29294 {
29295 tree attr_name;
29296 tree attr_arg_name;
29297 tree attr_args;
29298 tree attr;
29299
29300 attr_name = get_identifier (name);
29301 attr_arg_name = build_string (strlen (arg_name), arg_name);
29302 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29303 attr = tree_cons (attr_name, attr_args, chain);
29304 return attr;
29305 }
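
/* For example (illustrative): make_attribute ("ifunc", "foo.resolver",
   NULL_TREE) builds a tree list equivalent to the source-level attribute
   __attribute__ ((ifunc ("foo.resolver"))).  */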
29306
29307 /* Make the resolver function decl to dispatch the versions of
29308 a multi-versioned function, DEFAULT_DECL. Create an
29309 empty basic block in the resolver and store the pointer in
29310 EMPTY_BB. Return the decl of the resolver function. */
29311
29312 static tree
29313 make_resolver_func (const tree default_decl,
29314 const tree dispatch_decl,
29315 basic_block *empty_bb)
29316 {
29317 char *resolver_name;
29318 tree decl, type, decl_name, t;
29319 bool is_uniq = false;
29320
29321 /* IFUNCs have to be globally visible. So, if the default_decl is
29322 not, then the name of the IFUNC should be made unique. */
29323 if (TREE_PUBLIC (default_decl) == 0)
29324 is_uniq = true;
29325
29326 /* Append the filename to the resolver function if the versions are
29327 not externally visible. This is because the resolver function has
29328 to be externally visible for the loader to find it. So, appending
29329 the filename will prevent conflicts with a resolver function from
29330 another module which is based on the same version name. */
29331 resolver_name = make_name (default_decl, "resolver", is_uniq);
29332
29333 /* The resolver function should return a (void *). */
29334 type = build_function_type_list (ptr_type_node, NULL_TREE);
29335
29336 decl = build_fn_decl (resolver_name, type);
29337 decl_name = get_identifier (resolver_name);
29338 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29339
29340 DECL_NAME (decl) = decl_name;
29341 TREE_USED (decl) = 1;
29342 DECL_ARTIFICIAL (decl) = 1;
29343 DECL_IGNORED_P (decl) = 0;
29344 /* IFUNC resolvers have to be externally visible. */
29345 TREE_PUBLIC (decl) = 1;
29346 DECL_UNINLINABLE (decl) = 0;
29347
29348 /* Resolver is not external, body is generated. */
29349 DECL_EXTERNAL (decl) = 0;
29350 DECL_EXTERNAL (dispatch_decl) = 0;
29351
29352 DECL_CONTEXT (decl) = NULL_TREE;
29353 DECL_INITIAL (decl) = make_node (BLOCK);
29354 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29355
29356 if (DECL_COMDAT_GROUP (default_decl)
29357 || TREE_PUBLIC (default_decl))
29358 {
29359 /* In this case, each translation unit with a call to this
29360 versioned function will put out a resolver. Ensure it
29361 is comdat to keep just one copy. */
29362 DECL_COMDAT (decl) = 1;
29363 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29364 }
29365 /* Build result decl and add to function_decl. */
29366 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29367 DECL_ARTIFICIAL (t) = 1;
29368 DECL_IGNORED_P (t) = 1;
29369 DECL_RESULT (decl) = t;
29370
29371 gimplify_function_tree (decl);
29372 push_cfun (DECL_STRUCT_FUNCTION (decl));
29373 *empty_bb = init_lowered_empty_function (decl, false);
29374
29375 cgraph_add_new_function (decl, true);
29376 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29377
29378 pop_cfun ();
29379
29380 gcc_assert (dispatch_decl != NULL);
29381 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29382 DECL_ATTRIBUTES (dispatch_decl)
29383 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29384
29385 /* Create the alias for dispatch to resolver here. */
29386 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29387 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29388 return decl;
29389 }
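
/* The arrangement above is the GNU ifunc idiom; a hand-written equivalent
   would look roughly like this (illustrative sketch with hypothetical
   names; choose_version stands for the dispatch code generated later):

     static void *foo_resolver (void) { return choose_version (); }
     int foo_ifunc (void) __attribute__ ((ifunc ("foo_resolver")));

   The dynamic loader calls the resolver once and binds calls through the
   ifunc symbol to the function pointer it returns.  */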
29390
29391 /* Generate the dispatching code body to dispatch multi-versioned function
29392 DECL. The target hook is called to process the "target" attributes and
29393 provide the code to dispatch the right function at run-time. NODE points
29394 to the dispatcher decl whose body will be created. */
29395
29396 static tree
29397 ix86_generate_version_dispatcher_body (void *node_p)
29398 {
29399 tree resolver_decl;
29400 basic_block empty_bb;
29401 vec<tree> fn_ver_vec = vNULL;
29402 tree default_ver_decl;
29403 struct cgraph_node *versn;
29404 struct cgraph_node *node;
29405
29406 struct cgraph_function_version_info *node_version_info = NULL;
29407 struct cgraph_function_version_info *versn_info = NULL;
29408
29409 node = (cgraph_node *)node_p;
29410
29411 node_version_info = get_cgraph_node_version (node);
29412 gcc_assert (node->dispatcher_function
29413 && node_version_info != NULL);
29414
29415 if (node_version_info->dispatcher_resolver)
29416 return node_version_info->dispatcher_resolver;
29417
29418 /* The first version in the chain corresponds to the default version. */
29419 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29420
29421 /* node is going to be an alias, so remove the finalized bit. */
29422 node->local.finalized = false;
29423
29424 resolver_decl = make_resolver_func (default_ver_decl,
29425 node->symbol.decl, &empty_bb);
29426
29427 node_version_info->dispatcher_resolver = resolver_decl;
29428
29429 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29430
29431 fn_ver_vec.create (2);
29432
29433 for (versn_info = node_version_info->next; versn_info;
29434 versn_info = versn_info->next)
29435 {
29436 versn = versn_info->this_node;
29437 /* Check for virtual functions here again, as by this time it should
29438 have been determined if this function needs a vtable index or
29439 not. This happens for methods in derived classes that override
29440 virtual methods in base classes but are not explicitly marked as
29441 virtual. */
29442 if (DECL_VINDEX (versn->symbol.decl))
29443 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29444 "Virtual function multiversioning not supported");
29445 fn_ver_vec.safe_push (versn->symbol.decl);
29446 }
29447
29448 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29449
29450 rebuild_cgraph_edges ();
29451 pop_cfun ();
29452 return resolver_decl;
29453 }
29454 /* This builds the processor_model struct type defined in
29455 libgcc/config/i386/cpuinfo.c. */
29456
29457 static tree
29458 build_processor_model_struct (void)
29459 {
29460 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29461 "__cpu_features"};
29462 tree field = NULL_TREE, field_chain = NULL_TREE;
29463 int i;
29464 tree type = make_node (RECORD_TYPE);
29465
29466 /* The first 3 fields are unsigned int. */
29467 for (i = 0; i < 3; ++i)
29468 {
29469 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29470 get_identifier (field_name[i]), unsigned_type_node);
29471 if (field_chain != NULL_TREE)
29472 DECL_CHAIN (field) = field_chain;
29473 field_chain = field;
29474 }
29475
29476 /* The last field is an array of unsigned integers of size one. */
29477 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29478 get_identifier (field_name[3]),
29479 build_array_type (unsigned_type_node,
29480 build_index_type (size_one_node)));
29481 if (field_chain != NULL_TREE)
29482 DECL_CHAIN (field) = field_chain;
29483 field_chain = field;
29484
29485 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29486 return type;
29487 }
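
/* The layout built above mirrors the structure defined in
   libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/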
29488
29489 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29490
29491 static tree
29492 make_var_decl (tree type, const char *name)
29493 {
29494 tree new_decl;
29495
29496 new_decl = build_decl (UNKNOWN_LOCATION,
29497 VAR_DECL,
29498 get_identifier (name),
29499 type);
29500
29501 DECL_EXTERNAL (new_decl) = 1;
29502 TREE_STATIC (new_decl) = 1;
29503 TREE_PUBLIC (new_decl) = 1;
29504 DECL_INITIAL (new_decl) = 0;
29505 DECL_ARTIFICIAL (new_decl) = 0;
29506 DECL_PRESERVE_P (new_decl) = 1;
29507
29508 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29509 assemble_variable (new_decl, 0, 0, 0);
29510
29511 return new_decl;
29512 }
29513
29514 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call. Fold it
29515 into a check against the __cpu_model data defined in libgcc/config/i386/cpuinfo.c. */
29516
29517 static tree
29518 fold_builtin_cpu (tree fndecl, tree *args)
29519 {
29520 unsigned int i;
29521 enum ix86_builtins fn_code = (enum ix86_builtins)
29522 DECL_FUNCTION_CODE (fndecl);
29523 tree param_string_cst = NULL;
29524
29525 /* This is the order of bit-fields in __processor_features in cpuinfo.c. */
29526 enum processor_features
29527 {
29528 F_CMOV = 0,
29529 F_MMX,
29530 F_POPCNT,
29531 F_SSE,
29532 F_SSE2,
29533 F_SSE3,
29534 F_SSSE3,
29535 F_SSE4_1,
29536 F_SSE4_2,
29537 F_AVX,
29538 F_AVX2,
29539 F_MAX
29540 };
29541
29542 /* These are the values for vendor types, cpu types and subtypes
29543 in cpuinfo.c. Cpu type and subtype values are offset by the
29544 corresponding start value, which must be subtracted before use. */
29545 enum processor_model
29546 {
29547 M_INTEL = 1,
29548 M_AMD,
29549 M_CPU_TYPE_START,
29550 M_INTEL_ATOM,
29551 M_INTEL_CORE2,
29552 M_INTEL_COREI7,
29553 M_AMDFAM10H,
29554 M_AMDFAM15H,
29555 M_CPU_SUBTYPE_START,
29556 M_INTEL_COREI7_NEHALEM,
29557 M_INTEL_COREI7_WESTMERE,
29558 M_INTEL_COREI7_SANDYBRIDGE,
29559 M_AMDFAM10H_BARCELONA,
29560 M_AMDFAM10H_SHANGHAI,
29561 M_AMDFAM10H_ISTANBUL,
29562 M_AMDFAM15H_BDVER1,
29563 M_AMDFAM15H_BDVER2,
29564 M_AMDFAM15H_BDVER3
29565 };
29566
29567 static struct _arch_names_table
29568 {
29569 const char *const name;
29570 const enum processor_model model;
29571 }
29572 const arch_names_table[] =
29573 {
29574 {"amd", M_AMD},
29575 {"intel", M_INTEL},
29576 {"atom", M_INTEL_ATOM},
29577 {"core2", M_INTEL_CORE2},
29578 {"corei7", M_INTEL_COREI7},
29579 {"nehalem", M_INTEL_COREI7_NEHALEM},
29580 {"westmere", M_INTEL_COREI7_WESTMERE},
29581 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29582 {"amdfam10h", M_AMDFAM10H},
29583 {"barcelona", M_AMDFAM10H_BARCELONA},
29584 {"shanghai", M_AMDFAM10H_SHANGHAI},
29585 {"istanbul", M_AMDFAM10H_ISTANBUL},
29586 {"amdfam15h", M_AMDFAM15H},
29587 {"bdver1", M_AMDFAM15H_BDVER1},
29588 {"bdver2", M_AMDFAM15H_BDVER2},
29589 {"bdver3", M_AMDFAM15H_BDVER3},
29590 };
29591
29592 static struct _isa_names_table
29593 {
29594 const char *const name;
29595 const enum processor_features feature;
29596 }
29597 const isa_names_table[] =
29598 {
29599 {"cmov", F_CMOV},
29600 {"mmx", F_MMX},
29601 {"popcnt", F_POPCNT},
29602 {"sse", F_SSE},
29603 {"sse2", F_SSE2},
29604 {"sse3", F_SSE3},
29605 {"ssse3", F_SSSE3},
29606 {"sse4.1", F_SSE4_1},
29607 {"sse4.2", F_SSE4_2},
29608 {"avx", F_AVX},
29609 {"avx2", F_AVX2}
29610 };
29611
29612 tree __processor_model_type = build_processor_model_struct ();
29613 tree __cpu_model_var = make_var_decl (__processor_model_type,
29614 "__cpu_model");
29615
29616 gcc_assert ((args != NULL) && (*args != NULL));
29617
29618 param_string_cst = *args;
29619 while (param_string_cst
29620 && TREE_CODE (param_string_cst) != STRING_CST)
29621 {
29622 /* *args must be an expr that can contain other EXPRs leading to a
29623 STRING_CST. */
29624 if (!EXPR_P (param_string_cst))
29625 {
29626 error ("Parameter to builtin must be a string constant or literal");
29627 return integer_zero_node;
29628 }
29629 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29630 }
29631
29632 gcc_assert (param_string_cst);
29633
29634 if (fn_code == IX86_BUILTIN_CPU_IS)
29635 {
29636 tree ref;
29637 tree field;
29638 tree final;
29639
29640 unsigned int field_val = 0;
29641 unsigned int NUM_ARCH_NAMES
29642 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29643
29644 for (i = 0; i < NUM_ARCH_NAMES; i++)
29645 if (strcmp (arch_names_table[i].name,
29646 TREE_STRING_POINTER (param_string_cst)) == 0)
29647 break;
29648
29649 if (i == NUM_ARCH_NAMES)
29650 {
29651 error ("Parameter to builtin not valid: %s",
29652 TREE_STRING_POINTER (param_string_cst));
29653 return integer_zero_node;
29654 }
29655
29656 field = TYPE_FIELDS (__processor_model_type);
29657 field_val = arch_names_table[i].model;
29658
29659 /* CPU types are stored in the next field. */
29660 if (field_val > M_CPU_TYPE_START
29661 && field_val < M_CPU_SUBTYPE_START)
29662 {
29663 field = DECL_CHAIN (field);
29664 field_val -= M_CPU_TYPE_START;
29665 }
29666
29667 /* CPU subtypes are stored in the next field. */
29668 if (field_val > M_CPU_SUBTYPE_START)
29669 {
29670 field = DECL_CHAIN (DECL_CHAIN (field));
29671 field_val -= M_CPU_SUBTYPE_START;
29672 }
29673
29674 /* Get the appropriate field in __cpu_model. */
29675 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29676 field, NULL_TREE);
29677
29678 /* Check the value. */
29679 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29680 build_int_cstu (unsigned_type_node, field_val));
29681 return build1 (CONVERT_EXPR, integer_type_node, final);
29682 }
29683 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29684 {
29685 tree ref;
29686 tree array_elt;
29687 tree field;
29688 tree final;
29689
29690 unsigned int field_val = 0;
29691 unsigned int NUM_ISA_NAMES
29692 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29693
29694 for (i = 0; i < NUM_ISA_NAMES; i++)
29695 if (strcmp (isa_names_table[i].name,
29696 TREE_STRING_POINTER (param_string_cst)) == 0)
29697 break;
29698
29699 if (i == NUM_ISA_NAMES)
29700 {
29701 error ("Parameter to builtin not valid: %s",
29702 TREE_STRING_POINTER (param_string_cst));
29703 return integer_zero_node;
29704 }
29705
29706 field = TYPE_FIELDS (__processor_model_type);
29707 /* Get the last field, which is __cpu_features. */
29708 while (DECL_CHAIN (field))
29709 field = DECL_CHAIN (field);
29710
29711 /* Get the appropriate field: __cpu_model.__cpu_features */
29712 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29713 field, NULL_TREE);
29714
29715 /* Access the 0th element of __cpu_features array. */
29716 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29717 integer_zero_node, NULL_TREE, NULL_TREE);
29718
29719 field_val = (1 << isa_names_table[i].feature);
29720 /* Return __cpu_model.__cpu_features[0] & field_val */
29721 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29722 build_int_cstu (unsigned_type_node, field_val));
29723 return build1 (CONVERT_EXPR, integer_type_node, final);
29724 }
29725 gcc_unreachable ();
29726 }
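
/* For example (illustrative): __builtin_cpu_is ("amdfam10h") folds to

     (int) (__cpu_model.__cpu_type == M_AMDFAM10H - M_CPU_TYPE_START)

   and __builtin_cpu_supports ("avx") folds to

     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX))

   using the field offsets and feature bits defined above.  */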
29727
29728 static tree
29729 ix86_fold_builtin (tree fndecl, int n_args,
29730 tree *args, bool ignore ATTRIBUTE_UNUSED)
29731 {
29732 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29733 {
29734 enum ix86_builtins fn_code = (enum ix86_builtins)
29735 DECL_FUNCTION_CODE (fndecl);
29736 if (fn_code == IX86_BUILTIN_CPU_IS
29737 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29738 {
29739 gcc_assert (n_args == 1);
29740 return fold_builtin_cpu (fndecl, args);
29741 }
29742 }
29743
29744 #ifdef SUBTARGET_FOLD_BUILTIN
29745 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29746 #endif
29747
29748 return NULL_TREE;
29749 }
29750
29751 /* Make builtins to detect cpu type and features supported. NAME is
29752 the builtin name, CODE is the builtin code, and FTYPE is the function
29753 type of the builtin. */
29754
29755 static void
29756 make_cpu_type_builtin (const char* name, int code,
29757 enum ix86_builtin_func_type ftype, bool is_const)
29758 {
29759 tree decl;
29760 tree type;
29761
29762 type = ix86_get_builtin_func_type (ftype);
29763 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29764 NULL, NULL_TREE);
29765 gcc_assert (decl != NULL_TREE);
29766 ix86_builtins[(int) code] = decl;
29767 TREE_READONLY (decl) = is_const;
29768 }
29769
29770 /* Make builtins to get CPU type and features supported. The created
29771 builtins are:
29772
29773 __builtin_cpu_init (), to detect cpu type and features,
29774 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29775 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29776 */
29777
29778 static void
29779 ix86_init_platform_type_builtins (void)
29780 {
29781 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29782 INT_FTYPE_VOID, false);
29783 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29784 INT_FTYPE_PCCHAR, true);
29785 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29786 INT_FTYPE_PCCHAR, true);
29787 }
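
/* Typical user-level use of these builtins (illustrative; the use_* calls
   are hypothetical):

     if (__builtin_cpu_supports ("ssse3"))
       use_ssse3_version ();
     else if (__builtin_cpu_is ("atom"))
       use_atom_version ();
     else
       use_generic_version ();

   __builtin_cpu_init () only needs to be called explicitly when the checks
   run before libgcc's own initializer, e.g. from another constructor.  */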
29788
29789 /* Internal method for ix86_init_builtins. */
29790
29791 static void
29792 ix86_init_builtins_va_builtins_abi (void)
29793 {
29794 tree ms_va_ref, sysv_va_ref;
29795 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29796 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29797 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29798 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29799
29800 if (!TARGET_64BIT)
29801 return;
29802 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29803 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29804 ms_va_ref = build_reference_type (ms_va_list_type_node);
29805 sysv_va_ref =
29806 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29807
29808 fnvoid_va_end_ms =
29809 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29810 fnvoid_va_start_ms =
29811 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29812 fnvoid_va_end_sysv =
29813 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29814 fnvoid_va_start_sysv =
29815 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29816 NULL_TREE);
29817 fnvoid_va_copy_ms =
29818 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29819 NULL_TREE);
29820 fnvoid_va_copy_sysv =
29821 build_function_type_list (void_type_node, sysv_va_ref,
29822 sysv_va_ref, NULL_TREE);
29823
29824 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29825 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29826 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29827 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29828 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29829 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29830 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29831 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29832 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29833 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29834 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29835 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29836 }
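
/* Illustrative use of the ABI-specific varargs builtins declared above
   (hypothetical 64-bit-only function):

     int __attribute__ ((ms_abi))
     ms_sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
*/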
29837
29838 static void
29839 ix86_init_builtin_types (void)
29840 {
29841 tree float128_type_node, float80_type_node;
29842
29843 /* The __float80 type. */
29844 float80_type_node = long_double_type_node;
29845 if (TYPE_MODE (float80_type_node) != XFmode)
29846 {
29847 /* The __float80 type. */
29848 float80_type_node = make_node (REAL_TYPE);
29849
29850 TYPE_PRECISION (float80_type_node) = 80;
29851 layout_type (float80_type_node);
29852 }
29853 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29854
29855 /* The __float128 type. */
29856 float128_type_node = make_node (REAL_TYPE);
29857 TYPE_PRECISION (float128_type_node) = 128;
29858 layout_type (float128_type_node);
29859 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29860
29861 /* This macro is built by i386-builtin-types.awk. */
29862 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29863 }
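
/* With these types registered, user code can write (illustrative):

     __float80  e = 1.0w;   80-bit extended precision (XFmode)
     __float128 q = 1.0q;   128-bit quad precision (TFmode)
*/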
29864
29865 static void
29866 ix86_init_builtins (void)
29867 {
29868 tree t;
29869
29870 ix86_init_builtin_types ();
29871
29872 /* Builtins to get CPU type and features. */
29873 ix86_init_platform_type_builtins ();
29874
29875 /* TFmode support builtins. */
29876 def_builtin_const (0, "__builtin_infq",
29877 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29878 def_builtin_const (0, "__builtin_huge_valq",
29879 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29880
29881 /* We will expand them to a normal call if SSE isn't available, since
29882 they are used by libgcc. */
29883 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29884 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29885 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29886 TREE_READONLY (t) = 1;
29887 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29888
29889 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29890 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29891 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29892 TREE_READONLY (t) = 1;
29893 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29894
29895 ix86_init_tm_builtins ();
29896 ix86_init_mmx_sse_builtins ();
29897
29898 if (TARGET_LP64)
29899 ix86_init_builtins_va_builtins_abi ();
29900
29901 #ifdef SUBTARGET_INIT_BUILTINS
29902 SUBTARGET_INIT_BUILTINS;
29903 #endif
29904 }
29905
29906 /* Return the ix86 builtin for CODE. */
29907
29908 static tree
29909 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29910 {
29911 if (code >= IX86_BUILTIN_MAX)
29912 return error_mark_node;
29913
29914 return ix86_builtins[code];
29915 }
29916
29917 /* Errors in the source file can cause expand_expr to return const0_rtx
29918 where we expect a vector. To avoid crashing, use one of the vector
29919 clear instructions. */
29920 static rtx
29921 safe_vector_operand (rtx x, enum machine_mode mode)
29922 {
29923 if (x == const0_rtx)
29924 x = CONST0_RTX (mode);
29925 return x;
29926 }
29927
29928 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29929
29930 static rtx
29931 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
29932 {
29933 rtx pat;
29934 tree arg0 = CALL_EXPR_ARG (exp, 0);
29935 tree arg1 = CALL_EXPR_ARG (exp, 1);
29936 rtx op0 = expand_normal (arg0);
29937 rtx op1 = expand_normal (arg1);
29938 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29939 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29940 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
29941
29942 if (VECTOR_MODE_P (mode0))
29943 op0 = safe_vector_operand (op0, mode0);
29944 if (VECTOR_MODE_P (mode1))
29945 op1 = safe_vector_operand (op1, mode1);
29946
29947 if (optimize || !target
29948 || GET_MODE (target) != tmode
29949 || !insn_data[icode].operand[0].predicate (target, tmode))
29950 target = gen_reg_rtx (tmode);
29951
29952 if (GET_MODE (op1) == SImode && mode1 == TImode)
29953 {
29954 rtx x = gen_reg_rtx (V4SImode);
29955 emit_insn (gen_sse2_loadd (x, op1));
29956 op1 = gen_lowpart (TImode, x);
29957 }
29958
29959 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29960 op0 = copy_to_mode_reg (mode0, op0);
29961 if (!insn_data[icode].operand[2].predicate (op1, mode1))
29962 op1 = copy_to_mode_reg (mode1, op1);
29963
29964 pat = GEN_FCN (icode) (target, op0, op1);
29965 if (! pat)
29966 return 0;
29967
29968 emit_insn (pat);
29969
29970 return target;
29971 }
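
/* Illustrative: a simple two-operand builtin such as

     __m128i r = __builtin_ia32_paddw128 (a, b);

   reaches this helper and becomes a single V8HI addition insn generated by
   GEN_FCN for the matching icode, with operands copied into registers as
   the predicates require (a sketch, not the literal RTL).  */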
29972
29973 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
29974
29975 static rtx
29976 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
29977 enum ix86_builtin_func_type m_type,
29978 enum rtx_code sub_code)
29979 {
29980 rtx pat;
29981 int i;
29982 int nargs;
29983 bool comparison_p = false;
29984 bool tf_p = false;
29985 bool last_arg_constant = false;
29986 int num_memory = 0;
29987 struct {
29988 rtx op;
29989 enum machine_mode mode;
29990 } args[4];
29991
29992 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29993
29994 switch (m_type)
29995 {
29996 case MULTI_ARG_4_DF2_DI_I:
29997 case MULTI_ARG_4_DF2_DI_I1:
29998 case MULTI_ARG_4_SF2_SI_I:
29999 case MULTI_ARG_4_SF2_SI_I1:
30000 nargs = 4;
30001 last_arg_constant = true;
30002 break;
30003
30004 case MULTI_ARG_3_SF:
30005 case MULTI_ARG_3_DF:
30006 case MULTI_ARG_3_SF2:
30007 case MULTI_ARG_3_DF2:
30008 case MULTI_ARG_3_DI:
30009 case MULTI_ARG_3_SI:
30010 case MULTI_ARG_3_SI_DI:
30011 case MULTI_ARG_3_HI:
30012 case MULTI_ARG_3_HI_SI:
30013 case MULTI_ARG_3_QI:
30014 case MULTI_ARG_3_DI2:
30015 case MULTI_ARG_3_SI2:
30016 case MULTI_ARG_3_HI2:
30017 case MULTI_ARG_3_QI2:
30018 nargs = 3;
30019 break;
30020
30021 case MULTI_ARG_2_SF:
30022 case MULTI_ARG_2_DF:
30023 case MULTI_ARG_2_DI:
30024 case MULTI_ARG_2_SI:
30025 case MULTI_ARG_2_HI:
30026 case MULTI_ARG_2_QI:
30027 nargs = 2;
30028 break;
30029
30030 case MULTI_ARG_2_DI_IMM:
30031 case MULTI_ARG_2_SI_IMM:
30032 case MULTI_ARG_2_HI_IMM:
30033 case MULTI_ARG_2_QI_IMM:
30034 nargs = 2;
30035 last_arg_constant = true;
30036 break;
30037
30038 case MULTI_ARG_1_SF:
30039 case MULTI_ARG_1_DF:
30040 case MULTI_ARG_1_SF2:
30041 case MULTI_ARG_1_DF2:
30042 case MULTI_ARG_1_DI:
30043 case MULTI_ARG_1_SI:
30044 case MULTI_ARG_1_HI:
30045 case MULTI_ARG_1_QI:
30046 case MULTI_ARG_1_SI_DI:
30047 case MULTI_ARG_1_HI_DI:
30048 case MULTI_ARG_1_HI_SI:
30049 case MULTI_ARG_1_QI_DI:
30050 case MULTI_ARG_1_QI_SI:
30051 case MULTI_ARG_1_QI_HI:
30052 nargs = 1;
30053 break;
30054
30055 case MULTI_ARG_2_DI_CMP:
30056 case MULTI_ARG_2_SI_CMP:
30057 case MULTI_ARG_2_HI_CMP:
30058 case MULTI_ARG_2_QI_CMP:
30059 nargs = 2;
30060 comparison_p = true;
30061 break;
30062
30063 case MULTI_ARG_2_SF_TF:
30064 case MULTI_ARG_2_DF_TF:
30065 case MULTI_ARG_2_DI_TF:
30066 case MULTI_ARG_2_SI_TF:
30067 case MULTI_ARG_2_HI_TF:
30068 case MULTI_ARG_2_QI_TF:
30069 nargs = 2;
30070 tf_p = true;
30071 break;
30072
30073 default:
30074 gcc_unreachable ();
30075 }
30076
30077 if (optimize || !target
30078 || GET_MODE (target) != tmode
30079 || !insn_data[icode].operand[0].predicate (target, tmode))
30080 target = gen_reg_rtx (tmode);
30081
30082 gcc_assert (nargs <= 4);
30083
30084 for (i = 0; i < nargs; i++)
30085 {
30086 tree arg = CALL_EXPR_ARG (exp, i);
30087 rtx op = expand_normal (arg);
30088 int adjust = (comparison_p) ? 1 : 0;
30089 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30090
30091 if (last_arg_constant && i == nargs - 1)
30092 {
30093 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30094 {
30095 enum insn_code new_icode = icode;
30096 switch (icode)
30097 {
30098 case CODE_FOR_xop_vpermil2v2df3:
30099 case CODE_FOR_xop_vpermil2v4sf3:
30100 case CODE_FOR_xop_vpermil2v4df3:
30101 case CODE_FOR_xop_vpermil2v8sf3:
30102 error ("the last argument must be a 2-bit immediate");
30103 return gen_reg_rtx (tmode);
30104 case CODE_FOR_xop_rotlv2di3:
30105 new_icode = CODE_FOR_rotlv2di3;
30106 goto xop_rotl;
30107 case CODE_FOR_xop_rotlv4si3:
30108 new_icode = CODE_FOR_rotlv4si3;
30109 goto xop_rotl;
30110 case CODE_FOR_xop_rotlv8hi3:
30111 new_icode = CODE_FOR_rotlv8hi3;
30112 goto xop_rotl;
30113 case CODE_FOR_xop_rotlv16qi3:
30114 new_icode = CODE_FOR_rotlv16qi3;
30115 xop_rotl:
30116 if (CONST_INT_P (op))
30117 {
30118 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30119 op = GEN_INT (INTVAL (op) & mask);
30120 gcc_checking_assert
30121 (insn_data[icode].operand[i + 1].predicate (op, mode));
30122 }
30123 else
30124 {
30125 gcc_checking_assert
30126 (nargs == 2
30127 && insn_data[new_icode].operand[0].mode == tmode
30128 && insn_data[new_icode].operand[1].mode == tmode
30129 && insn_data[new_icode].operand[2].mode == mode
30130 && insn_data[new_icode].operand[0].predicate
30131 == insn_data[icode].operand[0].predicate
30132 && insn_data[new_icode].operand[1].predicate
30133 == insn_data[icode].operand[1].predicate);
30134 icode = new_icode;
30135 goto non_constant;
30136 }
30137 break;
30138 default:
30139 gcc_unreachable ();
30140 }
30141 }
30142 }
30143 else
30144 {
30145 non_constant:
30146 if (VECTOR_MODE_P (mode))
30147 op = safe_vector_operand (op, mode);
30148
30149 /* If we aren't optimizing, only allow one memory operand to be
30150 generated. */
30151 if (memory_operand (op, mode))
30152 num_memory++;
30153
30154 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30155
30156 if (optimize
30157 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30158 || num_memory > 1)
30159 op = force_reg (mode, op);
30160 }
30161
30162 args[i].op = op;
30163 args[i].mode = mode;
30164 }
30165
30166 switch (nargs)
30167 {
30168 case 1:
30169 pat = GEN_FCN (icode) (target, args[0].op);
30170 break;
30171
30172 case 2:
30173 if (tf_p)
30174 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30175 GEN_INT ((int)sub_code));
30176 else if (! comparison_p)
30177 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30178 else
30179 {
30180 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30181 args[0].op,
30182 args[1].op);
30183
30184 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30185 }
30186 break;
30187
30188 case 3:
30189 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30190 break;
30191
30192 case 4:
30193 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30194 break;
30195
30196 default:
30197 gcc_unreachable ();
30198 }
30199
30200 if (! pat)
30201 return 0;
30202
30203 emit_insn (pat);
30204 return target;
30205 }
30206
30207 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30208 insns with vec_merge. */
30209
30210 static rtx
30211 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30212 rtx target)
30213 {
30214 rtx pat;
30215 tree arg0 = CALL_EXPR_ARG (exp, 0);
30216 rtx op1, op0 = expand_normal (arg0);
30217 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30218 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30219
30220 if (optimize || !target
30221 || GET_MODE (target) != tmode
30222 || !insn_data[icode].operand[0].predicate (target, tmode))
30223 target = gen_reg_rtx (tmode);
30224
30225 if (VECTOR_MODE_P (mode0))
30226 op0 = safe_vector_operand (op0, mode0);
30227
30228 if ((optimize && !register_operand (op0, mode0))
30229 || !insn_data[icode].operand[1].predicate (op0, mode0))
30230 op0 = copy_to_mode_reg (mode0, op0);
30231
30232 op1 = op0;
30233 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30234 op1 = copy_to_mode_reg (mode0, op1);
30235
30236 pat = GEN_FCN (icode) (target, op0, op1);
30237 if (! pat)
30238 return 0;
30239 emit_insn (pat);
30240 return target;
30241 }
30242
30243 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30244
30245 static rtx
30246 ix86_expand_sse_compare (const struct builtin_description *d,
30247 tree exp, rtx target, bool swap)
30248 {
30249 rtx pat;
30250 tree arg0 = CALL_EXPR_ARG (exp, 0);
30251 tree arg1 = CALL_EXPR_ARG (exp, 1);
30252 rtx op0 = expand_normal (arg0);
30253 rtx op1 = expand_normal (arg1);
30254 rtx op2;
30255 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30256 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30257 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30258 enum rtx_code comparison = d->comparison;
30259
30260 if (VECTOR_MODE_P (mode0))
30261 op0 = safe_vector_operand (op0, mode0);
30262 if (VECTOR_MODE_P (mode1))
30263 op1 = safe_vector_operand (op1, mode1);
30264
30265 /* Swap operands if we have a comparison that isn't available in
30266 hardware. */
30267 if (swap)
30268 {
30269 rtx tmp = gen_reg_rtx (mode1);
30270 emit_move_insn (tmp, op1);
30271 op1 = op0;
30272 op0 = tmp;
30273 }
30274
30275 if (optimize || !target
30276 || GET_MODE (target) != tmode
30277 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30278 target = gen_reg_rtx (tmode);
30279
30280 if ((optimize && !register_operand (op0, mode0))
30281 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30282 op0 = copy_to_mode_reg (mode0, op0);
30283 if ((optimize && !register_operand (op1, mode1))
30284 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30285 op1 = copy_to_mode_reg (mode1, op1);
30286
30287 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30288 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30289 if (! pat)
30290 return 0;
30291 emit_insn (pat);
30292 return target;
30293 }
30294
30295 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30296
30297 static rtx
30298 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30299 rtx target)
30300 {
30301 rtx pat;
30302 tree arg0 = CALL_EXPR_ARG (exp, 0);
30303 tree arg1 = CALL_EXPR_ARG (exp, 1);
30304 rtx op0 = expand_normal (arg0);
30305 rtx op1 = expand_normal (arg1);
30306 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30307 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30308 enum rtx_code comparison = d->comparison;
30309
30310 if (VECTOR_MODE_P (mode0))
30311 op0 = safe_vector_operand (op0, mode0);
30312 if (VECTOR_MODE_P (mode1))
30313 op1 = safe_vector_operand (op1, mode1);
30314
30315 /* Swap operands if we have a comparison that isn't available in
30316 hardware. */
30317 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30318 {
30319 rtx tmp = op1;
30320 op1 = op0;
30321 op0 = tmp;
30322 }
30323
30324 target = gen_reg_rtx (SImode);
30325 emit_move_insn (target, const0_rtx);
30326 target = gen_rtx_SUBREG (QImode, target, 0);
30327
30328 if ((optimize && !register_operand (op0, mode0))
30329 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30330 op0 = copy_to_mode_reg (mode0, op0);
30331 if ((optimize && !register_operand (op1, mode1))
30332 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30333 op1 = copy_to_mode_reg (mode1, op1);
30334
30335 pat = GEN_FCN (d->icode) (op0, op1);
30336 if (! pat)
30337 return 0;
30338 emit_insn (pat);
30339 emit_insn (gen_rtx_SET (VOIDmode,
30340 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30341 gen_rtx_fmt_ee (comparison, QImode,
30342 SET_DEST (pat),
30343 const0_rtx)));
30344
30345 return SUBREG_REG (target);
30346 }
30347
30348 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30349
30350 static rtx
30351 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30352 rtx target)
30353 {
30354 rtx pat;
30355 tree arg0 = CALL_EXPR_ARG (exp, 0);
30356 rtx op1, op0 = expand_normal (arg0);
30357 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30358 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30359
30360 if (optimize || target == 0
30361 || GET_MODE (target) != tmode
30362 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30363 target = gen_reg_rtx (tmode);
30364
30365 if (VECTOR_MODE_P (mode0))
30366 op0 = safe_vector_operand (op0, mode0);
30367
30368 if ((optimize && !register_operand (op0, mode0))
30369 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30370 op0 = copy_to_mode_reg (mode0, op0);
30371
30372 op1 = GEN_INT (d->comparison);
30373
30374 pat = GEN_FCN (d->icode) (target, op0, op1);
30375 if (! pat)
30376 return 0;
30377 emit_insn (pat);
30378 return target;
30379 }
30380
30381 static rtx
30382 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30383 tree exp, rtx target)
30384 {
30385 rtx pat;
30386 tree arg0 = CALL_EXPR_ARG (exp, 0);
30387 tree arg1 = CALL_EXPR_ARG (exp, 1);
30388 rtx op0 = expand_normal (arg0);
30389 rtx op1 = expand_normal (arg1);
30390 rtx op2;
30391 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30392 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30393 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30394
30395 if (optimize || target == 0
30396 || GET_MODE (target) != tmode
30397 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30398 target = gen_reg_rtx (tmode);
30399
30400 op0 = safe_vector_operand (op0, mode0);
30401 op1 = safe_vector_operand (op1, mode1);
30402
30403 if ((optimize && !register_operand (op0, mode0))
30404 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30405 op0 = copy_to_mode_reg (mode0, op0);
30406 if ((optimize && !register_operand (op1, mode1))
30407 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30408 op1 = copy_to_mode_reg (mode1, op1);
30409
30410 op2 = GEN_INT (d->comparison);
30411
30412 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30413 if (! pat)
30414 return 0;
30415 emit_insn (pat);
30416 return target;
30417 }
30418
30419 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30420
30421 static rtx
30422 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30423 rtx target)
30424 {
30425 rtx pat;
30426 tree arg0 = CALL_EXPR_ARG (exp, 0);
30427 tree arg1 = CALL_EXPR_ARG (exp, 1);
30428 rtx op0 = expand_normal (arg0);
30429 rtx op1 = expand_normal (arg1);
30430 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30431 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30432 enum rtx_code comparison = d->comparison;
30433
30434 if (VECTOR_MODE_P (mode0))
30435 op0 = safe_vector_operand (op0, mode0);
30436 if (VECTOR_MODE_P (mode1))
30437 op1 = safe_vector_operand (op1, mode1);
30438
30439 target = gen_reg_rtx (SImode);
30440 emit_move_insn (target, const0_rtx);
30441 target = gen_rtx_SUBREG (QImode, target, 0);
30442
30443 if ((optimize && !register_operand (op0, mode0))
30444 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30445 op0 = copy_to_mode_reg (mode0, op0);
30446 if ((optimize && !register_operand (op1, mode1))
30447 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30448 op1 = copy_to_mode_reg (mode1, op1);
30449
30450 pat = GEN_FCN (d->icode) (op0, op1);
30451 if (! pat)
30452 return 0;
30453 emit_insn (pat);
30454 emit_insn (gen_rtx_SET (VOIDmode,
30455 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30456 gen_rtx_fmt_ee (comparison, QImode,
30457 SET_DEST (pat),
30458 const0_rtx)));
30459
30460 return SUBREG_REG (target);
30461 }
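
/* Illustrative: __builtin_ia32_ptestz128 (a, b), i.e. _mm_testz_si128, goes
   through this helper and expands to roughly

     ptest  %xmm1, %xmm0
     sete   %al

   (a sketch; the actual output depends on register allocation), matching
   the STRICT_LOW_PART/flags sequence emitted above.  */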
30462
30463 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30464
30465 static rtx
30466 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30467 tree exp, rtx target)
30468 {
30469 rtx pat;
30470 tree arg0 = CALL_EXPR_ARG (exp, 0);
30471 tree arg1 = CALL_EXPR_ARG (exp, 1);
30472 tree arg2 = CALL_EXPR_ARG (exp, 2);
30473 tree arg3 = CALL_EXPR_ARG (exp, 3);
30474 tree arg4 = CALL_EXPR_ARG (exp, 4);
30475 rtx scratch0, scratch1;
30476 rtx op0 = expand_normal (arg0);
30477 rtx op1 = expand_normal (arg1);
30478 rtx op2 = expand_normal (arg2);
30479 rtx op3 = expand_normal (arg3);
30480 rtx op4 = expand_normal (arg4);
30481 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30482
30483 tmode0 = insn_data[d->icode].operand[0].mode;
30484 tmode1 = insn_data[d->icode].operand[1].mode;
30485 modev2 = insn_data[d->icode].operand[2].mode;
30486 modei3 = insn_data[d->icode].operand[3].mode;
30487 modev4 = insn_data[d->icode].operand[4].mode;
30488 modei5 = insn_data[d->icode].operand[5].mode;
30489 modeimm = insn_data[d->icode].operand[6].mode;
30490
30491 if (VECTOR_MODE_P (modev2))
30492 op0 = safe_vector_operand (op0, modev2);
30493 if (VECTOR_MODE_P (modev4))
30494 op2 = safe_vector_operand (op2, modev4);
30495
30496 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30497 op0 = copy_to_mode_reg (modev2, op0);
30498 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30499 op1 = copy_to_mode_reg (modei3, op1);
30500 if ((optimize && !register_operand (op2, modev4))
30501 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30502 op2 = copy_to_mode_reg (modev4, op2);
30503 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30504 op3 = copy_to_mode_reg (modei5, op3);
30505
30506 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30507 {
30508 error ("the fifth argument must be an 8-bit immediate");
30509 return const0_rtx;
30510 }
30511
30512 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30513 {
30514 if (optimize || !target
30515 || GET_MODE (target) != tmode0
30516 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30517 target = gen_reg_rtx (tmode0);
30518
30519 scratch1 = gen_reg_rtx (tmode1);
30520
30521 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30522 }
30523 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30524 {
30525 if (optimize || !target
30526 || GET_MODE (target) != tmode1
30527 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30528 target = gen_reg_rtx (tmode1);
30529
30530 scratch0 = gen_reg_rtx (tmode0);
30531
30532 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30533 }
30534 else
30535 {
30536 gcc_assert (d->flag);
30537
30538 scratch0 = gen_reg_rtx (tmode0);
30539 scratch1 = gen_reg_rtx (tmode1);
30540
30541 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30542 }
30543
30544 if (! pat)
30545 return 0;
30546
30547 emit_insn (pat);
30548
30549 if (d->flag)
30550 {
30551 target = gen_reg_rtx (SImode);
30552 emit_move_insn (target, const0_rtx);
30553 target = gen_rtx_SUBREG (QImode, target, 0);
30554
30555 emit_insn
30556 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30557 gen_rtx_fmt_ee (EQ, QImode,
30558 gen_rtx_REG ((enum machine_mode) d->flag,
30559 FLAGS_REG),
30560 const0_rtx)));
30561 return SUBREG_REG (target);
30562 }
30563 else
30564 return target;
30565 }
30566
30567
30568 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30569
30570 static rtx
30571 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30572 tree exp, rtx target)
30573 {
30574 rtx pat;
30575 tree arg0 = CALL_EXPR_ARG (exp, 0);
30576 tree arg1 = CALL_EXPR_ARG (exp, 1);
30577 tree arg2 = CALL_EXPR_ARG (exp, 2);
30578 rtx scratch0, scratch1;
30579 rtx op0 = expand_normal (arg0);
30580 rtx op1 = expand_normal (arg1);
30581 rtx op2 = expand_normal (arg2);
30582 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30583
30584 tmode0 = insn_data[d->icode].operand[0].mode;
30585 tmode1 = insn_data[d->icode].operand[1].mode;
30586 modev2 = insn_data[d->icode].operand[2].mode;
30587 modev3 = insn_data[d->icode].operand[3].mode;
30588 modeimm = insn_data[d->icode].operand[4].mode;
30589
30590 if (VECTOR_MODE_P (modev2))
30591 op0 = safe_vector_operand (op0, modev2);
30592 if (VECTOR_MODE_P (modev3))
30593 op1 = safe_vector_operand (op1, modev3);
30594
30595 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30596 op0 = copy_to_mode_reg (modev2, op0);
30597 if ((optimize && !register_operand (op1, modev3))
30598 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30599 op1 = copy_to_mode_reg (modev3, op1);
30600
30601 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30602 {
30603 error ("the third argument must be an 8-bit immediate");
30604 return const0_rtx;
30605 }
30606
30607 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30608 {
30609 if (optimize || !target
30610 || GET_MODE (target) != tmode0
30611 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30612 target = gen_reg_rtx (tmode0);
30613
30614 scratch1 = gen_reg_rtx (tmode1);
30615
30616 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30617 }
30618 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30619 {
30620 if (optimize || !target
30621 || GET_MODE (target) != tmode1
30622 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30623 target = gen_reg_rtx (tmode1);
30624
30625 scratch0 = gen_reg_rtx (tmode0);
30626
30627 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30628 }
30629 else
30630 {
30631 gcc_assert (d->flag);
30632
30633 scratch0 = gen_reg_rtx (tmode0);
30634 scratch1 = gen_reg_rtx (tmode1);
30635
30636 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30637 }
30638
30639 if (! pat)
30640 return 0;
30641
30642 emit_insn (pat);
30643
30644 if (d->flag)
30645 {
30646 target = gen_reg_rtx (SImode);
30647 emit_move_insn (target, const0_rtx);
30648 target = gen_rtx_SUBREG (QImode, target, 0);
30649
30650 emit_insn
30651 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30652 gen_rtx_fmt_ee (EQ, QImode,
30653 gen_rtx_REG ((enum machine_mode) d->flag,
30654 FLAGS_REG),
30655 const0_rtx)));
30656 return SUBREG_REG (target);
30657 }
30658 else
30659 return target;
30660 }
30661
30662 /* Subroutine of ix86_expand_builtin to take care of insns with
30663 variable number of operands. */
30664
30665 static rtx
30666 ix86_expand_args_builtin (const struct builtin_description *d,
30667 tree exp, rtx target)
30668 {
30669 rtx pat, real_target;
30670 unsigned int i, nargs;
30671 unsigned int nargs_constant = 0;
30672 int num_memory = 0;
30673 struct
30674 {
30675 rtx op;
30676 enum machine_mode mode;
30677 } args[4];
30678 bool last_arg_count = false;
30679 enum insn_code icode = d->icode;
30680 const struct insn_data_d *insn_p = &insn_data[icode];
30681 enum machine_mode tmode = insn_p->operand[0].mode;
30682 enum machine_mode rmode = VOIDmode;
30683 bool swap = false;
30684 enum rtx_code comparison = d->comparison;
30685
30686 switch ((enum ix86_builtin_func_type) d->flag)
30687 {
30688 case V2DF_FTYPE_V2DF_ROUND:
30689 case V4DF_FTYPE_V4DF_ROUND:
30690 case V4SF_FTYPE_V4SF_ROUND:
30691 case V8SF_FTYPE_V8SF_ROUND:
30692 case V4SI_FTYPE_V4SF_ROUND:
30693 case V8SI_FTYPE_V8SF_ROUND:
30694 return ix86_expand_sse_round (d, exp, target);
30695 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30696 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30697 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30698 case INT_FTYPE_V8SF_V8SF_PTEST:
30699 case INT_FTYPE_V4DI_V4DI_PTEST:
30700 case INT_FTYPE_V4DF_V4DF_PTEST:
30701 case INT_FTYPE_V4SF_V4SF_PTEST:
30702 case INT_FTYPE_V2DI_V2DI_PTEST:
30703 case INT_FTYPE_V2DF_V2DF_PTEST:
30704 return ix86_expand_sse_ptest (d, exp, target);
30705 case FLOAT128_FTYPE_FLOAT128:
30706 case FLOAT_FTYPE_FLOAT:
30707 case INT_FTYPE_INT:
30708 case UINT64_FTYPE_INT:
30709 case UINT16_FTYPE_UINT16:
30710 case INT64_FTYPE_INT64:
30711 case INT64_FTYPE_V4SF:
30712 case INT64_FTYPE_V2DF:
30713 case INT_FTYPE_V16QI:
30714 case INT_FTYPE_V8QI:
30715 case INT_FTYPE_V8SF:
30716 case INT_FTYPE_V4DF:
30717 case INT_FTYPE_V4SF:
30718 case INT_FTYPE_V2DF:
30719 case INT_FTYPE_V32QI:
30720 case V16QI_FTYPE_V16QI:
30721 case V8SI_FTYPE_V8SF:
30722 case V8SI_FTYPE_V4SI:
30723 case V8HI_FTYPE_V8HI:
30724 case V8HI_FTYPE_V16QI:
30725 case V8QI_FTYPE_V8QI:
30726 case V8SF_FTYPE_V8SF:
30727 case V8SF_FTYPE_V8SI:
30728 case V8SF_FTYPE_V4SF:
30729 case V8SF_FTYPE_V8HI:
30730 case V4SI_FTYPE_V4SI:
30731 case V4SI_FTYPE_V16QI:
30732 case V4SI_FTYPE_V4SF:
30733 case V4SI_FTYPE_V8SI:
30734 case V4SI_FTYPE_V8HI:
30735 case V4SI_FTYPE_V4DF:
30736 case V4SI_FTYPE_V2DF:
30737 case V4HI_FTYPE_V4HI:
30738 case V4DF_FTYPE_V4DF:
30739 case V4DF_FTYPE_V4SI:
30740 case V4DF_FTYPE_V4SF:
30741 case V4DF_FTYPE_V2DF:
30742 case V4SF_FTYPE_V4SF:
30743 case V4SF_FTYPE_V4SI:
30744 case V4SF_FTYPE_V8SF:
30745 case V4SF_FTYPE_V4DF:
30746 case V4SF_FTYPE_V8HI:
30747 case V4SF_FTYPE_V2DF:
30748 case V2DI_FTYPE_V2DI:
30749 case V2DI_FTYPE_V16QI:
30750 case V2DI_FTYPE_V8HI:
30751 case V2DI_FTYPE_V4SI:
30752 case V2DF_FTYPE_V2DF:
30753 case V2DF_FTYPE_V4SI:
30754 case V2DF_FTYPE_V4DF:
30755 case V2DF_FTYPE_V4SF:
30756 case V2DF_FTYPE_V2SI:
30757 case V2SI_FTYPE_V2SI:
30758 case V2SI_FTYPE_V4SF:
30759 case V2SI_FTYPE_V2SF:
30760 case V2SI_FTYPE_V2DF:
30761 case V2SF_FTYPE_V2SF:
30762 case V2SF_FTYPE_V2SI:
30763 case V32QI_FTYPE_V32QI:
30764 case V32QI_FTYPE_V16QI:
30765 case V16HI_FTYPE_V16HI:
30766 case V16HI_FTYPE_V8HI:
30767 case V8SI_FTYPE_V8SI:
30768 case V16HI_FTYPE_V16QI:
30769 case V8SI_FTYPE_V16QI:
30770 case V4DI_FTYPE_V16QI:
30771 case V8SI_FTYPE_V8HI:
30772 case V4DI_FTYPE_V8HI:
30773 case V4DI_FTYPE_V4SI:
30774 case V4DI_FTYPE_V2DI:
30775 nargs = 1;
30776 break;
30777 case V4SF_FTYPE_V4SF_VEC_MERGE:
30778 case V2DF_FTYPE_V2DF_VEC_MERGE:
30779 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30780 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30781 case V16QI_FTYPE_V16QI_V16QI:
30782 case V16QI_FTYPE_V8HI_V8HI:
30783 case V8QI_FTYPE_V8QI_V8QI:
30784 case V8QI_FTYPE_V4HI_V4HI:
30785 case V8HI_FTYPE_V8HI_V8HI:
30786 case V8HI_FTYPE_V16QI_V16QI:
30787 case V8HI_FTYPE_V4SI_V4SI:
30788 case V8SF_FTYPE_V8SF_V8SF:
30789 case V8SF_FTYPE_V8SF_V8SI:
30790 case V4SI_FTYPE_V4SI_V4SI:
30791 case V4SI_FTYPE_V8HI_V8HI:
30792 case V4SI_FTYPE_V4SF_V4SF:
30793 case V4SI_FTYPE_V2DF_V2DF:
30794 case V4HI_FTYPE_V4HI_V4HI:
30795 case V4HI_FTYPE_V8QI_V8QI:
30796 case V4HI_FTYPE_V2SI_V2SI:
30797 case V4DF_FTYPE_V4DF_V4DF:
30798 case V4DF_FTYPE_V4DF_V4DI:
30799 case V4SF_FTYPE_V4SF_V4SF:
30800 case V4SF_FTYPE_V4SF_V4SI:
30801 case V4SF_FTYPE_V4SF_V2SI:
30802 case V4SF_FTYPE_V4SF_V2DF:
30803 case V4SF_FTYPE_V4SF_DI:
30804 case V4SF_FTYPE_V4SF_SI:
30805 case V2DI_FTYPE_V2DI_V2DI:
30806 case V2DI_FTYPE_V16QI_V16QI:
30807 case V2DI_FTYPE_V4SI_V4SI:
30808 case V2UDI_FTYPE_V4USI_V4USI:
30809 case V2DI_FTYPE_V2DI_V16QI:
30810 case V2DI_FTYPE_V2DF_V2DF:
30811 case V2SI_FTYPE_V2SI_V2SI:
30812 case V2SI_FTYPE_V4HI_V4HI:
30813 case V2SI_FTYPE_V2SF_V2SF:
30814 case V2DF_FTYPE_V2DF_V2DF:
30815 case V2DF_FTYPE_V2DF_V4SF:
30816 case V2DF_FTYPE_V2DF_V2DI:
30817 case V2DF_FTYPE_V2DF_DI:
30818 case V2DF_FTYPE_V2DF_SI:
30819 case V2SF_FTYPE_V2SF_V2SF:
30820 case V1DI_FTYPE_V1DI_V1DI:
30821 case V1DI_FTYPE_V8QI_V8QI:
30822 case V1DI_FTYPE_V2SI_V2SI:
30823 case V32QI_FTYPE_V16HI_V16HI:
30824 case V16HI_FTYPE_V8SI_V8SI:
30825 case V32QI_FTYPE_V32QI_V32QI:
30826 case V16HI_FTYPE_V32QI_V32QI:
30827 case V16HI_FTYPE_V16HI_V16HI:
30828 case V8SI_FTYPE_V4DF_V4DF:
30829 case V8SI_FTYPE_V8SI_V8SI:
30830 case V8SI_FTYPE_V16HI_V16HI:
30831 case V4DI_FTYPE_V4DI_V4DI:
30832 case V4DI_FTYPE_V8SI_V8SI:
30833 case V4UDI_FTYPE_V8USI_V8USI:
30834 if (comparison == UNKNOWN)
30835 return ix86_expand_binop_builtin (icode, exp, target);
30836 nargs = 2;
30837 break;
30838 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30839 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30840 gcc_assert (comparison != UNKNOWN);
30841 nargs = 2;
30842 swap = true;
30843 break;
30844 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30845 case V16HI_FTYPE_V16HI_SI_COUNT:
30846 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30847 case V8SI_FTYPE_V8SI_SI_COUNT:
30848 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30849 case V4DI_FTYPE_V4DI_INT_COUNT:
30850 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30851 case V8HI_FTYPE_V8HI_SI_COUNT:
30852 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30853 case V4SI_FTYPE_V4SI_SI_COUNT:
30854 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30855 case V4HI_FTYPE_V4HI_SI_COUNT:
30856 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30857 case V2DI_FTYPE_V2DI_SI_COUNT:
30858 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30859 case V2SI_FTYPE_V2SI_SI_COUNT:
30860 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30861 case V1DI_FTYPE_V1DI_SI_COUNT:
30862 nargs = 2;
30863 last_arg_count = true;
30864 break;
30865 case UINT64_FTYPE_UINT64_UINT64:
30866 case UINT_FTYPE_UINT_UINT:
30867 case UINT_FTYPE_UINT_USHORT:
30868 case UINT_FTYPE_UINT_UCHAR:
30869 case UINT16_FTYPE_UINT16_INT:
30870 case UINT8_FTYPE_UINT8_INT:
30871 nargs = 2;
30872 break;
30873 case V2DI_FTYPE_V2DI_INT_CONVERT:
30874 nargs = 2;
30875 rmode = V1TImode;
30876 nargs_constant = 1;
30877 break;
30878 case V4DI_FTYPE_V4DI_INT_CONVERT:
30879 nargs = 2;
30880 rmode = V2TImode;
30881 nargs_constant = 1;
30882 break;
30883 case V8HI_FTYPE_V8HI_INT:
30884 case V8HI_FTYPE_V8SF_INT:
30885 case V8HI_FTYPE_V4SF_INT:
30886 case V8SF_FTYPE_V8SF_INT:
30887 case V4SI_FTYPE_V4SI_INT:
30888 case V4SI_FTYPE_V8SI_INT:
30889 case V4HI_FTYPE_V4HI_INT:
30890 case V4DF_FTYPE_V4DF_INT:
30891 case V4SF_FTYPE_V4SF_INT:
30892 case V4SF_FTYPE_V8SF_INT:
30893 case V2DI_FTYPE_V2DI_INT:
30894 case V2DF_FTYPE_V2DF_INT:
30895 case V2DF_FTYPE_V4DF_INT:
30896 case V16HI_FTYPE_V16HI_INT:
30897 case V8SI_FTYPE_V8SI_INT:
30898 case V4DI_FTYPE_V4DI_INT:
30899 case V2DI_FTYPE_V4DI_INT:
30900 nargs = 2;
30901 nargs_constant = 1;
30902 break;
30903 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30904 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30905 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30906 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30907 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30908 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30909 nargs = 3;
30910 break;
30911 case V32QI_FTYPE_V32QI_V32QI_INT:
30912 case V16HI_FTYPE_V16HI_V16HI_INT:
30913 case V16QI_FTYPE_V16QI_V16QI_INT:
30914 case V4DI_FTYPE_V4DI_V4DI_INT:
30915 case V8HI_FTYPE_V8HI_V8HI_INT:
30916 case V8SI_FTYPE_V8SI_V8SI_INT:
30917 case V8SI_FTYPE_V8SI_V4SI_INT:
30918 case V8SF_FTYPE_V8SF_V8SF_INT:
30919 case V8SF_FTYPE_V8SF_V4SF_INT:
30920 case V4SI_FTYPE_V4SI_V4SI_INT:
30921 case V4DF_FTYPE_V4DF_V4DF_INT:
30922 case V4DF_FTYPE_V4DF_V2DF_INT:
30923 case V4SF_FTYPE_V4SF_V4SF_INT:
30924 case V2DI_FTYPE_V2DI_V2DI_INT:
30925 case V4DI_FTYPE_V4DI_V2DI_INT:
30926 case V2DF_FTYPE_V2DF_V2DF_INT:
30927 nargs = 3;
30928 nargs_constant = 1;
30929 break;
30930 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
30931 nargs = 3;
30932 rmode = V4DImode;
30933 nargs_constant = 1;
30934 break;
30935 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
30936 nargs = 3;
30937 rmode = V2DImode;
30938 nargs_constant = 1;
30939 break;
30940 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
30941 nargs = 3;
30942 rmode = DImode;
30943 nargs_constant = 1;
30944 break;
30945 case V2DI_FTYPE_V2DI_UINT_UINT:
30946 nargs = 3;
30947 nargs_constant = 2;
30948 break;
30949 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
30950 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
30951 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
30952 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
30953 nargs = 4;
30954 nargs_constant = 1;
30955 break;
30956 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
30957 nargs = 4;
30958 nargs_constant = 2;
30959 break;
30960 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
30961 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
30962 nargs = 4;
30963 break;
30964 default:
30965 gcc_unreachable ();
30966 }
30967
30968 gcc_assert (nargs <= ARRAY_SIZE (args));
30969
30970 if (comparison != UNKNOWN)
30971 {
30972 gcc_assert (nargs == 2);
30973 return ix86_expand_sse_compare (d, exp, target, swap);
30974 }
30975
30976 if (rmode == VOIDmode || rmode == tmode)
30977 {
30978 if (optimize
30979 || target == 0
30980 || GET_MODE (target) != tmode
30981 || !insn_p->operand[0].predicate (target, tmode))
30982 target = gen_reg_rtx (tmode);
30983 real_target = target;
30984 }
30985 else
30986 {
30987 target = gen_reg_rtx (rmode);
30988 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30989 }
30990
30991 for (i = 0; i < nargs; i++)
30992 {
30993 tree arg = CALL_EXPR_ARG (exp, i);
30994 rtx op = expand_normal (arg);
30995 enum machine_mode mode = insn_p->operand[i + 1].mode;
30996 bool match = insn_p->operand[i + 1].predicate (op, mode);
30997
30998 if (last_arg_count && (i + 1) == nargs)
30999 {
31000 /* SIMD shift insns take either an 8-bit immediate or a
31001 register as the count operand.  But the builtin functions take an
31002 int as the count.  If the count doesn't match, we put it in a register.  */
31003 if (!match)
31004 {
31005 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31006 if (!insn_p->operand[i + 1].predicate (op, mode))
31007 op = copy_to_reg (op);
31008 }
31009 }
31010 else if ((nargs - i) <= nargs_constant)
31011 {
31012 if (!match)
31013 switch (icode)
31014 {
31015 case CODE_FOR_avx2_inserti128:
31016 case CODE_FOR_avx2_extracti128:
31018 error ("the last argument must be a 1-bit immediate");
31018 return const0_rtx;
31019
31020 case CODE_FOR_sse4_1_roundsd:
31021 case CODE_FOR_sse4_1_roundss:
31022
31023 case CODE_FOR_sse4_1_roundpd:
31024 case CODE_FOR_sse4_1_roundps:
31025 case CODE_FOR_avx_roundpd256:
31026 case CODE_FOR_avx_roundps256:
31027
31028 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31029 case CODE_FOR_sse4_1_roundps_sfix:
31030 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31031 case CODE_FOR_avx_roundps_sfix256:
31032
31033 case CODE_FOR_sse4_1_blendps:
31034 case CODE_FOR_avx_blendpd256:
31035 case CODE_FOR_avx_vpermilv4df:
31036 error ("the last argument must be a 4-bit immediate");
31037 return const0_rtx;
31038
31039 case CODE_FOR_sse4_1_blendpd:
31040 case CODE_FOR_avx_vpermilv2df:
31041 case CODE_FOR_xop_vpermil2v2df3:
31042 case CODE_FOR_xop_vpermil2v4sf3:
31043 case CODE_FOR_xop_vpermil2v4df3:
31044 case CODE_FOR_xop_vpermil2v8sf3:
31045 error ("the last argument must be a 2-bit immediate");
31046 return const0_rtx;
31047
31048 case CODE_FOR_avx_vextractf128v4df:
31049 case CODE_FOR_avx_vextractf128v8sf:
31050 case CODE_FOR_avx_vextractf128v8si:
31051 case CODE_FOR_avx_vinsertf128v4df:
31052 case CODE_FOR_avx_vinsertf128v8sf:
31053 case CODE_FOR_avx_vinsertf128v8si:
31054 error ("the last argument must be a 1-bit immediate");
31055 return const0_rtx;
31056
31057 case CODE_FOR_avx_vmcmpv2df3:
31058 case CODE_FOR_avx_vmcmpv4sf3:
31059 case CODE_FOR_avx_cmpv2df3:
31060 case CODE_FOR_avx_cmpv4sf3:
31061 case CODE_FOR_avx_cmpv4df3:
31062 case CODE_FOR_avx_cmpv8sf3:
31063 error ("the last argument must be a 5-bit immediate");
31064 return const0_rtx;
31065
31066 default:
31067 switch (nargs_constant)
31068 {
31069 case 2:
31070 if ((nargs - i) == nargs_constant)
31071 {
31072 error ("the next to last argument must be an 8-bit immediate");
31073 break;
31074 }
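/* FALLTHRU */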
31075 case 1:
31076 error ("the last argument must be an 8-bit immediate");
31077 break;
31078 default:
31079 gcc_unreachable ();
31080 }
31081 return const0_rtx;
31082 }
31083 }
31084 else
31085 {
31086 if (VECTOR_MODE_P (mode))
31087 op = safe_vector_operand (op, mode);
31088
31089 /* If we aren't optimizing, only allow one memory operand to
31090 be generated. */
31091 if (memory_operand (op, mode))
31092 num_memory++;
31093
31094 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31095 {
31096 if (optimize || !match || num_memory > 1)
31097 op = copy_to_mode_reg (mode, op);
31098 }
31099 else
31100 {
31101 op = copy_to_reg (op);
31102 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31103 }
31104 }
31105
31106 args[i].op = op;
31107 args[i].mode = mode;
31108 }
31109
31110 switch (nargs)
31111 {
31112 case 1:
31113 pat = GEN_FCN (icode) (real_target, args[0].op);
31114 break;
31115 case 2:
31116 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31117 break;
31118 case 3:
31119 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31120 args[2].op);
31121 break;
31122 case 4:
31123 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31124 args[2].op, args[3].op);
31125 break;
31126 default:
31127 gcc_unreachable ();
31128 }
31129
31130 if (! pat)
31131 return 0;
31132
31133 emit_insn (pat);
31134 return target;
31135 }
31136
31137 /* Subroutine of ix86_expand_builtin to take care of special insns
31138 with a variable number of operands. */
31139
31140 static rtx
31141 ix86_expand_special_args_builtin (const struct builtin_description *d,
31142 tree exp, rtx target)
31143 {
31144 tree arg;
31145 rtx pat, op;
31146 unsigned int i, nargs, arg_adjust, memory;
31147 struct
31148 {
31149 rtx op;
31150 enum machine_mode mode;
31151 } args[3];
31152 enum insn_code icode = d->icode;
31153 bool last_arg_constant = false;
31154 const struct insn_data_d *insn_p = &insn_data[icode];
31155 enum machine_mode tmode = insn_p->operand[0].mode;
31156 enum { load, store } klass;
31157
31158 switch ((enum ix86_builtin_func_type) d->flag)
31159 {
31160 case VOID_FTYPE_VOID:
31161 emit_insn (GEN_FCN (icode) (target));
31162 return 0;
31163 case VOID_FTYPE_UINT64:
31164 case VOID_FTYPE_UNSIGNED:
31165 nargs = 0;
31166 klass = store;
31167 memory = 0;
31168 break;
31169
31170 case INT_FTYPE_VOID:
31171 case UINT64_FTYPE_VOID:
31172 case UNSIGNED_FTYPE_VOID:
31173 nargs = 0;
31174 klass = load;
31175 memory = 0;
31176 break;
31177 case UINT64_FTYPE_PUNSIGNED:
31178 case V2DI_FTYPE_PV2DI:
31179 case V4DI_FTYPE_PV4DI:
31180 case V32QI_FTYPE_PCCHAR:
31181 case V16QI_FTYPE_PCCHAR:
31182 case V8SF_FTYPE_PCV4SF:
31183 case V8SF_FTYPE_PCFLOAT:
31184 case V4SF_FTYPE_PCFLOAT:
31185 case V4DF_FTYPE_PCV2DF:
31186 case V4DF_FTYPE_PCDOUBLE:
31187 case V2DF_FTYPE_PCDOUBLE:
31188 case VOID_FTYPE_PVOID:
31189 nargs = 1;
31190 klass = load;
31191 memory = 0;
31192 break;
31193 case VOID_FTYPE_PV2SF_V4SF:
31194 case VOID_FTYPE_PV4DI_V4DI:
31195 case VOID_FTYPE_PV2DI_V2DI:
31196 case VOID_FTYPE_PCHAR_V32QI:
31197 case VOID_FTYPE_PCHAR_V16QI:
31198 case VOID_FTYPE_PFLOAT_V8SF:
31199 case VOID_FTYPE_PFLOAT_V4SF:
31200 case VOID_FTYPE_PDOUBLE_V4DF:
31201 case VOID_FTYPE_PDOUBLE_V2DF:
31202 case VOID_FTYPE_PLONGLONG_LONGLONG:
31203 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31204 case VOID_FTYPE_PINT_INT:
31205 nargs = 1;
31206 klass = store;
31207 /* Reserve memory operand for target. */
31208 memory = ARRAY_SIZE (args);
31209 break;
31210 case V4SF_FTYPE_V4SF_PCV2SF:
31211 case V2DF_FTYPE_V2DF_PCDOUBLE:
31212 nargs = 2;
31213 klass = load;
31214 memory = 1;
31215 break;
31216 case V8SF_FTYPE_PCV8SF_V8SI:
31217 case V4DF_FTYPE_PCV4DF_V4DI:
31218 case V4SF_FTYPE_PCV4SF_V4SI:
31219 case V2DF_FTYPE_PCV2DF_V2DI:
31220 case V8SI_FTYPE_PCV8SI_V8SI:
31221 case V4DI_FTYPE_PCV4DI_V4DI:
31222 case V4SI_FTYPE_PCV4SI_V4SI:
31223 case V2DI_FTYPE_PCV2DI_V2DI:
31224 nargs = 2;
31225 klass = load;
31226 memory = 0;
31227 break;
31228 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31229 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31230 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31231 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31232 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31233 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31234 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31235 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31236 nargs = 2;
31237 klass = store;
31238 /* Reserve memory operand for target. */
31239 memory = ARRAY_SIZE (args);
31240 break;
31241 case VOID_FTYPE_UINT_UINT_UINT:
31242 case VOID_FTYPE_UINT64_UINT_UINT:
31243 case UCHAR_FTYPE_UINT_UINT_UINT:
31244 case UCHAR_FTYPE_UINT64_UINT_UINT:
31245 nargs = 3;
31246 klass = load;
31247 memory = ARRAY_SIZE (args);
31248 last_arg_constant = true;
31249 break;
31250 default:
31251 gcc_unreachable ();
31252 }
31253
31254 gcc_assert (nargs <= ARRAY_SIZE (args));
31255
31256 if (klass == store)
31257 {
31258 arg = CALL_EXPR_ARG (exp, 0);
31259 op = expand_normal (arg);
31260 gcc_assert (target == 0);
31261 if (memory)
31262 {
31263 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31264 target = gen_rtx_MEM (tmode, op);
31265 }
31266 else
31267 target = force_reg (tmode, op);
31268 arg_adjust = 1;
31269 }
31270 else
31271 {
31272 arg_adjust = 0;
31273 if (optimize
31274 || target == 0
31275 || !register_operand (target, tmode)
31276 || GET_MODE (target) != tmode)
31277 target = gen_reg_rtx (tmode);
31278 }
31279
31280 for (i = 0; i < nargs; i++)
31281 {
31282 enum machine_mode mode = insn_p->operand[i + 1].mode;
31283 bool match;
31284
31285 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31286 op = expand_normal (arg);
31287 match = insn_p->operand[i + 1].predicate (op, mode);
31288
31289 if (last_arg_constant && (i + 1) == nargs)
31290 {
31291 if (!match)
31292 {
31293 if (icode == CODE_FOR_lwp_lwpvalsi3
31294 || icode == CODE_FOR_lwp_lwpinssi3
31295 || icode == CODE_FOR_lwp_lwpvaldi3
31296 || icode == CODE_FOR_lwp_lwpinsdi3)
31297 error ("the last argument must be a 32-bit immediate");
31298 else
31299 error ("the last argument must be an 8-bit immediate");
31300 return const0_rtx;
31301 }
31302 }
31303 else
31304 {
31305 if (i == memory)
31306 {
31307 /* This must be the memory operand. */
31308 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31309 op = gen_rtx_MEM (mode, op);
31310 gcc_assert (GET_MODE (op) == mode
31311 || GET_MODE (op) == VOIDmode);
31312 }
31313 else
31314 {
31315 /* This must be a register. */
31316 if (VECTOR_MODE_P (mode))
31317 op = safe_vector_operand (op, mode);
31318
31319 gcc_assert (GET_MODE (op) == mode
31320 || GET_MODE (op) == VOIDmode);
31321 op = copy_to_mode_reg (mode, op);
31322 }
31323 }
31324
31325 args[i].op = op;
31326 args[i].mode = mode;
31327 }
31328
31329 switch (nargs)
31330 {
31331 case 0:
31332 pat = GEN_FCN (icode) (target);
31333 break;
31334 case 1:
31335 pat = GEN_FCN (icode) (target, args[0].op);
31336 break;
31337 case 2:
31338 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31339 break;
31340 case 3:
31341 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31342 break;
31343 default:
31344 gcc_unreachable ();
31345 }
31346
31347 if (! pat)
31348 return 0;
31349 emit_insn (pat);
31350 return klass == store ? 0 : target;
31351 }
31352
31353 /* Return the integer constant in ARG. Constrain it to be in the range
31354 of the subparts of VEC_TYPE; issue an error if not. */
31355
31356 static int
31357 get_element_number (tree vec_type, tree arg)
31358 {
31359 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31360
31361 if (!host_integerp (arg, 1)
31362 || (elt = tree_low_cst (arg, 1), elt > max))
31363 {
31364 error ("selector must be an integer constant in the range 0..%wi", max);
31365 return 0;
31366 }
31367
31368 return elt;
31369 }
31370
31371 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31372 ix86_expand_vector_init. We DO have language-level syntax for this, in
31373 the form of (type){ init-list }. Except that since we can't place emms
31374 instructions from inside the compiler, we can't allow the use of MMX
31375 registers unless the user explicitly asks for it. So we do *not* define
31376 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31377 we have builtins invoked by mmintrin.h that give us license to emit
31378 these sorts of instructions. */
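/* As a purely illustrative sketch (not part of this file), mmintrin.h
   is expected to wrap these builtins roughly like

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   so that a call such as _mm_set_pi32 (1, 2) reaches
   ix86_expand_vec_init_builtin via IX86_BUILTIN_VEC_INIT_V2SI below;
   the exact header text and attributes may differ.  */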
31379
31380 static rtx
31381 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31382 {
31383 enum machine_mode tmode = TYPE_MODE (type);
31384 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31385 int i, n_elt = GET_MODE_NUNITS (tmode);
31386 rtvec v = rtvec_alloc (n_elt);
31387
31388 gcc_assert (VECTOR_MODE_P (tmode));
31389 gcc_assert (call_expr_nargs (exp) == n_elt);
31390
31391 for (i = 0; i < n_elt; ++i)
31392 {
31393 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31394 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31395 }
31396
31397 if (!target || !register_operand (target, tmode))
31398 target = gen_reg_rtx (tmode);
31399
31400 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31401 return target;
31402 }
31403
31404 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31405 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31406 had a language-level syntax for referencing vector elements. */
31407
31408 static rtx
31409 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31410 {
31411 enum machine_mode tmode, mode0;
31412 tree arg0, arg1;
31413 int elt;
31414 rtx op0;
31415
31416 arg0 = CALL_EXPR_ARG (exp, 0);
31417 arg1 = CALL_EXPR_ARG (exp, 1);
31418
31419 op0 = expand_normal (arg0);
31420 elt = get_element_number (TREE_TYPE (arg0), arg1);
31421
31422 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31423 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31424 gcc_assert (VECTOR_MODE_P (mode0));
31425
31426 op0 = force_reg (mode0, op0);
31427
31428 if (optimize || !target || !register_operand (target, tmode))
31429 target = gen_reg_rtx (tmode);
31430
31431 ix86_expand_vector_extract (true, target, op0, elt);
31432
31433 return target;
31434 }
31435
31436 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31437 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31438 a language-level syntax for referencing vector elements. */
31439
31440 static rtx
31441 ix86_expand_vec_set_builtin (tree exp)
31442 {
31443 enum machine_mode tmode, mode1;
31444 tree arg0, arg1, arg2;
31445 int elt;
31446 rtx op0, op1, target;
31447
31448 arg0 = CALL_EXPR_ARG (exp, 0);
31449 arg1 = CALL_EXPR_ARG (exp, 1);
31450 arg2 = CALL_EXPR_ARG (exp, 2);
31451
31452 tmode = TYPE_MODE (TREE_TYPE (arg0));
31453 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31454 gcc_assert (VECTOR_MODE_P (tmode));
31455
31456 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31457 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31458 elt = get_element_number (TREE_TYPE (arg0), arg2);
31459
31460 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31461 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31462
31463 op0 = force_reg (tmode, op0);
31464 op1 = force_reg (mode1, op1);
31465
31466 /* OP0 is the source of these builtin functions and shouldn't be
31467 modified.  Create a copy, use it and return it as the target. */
31468 target = gen_reg_rtx (tmode);
31469 emit_move_insn (target, op0);
31470 ix86_expand_vector_set (true, target, op1, elt);
31471
31472 return target;
31473 }
31474
31475 /* Expand an expression EXP that calls a built-in function,
31476 with result going to TARGET if that's convenient
31477 (and in mode MODE if that's convenient).
31478 SUBTARGET may be used as the target for computing one of EXP's operands.
31479 IGNORE is nonzero if the value is to be ignored. */
31480
31481 static rtx
31482 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31483 enum machine_mode mode ATTRIBUTE_UNUSED,
31484 int ignore ATTRIBUTE_UNUSED)
31485 {
31486 const struct builtin_description *d;
31487 size_t i;
31488 enum insn_code icode;
31489 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31490 tree arg0, arg1, arg2, arg3, arg4;
31491 rtx op0, op1, op2, op3, op4, pat, insn;
31492 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31493 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31494
31495 /* For CPU builtins that can be folded, fold first and expand the fold. */
31496 switch (fcode)
31497 {
31498 case IX86_BUILTIN_CPU_INIT:
31499 {
31500 /* Make it call __cpu_indicator_init in libgcc. */
31501 tree call_expr, fndecl, type;
31502 type = build_function_type_list (integer_type_node, NULL_TREE);
31503 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31504 call_expr = build_call_expr (fndecl, 0);
31505 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31506 }
31507 case IX86_BUILTIN_CPU_IS:
31508 case IX86_BUILTIN_CPU_SUPPORTS:
31509 {
31510 tree arg0 = CALL_EXPR_ARG (exp, 0);
31511 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31512 gcc_assert (fold_expr != NULL_TREE);
31513 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31514 }
31515 }
31516
31517 /* Determine whether the builtin function is available under the current ISA.
31518 Originally the builtin was not created if it wasn't applicable to the
31519 current ISA based on the command line switches. With function specific
31520 options, we need to check in the context of the function making the call
31521 whether it is supported. */
31522 if (ix86_builtins_isa[fcode].isa
31523 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31524 {
31525 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31526 NULL, (enum fpmath_unit) 0, false);
31527
31528 if (!opts)
31529 error ("%qE needs unknown isa option", fndecl);
31530 else
31531 {
31532 gcc_assert (opts != NULL);
31533 error ("%qE needs isa option %s", fndecl, opts);
31534 free (opts);
31535 }
31536 return const0_rtx;
31537 }
31538
31539 switch (fcode)
31540 {
31541 case IX86_BUILTIN_MASKMOVQ:
31542 case IX86_BUILTIN_MASKMOVDQU:
31543 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31544 ? CODE_FOR_mmx_maskmovq
31545 : CODE_FOR_sse2_maskmovdqu);
31546 /* Note the arg order is different from the operand order. */
31547 arg1 = CALL_EXPR_ARG (exp, 0);
31548 arg2 = CALL_EXPR_ARG (exp, 1);
31549 arg0 = CALL_EXPR_ARG (exp, 2);
31550 op0 = expand_normal (arg0);
31551 op1 = expand_normal (arg1);
31552 op2 = expand_normal (arg2);
31553 mode0 = insn_data[icode].operand[0].mode;
31554 mode1 = insn_data[icode].operand[1].mode;
31555 mode2 = insn_data[icode].operand[2].mode;
31556
31557 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31558 op0 = gen_rtx_MEM (mode1, op0);
31559
31560 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31561 op0 = copy_to_mode_reg (mode0, op0);
31562 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31563 op1 = copy_to_mode_reg (mode1, op1);
31564 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31565 op2 = copy_to_mode_reg (mode2, op2);
31566 pat = GEN_FCN (icode) (op0, op1, op2);
31567 if (! pat)
31568 return 0;
31569 emit_insn (pat);
31570 return 0;
31571
31572 case IX86_BUILTIN_LDMXCSR:
31573 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31574 target = assign_386_stack_local (SImode, SLOT_TEMP);
31575 emit_move_insn (target, op0);
31576 emit_insn (gen_sse_ldmxcsr (target));
31577 return 0;
31578
31579 case IX86_BUILTIN_STMXCSR:
31580 target = assign_386_stack_local (SImode, SLOT_TEMP);
31581 emit_insn (gen_sse_stmxcsr (target));
31582 return copy_to_mode_reg (SImode, target);
31583
31584 case IX86_BUILTIN_CLFLUSH:
31585 arg0 = CALL_EXPR_ARG (exp, 0);
31586 op0 = expand_normal (arg0);
31587 icode = CODE_FOR_sse2_clflush;
31588 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31589 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31590
31591 emit_insn (gen_sse2_clflush (op0));
31592 return 0;
31593
31594 case IX86_BUILTIN_MONITOR:
31595 arg0 = CALL_EXPR_ARG (exp, 0);
31596 arg1 = CALL_EXPR_ARG (exp, 1);
31597 arg2 = CALL_EXPR_ARG (exp, 2);
31598 op0 = expand_normal (arg0);
31599 op1 = expand_normal (arg1);
31600 op2 = expand_normal (arg2);
31601 if (!REG_P (op0))
31602 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31603 if (!REG_P (op1))
31604 op1 = copy_to_mode_reg (SImode, op1);
31605 if (!REG_P (op2))
31606 op2 = copy_to_mode_reg (SImode, op2);
31607 emit_insn (ix86_gen_monitor (op0, op1, op2));
31608 return 0;
31609
31610 case IX86_BUILTIN_MWAIT:
31611 arg0 = CALL_EXPR_ARG (exp, 0);
31612 arg1 = CALL_EXPR_ARG (exp, 1);
31613 op0 = expand_normal (arg0);
31614 op1 = expand_normal (arg1);
31615 if (!REG_P (op0))
31616 op0 = copy_to_mode_reg (SImode, op0);
31617 if (!REG_P (op1))
31618 op1 = copy_to_mode_reg (SImode, op1);
31619 emit_insn (gen_sse3_mwait (op0, op1));
31620 return 0;
31621
31622 case IX86_BUILTIN_VEC_INIT_V2SI:
31623 case IX86_BUILTIN_VEC_INIT_V4HI:
31624 case IX86_BUILTIN_VEC_INIT_V8QI:
31625 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31626
31627 case IX86_BUILTIN_VEC_EXT_V2DF:
31628 case IX86_BUILTIN_VEC_EXT_V2DI:
31629 case IX86_BUILTIN_VEC_EXT_V4SF:
31630 case IX86_BUILTIN_VEC_EXT_V4SI:
31631 case IX86_BUILTIN_VEC_EXT_V8HI:
31632 case IX86_BUILTIN_VEC_EXT_V2SI:
31633 case IX86_BUILTIN_VEC_EXT_V4HI:
31634 case IX86_BUILTIN_VEC_EXT_V16QI:
31635 return ix86_expand_vec_ext_builtin (exp, target);
31636
31637 case IX86_BUILTIN_VEC_SET_V2DI:
31638 case IX86_BUILTIN_VEC_SET_V4SF:
31639 case IX86_BUILTIN_VEC_SET_V4SI:
31640 case IX86_BUILTIN_VEC_SET_V8HI:
31641 case IX86_BUILTIN_VEC_SET_V4HI:
31642 case IX86_BUILTIN_VEC_SET_V16QI:
31643 return ix86_expand_vec_set_builtin (exp);
31644
31645 case IX86_BUILTIN_INFQ:
31646 case IX86_BUILTIN_HUGE_VALQ:
31647 {
31648 REAL_VALUE_TYPE inf;
31649 rtx tmp;
31650
31651 real_inf (&inf);
31652 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31653
31654 tmp = validize_mem (force_const_mem (mode, tmp));
31655
31656 if (target == 0)
31657 target = gen_reg_rtx (mode);
31658
31659 emit_move_insn (target, tmp);
31660 return target;
31661 }
31662
31663 case IX86_BUILTIN_RDPMC:
31664 case IX86_BUILTIN_RDTSC:
31665 case IX86_BUILTIN_RDTSCP:
31666
31667 op0 = gen_reg_rtx (DImode);
31668 op1 = gen_reg_rtx (DImode);
31669
31670 if (fcode == IX86_BUILTIN_RDPMC)
31671 {
31672 arg0 = CALL_EXPR_ARG (exp, 0);
31673 op2 = expand_normal (arg0);
31674 if (!register_operand (op2, SImode))
31675 op2 = copy_to_mode_reg (SImode, op2);
31676
31677 insn = (TARGET_64BIT
31678 ? gen_rdpmc_rex64 (op0, op1, op2)
31679 : gen_rdpmc (op0, op2));
31680 emit_insn (insn);
31681 }
31682 else if (fcode == IX86_BUILTIN_RDTSC)
31683 {
31684 insn = (TARGET_64BIT
31685 ? gen_rdtsc_rex64 (op0, op1)
31686 : gen_rdtsc (op0));
31687 emit_insn (insn);
31688 }
31689 else
31690 {
31691 op2 = gen_reg_rtx (SImode);
31692
31693 insn = (TARGET_64BIT
31694 ? gen_rdtscp_rex64 (op0, op1, op2)
31695 : gen_rdtscp (op0, op2));
31696 emit_insn (insn);
31697
31698 arg0 = CALL_EXPR_ARG (exp, 0);
31699 op4 = expand_normal (arg0);
31700 if (!address_operand (op4, VOIDmode))
31701 {
31702 op4 = convert_memory_address (Pmode, op4);
31703 op4 = copy_addr_to_reg (op4);
31704 }
31705 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31706 }
31707
31708 if (target == 0)
31709 target = gen_reg_rtx (mode);
31710
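/* On 64-bit targets the patterns return the result in two DImode
   halves: the high half in OP1 is shifted into position and OR'd
   with the low half in OP0 to form the full 64-bit value.  */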
31711 if (TARGET_64BIT)
31712 {
31713 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31714 op1, 1, OPTAB_DIRECT);
31715 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31716 op0, 1, OPTAB_DIRECT);
31717 }
31718
31719 emit_move_insn (target, op0);
31720 return target;
31721
31722 case IX86_BUILTIN_FXSAVE:
31723 case IX86_BUILTIN_FXRSTOR:
31724 case IX86_BUILTIN_FXSAVE64:
31725 case IX86_BUILTIN_FXRSTOR64:
31726 switch (fcode)
31727 {
31728 case IX86_BUILTIN_FXSAVE:
31729 icode = CODE_FOR_fxsave;
31730 break;
31731 case IX86_BUILTIN_FXRSTOR:
31732 icode = CODE_FOR_fxrstor;
31733 break;
31734 case IX86_BUILTIN_FXSAVE64:
31735 icode = CODE_FOR_fxsave64;
31736 break;
31737 case IX86_BUILTIN_FXRSTOR64:
31738 icode = CODE_FOR_fxrstor64;
31739 break;
31740 default:
31741 gcc_unreachable ();
31742 }
31743
31744 arg0 = CALL_EXPR_ARG (exp, 0);
31745 op0 = expand_normal (arg0);
31746
31747 if (!address_operand (op0, VOIDmode))
31748 {
31749 op0 = convert_memory_address (Pmode, op0);
31750 op0 = copy_addr_to_reg (op0);
31751 }
31752 op0 = gen_rtx_MEM (BLKmode, op0);
31753
31754 pat = GEN_FCN (icode) (op0);
31755 if (pat)
31756 emit_insn (pat);
31757 return 0;
31758
31759 case IX86_BUILTIN_XSAVE:
31760 case IX86_BUILTIN_XRSTOR:
31761 case IX86_BUILTIN_XSAVE64:
31762 case IX86_BUILTIN_XRSTOR64:
31763 case IX86_BUILTIN_XSAVEOPT:
31764 case IX86_BUILTIN_XSAVEOPT64:
31765 arg0 = CALL_EXPR_ARG (exp, 0);
31766 arg1 = CALL_EXPR_ARG (exp, 1);
31767 op0 = expand_normal (arg0);
31768 op1 = expand_normal (arg1);
31769
31770 if (!address_operand (op0, VOIDmode))
31771 {
31772 op0 = convert_memory_address (Pmode, op0);
31773 op0 = copy_addr_to_reg (op0);
31774 }
31775 op0 = gen_rtx_MEM (BLKmode, op0);
31776
31777 op1 = force_reg (DImode, op1);
31778
31779 if (TARGET_64BIT)
31780 {
31781 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31782 NULL, 1, OPTAB_DIRECT);
31783 switch (fcode)
31784 {
31785 case IX86_BUILTIN_XSAVE:
31786 icode = CODE_FOR_xsave_rex64;
31787 break;
31788 case IX86_BUILTIN_XRSTOR:
31789 icode = CODE_FOR_xrstor_rex64;
31790 break;
31791 case IX86_BUILTIN_XSAVE64:
31792 icode = CODE_FOR_xsave64;
31793 break;
31794 case IX86_BUILTIN_XRSTOR64:
31795 icode = CODE_FOR_xrstor64;
31796 break;
31797 case IX86_BUILTIN_XSAVEOPT:
31798 icode = CODE_FOR_xsaveopt_rex64;
31799 break;
31800 case IX86_BUILTIN_XSAVEOPT64:
31801 icode = CODE_FOR_xsaveopt64;
31802 break;
31803 default:
31804 gcc_unreachable ();
31805 }
31806
31807 op2 = gen_lowpart (SImode, op2);
31808 op1 = gen_lowpart (SImode, op1);
31809 pat = GEN_FCN (icode) (op0, op1, op2);
31810 }
31811 else
31812 {
31813 switch (fcode)
31814 {
31815 case IX86_BUILTIN_XSAVE:
31816 icode = CODE_FOR_xsave;
31817 break;
31818 case IX86_BUILTIN_XRSTOR:
31819 icode = CODE_FOR_xrstor;
31820 break;
31821 case IX86_BUILTIN_XSAVEOPT:
31822 icode = CODE_FOR_xsaveopt;
31823 break;
31824 default:
31825 gcc_unreachable ();
31826 }
31827 pat = GEN_FCN (icode) (op0, op1);
31828 }
31829
31830 if (pat)
31831 emit_insn (pat);
31832 return 0;
31833
31834 case IX86_BUILTIN_LLWPCB:
31835 arg0 = CALL_EXPR_ARG (exp, 0);
31836 op0 = expand_normal (arg0);
31837 icode = CODE_FOR_lwp_llwpcb;
31838 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31839 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31840 emit_insn (gen_lwp_llwpcb (op0));
31841 return 0;
31842
31843 case IX86_BUILTIN_SLWPCB:
31844 icode = CODE_FOR_lwp_slwpcb;
31845 if (!target
31846 || !insn_data[icode].operand[0].predicate (target, Pmode))
31847 target = gen_reg_rtx (Pmode);
31848 emit_insn (gen_lwp_slwpcb (target));
31849 return target;
31850
31851 case IX86_BUILTIN_BEXTRI32:
31852 case IX86_BUILTIN_BEXTRI64:
31853 arg0 = CALL_EXPR_ARG (exp, 0);
31854 arg1 = CALL_EXPR_ARG (exp, 1);
31855 op0 = expand_normal (arg0);
31856 op1 = expand_normal (arg1);
31857 icode = (fcode == IX86_BUILTIN_BEXTRI32
31858 ? CODE_FOR_tbm_bextri_si
31859 : CODE_FOR_tbm_bextri_di);
31860 if (!CONST_INT_P (op1))
31861 {
31862 error ("the last argument must be an immediate");
31863 return const0_rtx;
31864 }
31865 else
31866 {
31867 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31868 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31869 op1 = GEN_INT (length);
31870 op2 = GEN_INT (lsb_index);
31871 pat = GEN_FCN (icode) (target, op0, op1, op2);
31872 if (pat)
31873 emit_insn (pat);
31874 return target;
31875 }
31876
31877 case IX86_BUILTIN_RDRAND16_STEP:
31878 icode = CODE_FOR_rdrandhi_1;
31879 mode0 = HImode;
31880 goto rdrand_step;
31881
31882 case IX86_BUILTIN_RDRAND32_STEP:
31883 icode = CODE_FOR_rdrandsi_1;
31884 mode0 = SImode;
31885 goto rdrand_step;
31886
31887 case IX86_BUILTIN_RDRAND64_STEP:
31888 icode = CODE_FOR_rdranddi_1;
31889 mode0 = DImode;
31890
31891 rdrand_step:
31892 op0 = gen_reg_rtx (mode0);
31893 emit_insn (GEN_FCN (icode) (op0));
31894
31895 arg0 = CALL_EXPR_ARG (exp, 0);
31896 op1 = expand_normal (arg0);
31897 if (!address_operand (op1, VOIDmode))
31898 {
31899 op1 = convert_memory_address (Pmode, op1);
31900 op1 = copy_addr_to_reg (op1);
31901 }
31902 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31903
31904 op1 = gen_reg_rtx (SImode);
31905 emit_move_insn (op1, CONST1_RTX (SImode));
31906
31907 /* Emit SImode conditional move. */
31908 if (mode0 == HImode)
31909 {
31910 op2 = gen_reg_rtx (SImode);
31911 emit_insn (gen_zero_extendhisi2 (op2, op0));
31912 }
31913 else if (mode0 == SImode)
31914 op2 = op0;
31915 else
31916 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31917
31918 if (target == 0)
31919 target = gen_reg_rtx (SImode);
31920
31921 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31922 const0_rtx);
31923 emit_insn (gen_rtx_SET (VOIDmode, target,
31924 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31925 return target;
31926
31927 case IX86_BUILTIN_RDSEED16_STEP:
31928 icode = CODE_FOR_rdseedhi_1;
31929 mode0 = HImode;
31930 goto rdseed_step;
31931
31932 case IX86_BUILTIN_RDSEED32_STEP:
31933 icode = CODE_FOR_rdseedsi_1;
31934 mode0 = SImode;
31935 goto rdseed_step;
31936
31937 case IX86_BUILTIN_RDSEED64_STEP:
31938 icode = CODE_FOR_rdseeddi_1;
31939 mode0 = DImode;
31940
31941 rdseed_step:
31942 op0 = gen_reg_rtx (mode0);
31943 emit_insn (GEN_FCN (icode) (op0));
31944
31945 arg0 = CALL_EXPR_ARG (exp, 0);
31946 op1 = expand_normal (arg0);
31947 if (!address_operand (op1, VOIDmode))
31948 {
31949 op1 = convert_memory_address (Pmode, op1);
31950 op1 = copy_addr_to_reg (op1);
31951 }
31952 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31953
31954 op2 = gen_reg_rtx (QImode);
31955
31956 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
31957 const0_rtx);
31958 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
31959
31960 if (target == 0)
31961 target = gen_reg_rtx (SImode);
31962
31963 emit_insn (gen_zero_extendqisi2 (target, op2));
31964 return target;
31965
31966 case IX86_BUILTIN_ADDCARRYX32:
31967 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31968 mode0 = SImode;
31969 goto addcarryx;
31970
31971 case IX86_BUILTIN_ADDCARRYX64:
31972 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31973 mode0 = DImode;
31974
31975 addcarryx:
31976 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31977 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31978 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31979 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31980
31981 op0 = gen_reg_rtx (QImode);
31982
31983 /* Generate CF from input operand. */
31984 op1 = expand_normal (arg0);
31985 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
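/* Adding -1 to C_IN sets the carry flag exactly when C_IN is nonzero;
   this materializes CF for the adcx/adc pattern emitted below.  */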
31986 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
31987
31988 /* Generate the ADCX (or ADC) instruction to compute X+Y+CF. */
31989 op2 = expand_normal (arg1);
31990 op3 = expand_normal (arg2);
31991
31992 if (!REG_P (op2))
31993 op2 = copy_to_mode_reg (mode0, op2);
31994 if (!REG_P (op3))
31995 op3 = copy_to_mode_reg (mode0, op3);
31996
31997 op0 = gen_reg_rtx (mode0);
31998
31999 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32000 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32001 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32002
32003 /* Store the result. */
32004 op4 = expand_normal (arg3);
32005 if (!address_operand (op4, VOIDmode))
32006 {
32007 op4 = convert_memory_address (Pmode, op4);
32008 op4 = copy_addr_to_reg (op4);
32009 }
32010 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32011
32012 /* Return current CF value. */
32013 if (target == 0)
32014 target = gen_reg_rtx (QImode);
32015
32016 PUT_MODE (pat, QImode);
32017 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32018 return target;
32019
32020 case IX86_BUILTIN_GATHERSIV2DF:
32021 icode = CODE_FOR_avx2_gathersiv2df;
32022 goto gather_gen;
32023 case IX86_BUILTIN_GATHERSIV4DF:
32024 icode = CODE_FOR_avx2_gathersiv4df;
32025 goto gather_gen;
32026 case IX86_BUILTIN_GATHERDIV2DF:
32027 icode = CODE_FOR_avx2_gatherdiv2df;
32028 goto gather_gen;
32029 case IX86_BUILTIN_GATHERDIV4DF:
32030 icode = CODE_FOR_avx2_gatherdiv4df;
32031 goto gather_gen;
32032 case IX86_BUILTIN_GATHERSIV4SF:
32033 icode = CODE_FOR_avx2_gathersiv4sf;
32034 goto gather_gen;
32035 case IX86_BUILTIN_GATHERSIV8SF:
32036 icode = CODE_FOR_avx2_gathersiv8sf;
32037 goto gather_gen;
32038 case IX86_BUILTIN_GATHERDIV4SF:
32039 icode = CODE_FOR_avx2_gatherdiv4sf;
32040 goto gather_gen;
32041 case IX86_BUILTIN_GATHERDIV8SF:
32042 icode = CODE_FOR_avx2_gatherdiv8sf;
32043 goto gather_gen;
32044 case IX86_BUILTIN_GATHERSIV2DI:
32045 icode = CODE_FOR_avx2_gathersiv2di;
32046 goto gather_gen;
32047 case IX86_BUILTIN_GATHERSIV4DI:
32048 icode = CODE_FOR_avx2_gathersiv4di;
32049 goto gather_gen;
32050 case IX86_BUILTIN_GATHERDIV2DI:
32051 icode = CODE_FOR_avx2_gatherdiv2di;
32052 goto gather_gen;
32053 case IX86_BUILTIN_GATHERDIV4DI:
32054 icode = CODE_FOR_avx2_gatherdiv4di;
32055 goto gather_gen;
32056 case IX86_BUILTIN_GATHERSIV4SI:
32057 icode = CODE_FOR_avx2_gathersiv4si;
32058 goto gather_gen;
32059 case IX86_BUILTIN_GATHERSIV8SI:
32060 icode = CODE_FOR_avx2_gathersiv8si;
32061 goto gather_gen;
32062 case IX86_BUILTIN_GATHERDIV4SI:
32063 icode = CODE_FOR_avx2_gatherdiv4si;
32064 goto gather_gen;
32065 case IX86_BUILTIN_GATHERDIV8SI:
32066 icode = CODE_FOR_avx2_gatherdiv8si;
32067 goto gather_gen;
32068 case IX86_BUILTIN_GATHERALTSIV4DF:
32069 icode = CODE_FOR_avx2_gathersiv4df;
32070 goto gather_gen;
32071 case IX86_BUILTIN_GATHERALTDIV8SF:
32072 icode = CODE_FOR_avx2_gatherdiv8sf;
32073 goto gather_gen;
32074 case IX86_BUILTIN_GATHERALTSIV4DI:
32075 icode = CODE_FOR_avx2_gathersiv4di;
32076 goto gather_gen;
32077 case IX86_BUILTIN_GATHERALTDIV8SI:
32078 icode = CODE_FOR_avx2_gatherdiv8si;
32079 goto gather_gen;
32080
32081 gather_gen:
32082 arg0 = CALL_EXPR_ARG (exp, 0);
32083 arg1 = CALL_EXPR_ARG (exp, 1);
32084 arg2 = CALL_EXPR_ARG (exp, 2);
32085 arg3 = CALL_EXPR_ARG (exp, 3);
32086 arg4 = CALL_EXPR_ARG (exp, 4);
32087 op0 = expand_normal (arg0);
32088 op1 = expand_normal (arg1);
32089 op2 = expand_normal (arg2);
32090 op3 = expand_normal (arg3);
32091 op4 = expand_normal (arg4);
32092 /* Note the arg order is different from the operand order. */
32093 mode0 = insn_data[icode].operand[1].mode;
32094 mode2 = insn_data[icode].operand[3].mode;
32095 mode3 = insn_data[icode].operand[4].mode;
32096 mode4 = insn_data[icode].operand[5].mode;
32097
32098 if (target == NULL_RTX
32099 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32100 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32101 else
32102 subtarget = target;
32103
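/* The GATHERALT variants receive vectors twice as wide as the
   underlying pattern expects (the index for the GATHERALTSIV4DF and
   GATHERALTSIV4DI forms, the source and mask for the GATHERALTDIV8SF
   and GATHERALTDIV8SI forms); extract the low halves first.  */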
32104 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32105 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32106 {
32107 rtx half = gen_reg_rtx (V4SImode);
32108 if (!nonimmediate_operand (op2, V8SImode))
32109 op2 = copy_to_mode_reg (V8SImode, op2);
32110 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32111 op2 = half;
32112 }
32113 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32114 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32115 {
32116 rtx (*gen) (rtx, rtx);
32117 rtx half = gen_reg_rtx (mode0);
32118 if (mode0 == V4SFmode)
32119 gen = gen_vec_extract_lo_v8sf;
32120 else
32121 gen = gen_vec_extract_lo_v8si;
32122 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32123 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32124 emit_insn (gen (half, op0));
32125 op0 = half;
32126 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32127 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32128 emit_insn (gen (half, op3));
32129 op3 = half;
32130 }
32131
32132 /* Force the memory operand to use only a base register here.  But we
32133 don't want to do this for the memory operands of other builtin
32134 functions. */
32135 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32136
32137 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32138 op0 = copy_to_mode_reg (mode0, op0);
32139 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32140 op1 = copy_to_mode_reg (Pmode, op1);
32141 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32142 op2 = copy_to_mode_reg (mode2, op2);
32143 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32144 op3 = copy_to_mode_reg (mode3, op3);
32145 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32146 {
32147 error ("the last argument must be scale 1, 2, 4, 8");
32148 return const0_rtx;
32149 }
32150
32151 /* Optimize. If mask is known to have all high bits set,
32152 replace op0 with pc_rtx to signal that the instruction
32153 overwrites the whole destination and doesn't use its
32154 previous contents. */
32155 if (optimize)
32156 {
32157 if (TREE_CODE (arg3) == VECTOR_CST)
32158 {
32159 unsigned int negative = 0;
32160 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32161 {
32162 tree cst = VECTOR_CST_ELT (arg3, i);
32163 if (TREE_CODE (cst) == INTEGER_CST
32164 && tree_int_cst_sign_bit (cst))
32165 negative++;
32166 else if (TREE_CODE (cst) == REAL_CST
32167 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32168 negative++;
32169 }
32170 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32171 op0 = pc_rtx;
32172 }
32173 else if (TREE_CODE (arg3) == SSA_NAME)
32174 {
32175 /* Recognize also when mask is like:
32176 __v2df src = _mm_setzero_pd ();
32177 __v2df mask = _mm_cmpeq_pd (src, src);
32178 or
32179 __v8sf src = _mm256_setzero_ps ();
32180 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32181 as that is a cheaper way to load all ones into
32182 a register than having to load a constant from
32183 memory. */
32184 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32185 if (is_gimple_call (def_stmt))
32186 {
32187 tree fndecl = gimple_call_fndecl (def_stmt);
32188 if (fndecl
32189 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32190 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32191 {
32192 case IX86_BUILTIN_CMPPD:
32193 case IX86_BUILTIN_CMPPS:
32194 case IX86_BUILTIN_CMPPD256:
32195 case IX86_BUILTIN_CMPPS256:
32196 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32197 break;
32198 /* FALLTHRU */
32199 case IX86_BUILTIN_CMPEQPD:
32200 case IX86_BUILTIN_CMPEQPS:
32201 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32202 && initializer_zerop (gimple_call_arg (def_stmt,
32203 1)))
32204 op0 = pc_rtx;
32205 break;
32206 default:
32207 break;
32208 }
32209 }
32210 }
32211 }
32212
32213 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32214 if (! pat)
32215 return const0_rtx;
32216 emit_insn (pat);
32217
32218 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32219 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32220 {
32221 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32222 ? V4SFmode : V4SImode;
32223 if (target == NULL_RTX)
32224 target = gen_reg_rtx (tmode);
32225 if (tmode == V4SFmode)
32226 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32227 else
32228 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32229 }
32230 else
32231 target = subtarget;
32232
32233 return target;
32234
32235 case IX86_BUILTIN_XABORT:
32236 icode = CODE_FOR_xabort;
32237 arg0 = CALL_EXPR_ARG (exp, 0);
32238 op0 = expand_normal (arg0);
32239 mode0 = insn_data[icode].operand[0].mode;
32240 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32241 {
32242 error ("the xabort's argument must be an 8-bit immediate");
32243 return const0_rtx;
32244 }
32245 emit_insn (gen_xabort (op0));
32246 return 0;
32247
32248 default:
32249 break;
32250 }
32251
32252 for (i = 0, d = bdesc_special_args;
32253 i < ARRAY_SIZE (bdesc_special_args);
32254 i++, d++)
32255 if (d->code == fcode)
32256 return ix86_expand_special_args_builtin (d, exp, target);
32257
32258 for (i = 0, d = bdesc_args;
32259 i < ARRAY_SIZE (bdesc_args);
32260 i++, d++)
32261 if (d->code == fcode)
32262 switch (fcode)
32263 {
32264 case IX86_BUILTIN_FABSQ:
32265 case IX86_BUILTIN_COPYSIGNQ:
32266 if (!TARGET_SSE)
32267 /* Emit a normal call if SSE isn't available. */
32268 return expand_call (exp, target, ignore);
32269 default:
32270 return ix86_expand_args_builtin (d, exp, target);
32271 }
32272
32273 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32274 if (d->code == fcode)
32275 return ix86_expand_sse_comi (d, exp, target);
32276
32277 for (i = 0, d = bdesc_pcmpestr;
32278 i < ARRAY_SIZE (bdesc_pcmpestr);
32279 i++, d++)
32280 if (d->code == fcode)
32281 return ix86_expand_sse_pcmpestr (d, exp, target);
32282
32283 for (i = 0, d = bdesc_pcmpistr;
32284 i < ARRAY_SIZE (bdesc_pcmpistr);
32285 i++, d++)
32286 if (d->code == fcode)
32287 return ix86_expand_sse_pcmpistr (d, exp, target);
32288
32289 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32290 if (d->code == fcode)
32291 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32292 (enum ix86_builtin_func_type)
32293 d->flag, d->comparison);
32294
32295 gcc_unreachable ();
32296 }
32297
32298 /* Returns a function decl for a vectorized version of the builtin function
32299 with builtin function code FN and the result vector type TYPE_OUT, or NULL_TREE
32300 if it is not available. */
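/* Illustrative example (not from the original sources): with -O3 and
   SSE2 enabled, a loop over doubles such as

     for (int i = 0; i < n; i++)
       b[i] = __builtin_sqrt (a[i]);

   makes the vectorizer query this hook with BUILT_IN_SQRT and V2DF
   for both type_out and type_in, and the decl for
   IX86_BUILTIN_SQRTPD is returned so the call becomes a sqrtpd.  */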
32301
32302 static tree
32303 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32304 tree type_in)
32305 {
32306 enum machine_mode in_mode, out_mode;
32307 int in_n, out_n;
32308 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32309
32310 if (TREE_CODE (type_out) != VECTOR_TYPE
32311 || TREE_CODE (type_in) != VECTOR_TYPE
32312 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32313 return NULL_TREE;
32314
32315 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32316 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32317 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32318 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32319
32320 switch (fn)
32321 {
32322 case BUILT_IN_SQRT:
32323 if (out_mode == DFmode && in_mode == DFmode)
32324 {
32325 if (out_n == 2 && in_n == 2)
32326 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32327 else if (out_n == 4 && in_n == 4)
32328 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32329 }
32330 break;
32331
32332 case BUILT_IN_SQRTF:
32333 if (out_mode == SFmode && in_mode == SFmode)
32334 {
32335 if (out_n == 4 && in_n == 4)
32336 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32337 else if (out_n == 8 && in_n == 8)
32338 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32339 }
32340 break;
32341
32342 case BUILT_IN_IFLOOR:
32343 case BUILT_IN_LFLOOR:
32344 case BUILT_IN_LLFLOOR:
32345 /* The round insn does not trap on denormals. */
32346 if (flag_trapping_math || !TARGET_ROUND)
32347 break;
32348
32349 if (out_mode == SImode && in_mode == DFmode)
32350 {
32351 if (out_n == 4 && in_n == 2)
32352 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32353 else if (out_n == 8 && in_n == 4)
32354 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32355 }
32356 break;
32357
32358 case BUILT_IN_IFLOORF:
32359 case BUILT_IN_LFLOORF:
32360 case BUILT_IN_LLFLOORF:
32361 /* The round insn does not trap on denormals. */
32362 if (flag_trapping_math || !TARGET_ROUND)
32363 break;
32364
32365 if (out_mode == SImode && in_mode == SFmode)
32366 {
32367 if (out_n == 4 && in_n == 4)
32368 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32369 else if (out_n == 8 && in_n == 8)
32370 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32371 }
32372 break;
32373
32374 case BUILT_IN_ICEIL:
32375 case BUILT_IN_LCEIL:
32376 case BUILT_IN_LLCEIL:
32377 /* The round insn does not trap on denormals. */
32378 if (flag_trapping_math || !TARGET_ROUND)
32379 break;
32380
32381 if (out_mode == SImode && in_mode == DFmode)
32382 {
32383 if (out_n == 4 && in_n == 2)
32384 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32385 else if (out_n == 8 && in_n == 4)
32386 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32387 }
32388 break;
32389
32390 case BUILT_IN_ICEILF:
32391 case BUILT_IN_LCEILF:
32392 case BUILT_IN_LLCEILF:
32393 /* The round insn does not trap on denormals. */
32394 if (flag_trapping_math || !TARGET_ROUND)
32395 break;
32396
32397 if (out_mode == SImode && in_mode == SFmode)
32398 {
32399 if (out_n == 4 && in_n == 4)
32400 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32401 else if (out_n == 8 && in_n == 8)
32402 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32403 }
32404 break;
32405
32406 case BUILT_IN_IRINT:
32407 case BUILT_IN_LRINT:
32408 case BUILT_IN_LLRINT:
32409 if (out_mode == SImode && in_mode == DFmode)
32410 {
32411 if (out_n == 4 && in_n == 2)
32412 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32413 else if (out_n == 8 && in_n == 4)
32414 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32415 }
32416 break;
32417
32418 case BUILT_IN_IRINTF:
32419 case BUILT_IN_LRINTF:
32420 case BUILT_IN_LLRINTF:
32421 if (out_mode == SImode && in_mode == SFmode)
32422 {
32423 if (out_n == 4 && in_n == 4)
32424 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32425 else if (out_n == 8 && in_n == 8)
32426 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32427 }
32428 break;
32429
32430 case BUILT_IN_IROUND:
32431 case BUILT_IN_LROUND:
32432 case BUILT_IN_LLROUND:
32433 /* The round insn does not trap on denormals. */
32434 if (flag_trapping_math || !TARGET_ROUND)
32435 break;
32436
32437 if (out_mode == SImode && in_mode == DFmode)
32438 {
32439 if (out_n == 4 && in_n == 2)
32440 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32441 else if (out_n == 8 && in_n == 4)
32442 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32443 }
32444 break;
32445
32446 case BUILT_IN_IROUNDF:
32447 case BUILT_IN_LROUNDF:
32448 case BUILT_IN_LLROUNDF:
32449 /* The round insn does not trap on denormals. */
32450 if (flag_trapping_math || !TARGET_ROUND)
32451 break;
32452
32453 if (out_mode == SImode && in_mode == SFmode)
32454 {
32455 if (out_n == 4 && in_n == 4)
32456 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32457 else if (out_n == 8 && in_n == 8)
32458 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32459 }
32460 break;
32461
32462 case BUILT_IN_COPYSIGN:
32463 if (out_mode == DFmode && in_mode == DFmode)
32464 {
32465 if (out_n == 2 && in_n == 2)
32466 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32467 else if (out_n == 4 && in_n == 4)
32468 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32469 }
32470 break;
32471
32472 case BUILT_IN_COPYSIGNF:
32473 if (out_mode == SFmode && in_mode == SFmode)
32474 {
32475 if (out_n == 4 && in_n == 4)
32476 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32477 else if (out_n == 8 && in_n == 8)
32478 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32479 }
32480 break;
32481
32482 case BUILT_IN_FLOOR:
32483 /* The round insn does not trap on denormals. */
32484 if (flag_trapping_math || !TARGET_ROUND)
32485 break;
32486
32487 if (out_mode == DFmode && in_mode == DFmode)
32488 {
32489 if (out_n == 2 && in_n == 2)
32490 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32491 else if (out_n == 4 && in_n == 4)
32492 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32493 }
32494 break;
32495
32496 case BUILT_IN_FLOORF:
32497 /* The round insn does not trap on denormals. */
32498 if (flag_trapping_math || !TARGET_ROUND)
32499 break;
32500
32501 if (out_mode == SFmode && in_mode == SFmode)
32502 {
32503 if (out_n == 4 && in_n == 4)
32504 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32505 else if (out_n == 8 && in_n == 8)
32506 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32507 }
32508 break;
32509
32510 case BUILT_IN_CEIL:
32511 /* The round insn does not trap on denormals. */
32512 if (flag_trapping_math || !TARGET_ROUND)
32513 break;
32514
32515 if (out_mode == DFmode && in_mode == DFmode)
32516 {
32517 if (out_n == 2 && in_n == 2)
32518 return ix86_builtins[IX86_BUILTIN_CEILPD];
32519 else if (out_n == 4 && in_n == 4)
32520 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32521 }
32522 break;
32523
32524 case BUILT_IN_CEILF:
32525 /* The round insn does not trap on denormals. */
32526 if (flag_trapping_math || !TARGET_ROUND)
32527 break;
32528
32529 if (out_mode == SFmode && in_mode == SFmode)
32530 {
32531 if (out_n == 4 && in_n == 4)
32532 return ix86_builtins[IX86_BUILTIN_CEILPS];
32533 else if (out_n == 8 && in_n == 8)
32534 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32535 }
32536 break;
32537
32538 case BUILT_IN_TRUNC:
32539 /* The round insn does not trap on denormals. */
32540 if (flag_trapping_math || !TARGET_ROUND)
32541 break;
32542
32543 if (out_mode == DFmode && in_mode == DFmode)
32544 {
32545 if (out_n == 2 && in_n == 2)
32546 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32547 else if (out_n == 4 && in_n == 4)
32548 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32549 }
32550 break;
32551
32552 case BUILT_IN_TRUNCF:
32553 /* The round insn does not trap on denormals. */
32554 if (flag_trapping_math || !TARGET_ROUND)
32555 break;
32556
32557 if (out_mode == SFmode && in_mode == SFmode)
32558 {
32559 if (out_n == 4 && in_n == 4)
32560 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32561 else if (out_n == 8 && in_n == 8)
32562 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32563 }
32564 break;
32565
32566 case BUILT_IN_RINT:
32567 /* The round insn does not trap on denormals. */
32568 if (flag_trapping_math || !TARGET_ROUND)
32569 break;
32570
32571 if (out_mode == DFmode && in_mode == DFmode)
32572 {
32573 if (out_n == 2 && in_n == 2)
32574 return ix86_builtins[IX86_BUILTIN_RINTPD];
32575 else if (out_n == 4 && in_n == 4)
32576 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32577 }
32578 break;
32579
32580 case BUILT_IN_RINTF:
32581 /* The round insn does not trap on denormals. */
32582 if (flag_trapping_math || !TARGET_ROUND)
32583 break;
32584
32585 if (out_mode == SFmode && in_mode == SFmode)
32586 {
32587 if (out_n == 4 && in_n == 4)
32588 return ix86_builtins[IX86_BUILTIN_RINTPS];
32589 else if (out_n == 8 && in_n == 8)
32590 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32591 }
32592 break;
32593
32594 case BUILT_IN_ROUND:
32595 /* The round insn does not trap on denormals. */
32596 if (flag_trapping_math || !TARGET_ROUND)
32597 break;
32598
32599 if (out_mode == DFmode && in_mode == DFmode)
32600 {
32601 if (out_n == 2 && in_n == 2)
32602 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32603 else if (out_n == 4 && in_n == 4)
32604 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32605 }
32606 break;
32607
32608 case BUILT_IN_ROUNDF:
32609 /* The round insn does not trap on denormals. */
32610 if (flag_trapping_math || !TARGET_ROUND)
32611 break;
32612
32613 if (out_mode == SFmode && in_mode == SFmode)
32614 {
32615 if (out_n == 4 && in_n == 4)
32616 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32617 else if (out_n == 8 && in_n == 8)
32618 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32619 }
32620 break;
32621
32622 case BUILT_IN_FMA:
32623 if (out_mode == DFmode && in_mode == DFmode)
32624 {
32625 if (out_n == 2 && in_n == 2)
32626 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32627 if (out_n == 4 && in_n == 4)
32628 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32629 }
32630 break;
32631
32632 case BUILT_IN_FMAF:
32633 if (out_mode == SFmode && in_mode == SFmode)
32634 {
32635 if (out_n == 4 && in_n == 4)
32636 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32637 if (out_n == 8 && in_n == 8)
32638 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32639 }
32640 break;
32641
32642 default:
32643 break;
32644 }
32645
32646 /* Dispatch to a handler for a vectorization library. */
32647 if (ix86_veclib_handler)
32648 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32649 type_in);
32650
32651 return NULL_TREE;
32652 }
32653
32654 /* Handler for an SVML-style interface to
32655 a library with vectorized intrinsics. */
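/* For instance (purely illustrative), the mangling below turns
   BUILT_IN_SINF over V4SFmode into "vmlsSin4" and BUILT_IN_SIN over
   V2DFmode into "vmldSin2", while the log functions are special-cased
   to "vmlsLn4" and "vmldLn2".  */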
32656
32657 static tree
32658 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32659 {
32660 char name[20];
32661 tree fntype, new_fndecl, args;
32662 unsigned arity;
32663 const char *bname;
32664 enum machine_mode el_mode, in_mode;
32665 int n, in_n;
32666
32667 /* The SVML is suitable for unsafe math only. */
32668 if (!flag_unsafe_math_optimizations)
32669 return NULL_TREE;
32670
32671 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32672 n = TYPE_VECTOR_SUBPARTS (type_out);
32673 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32674 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32675 if (el_mode != in_mode
32676 || n != in_n)
32677 return NULL_TREE;
32678
32679 switch (fn)
32680 {
32681 case BUILT_IN_EXP:
32682 case BUILT_IN_LOG:
32683 case BUILT_IN_LOG10:
32684 case BUILT_IN_POW:
32685 case BUILT_IN_TANH:
32686 case BUILT_IN_TAN:
32687 case BUILT_IN_ATAN:
32688 case BUILT_IN_ATAN2:
32689 case BUILT_IN_ATANH:
32690 case BUILT_IN_CBRT:
32691 case BUILT_IN_SINH:
32692 case BUILT_IN_SIN:
32693 case BUILT_IN_ASINH:
32694 case BUILT_IN_ASIN:
32695 case BUILT_IN_COSH:
32696 case BUILT_IN_COS:
32697 case BUILT_IN_ACOSH:
32698 case BUILT_IN_ACOS:
32699 if (el_mode != DFmode || n != 2)
32700 return NULL_TREE;
32701 break;
32702
32703 case BUILT_IN_EXPF:
32704 case BUILT_IN_LOGF:
32705 case BUILT_IN_LOG10F:
32706 case BUILT_IN_POWF:
32707 case BUILT_IN_TANHF:
32708 case BUILT_IN_TANF:
32709 case BUILT_IN_ATANF:
32710 case BUILT_IN_ATAN2F:
32711 case BUILT_IN_ATANHF:
32712 case BUILT_IN_CBRTF:
32713 case BUILT_IN_SINHF:
32714 case BUILT_IN_SINF:
32715 case BUILT_IN_ASINHF:
32716 case BUILT_IN_ASINF:
32717 case BUILT_IN_COSHF:
32718 case BUILT_IN_COSF:
32719 case BUILT_IN_ACOSHF:
32720 case BUILT_IN_ACOSF:
32721 if (el_mode != SFmode || n != 4)
32722 return NULL_TREE;
32723 break;
32724
32725 default:
32726 return NULL_TREE;
32727 }
32728
32729 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32730
32731 if (fn == BUILT_IN_LOGF)
32732 strcpy (name, "vmlsLn4");
32733 else if (fn == BUILT_IN_LOG)
32734 strcpy (name, "vmldLn2");
32735 else if (n == 4)
32736 {
32737 sprintf (name, "vmls%s", bname+10);
32738 name[strlen (name)-1] = '4';
32739 }
32740 else
32741 sprintf (name, "vmld%s2", bname+10);
32742
32743 /* Convert to uppercase. */
32744 name[4] &= ~0x20;
32745
32746 arity = 0;
32747 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32748 args;
32749 args = TREE_CHAIN (args))
32750 arity++;
32751
32752 if (arity == 1)
32753 fntype = build_function_type_list (type_out, type_in, NULL);
32754 else
32755 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32756
32757 /* Build a function declaration for the vectorized function. */
32758 new_fndecl = build_decl (BUILTINS_LOCATION,
32759 FUNCTION_DECL, get_identifier (name), fntype);
32760 TREE_PUBLIC (new_fndecl) = 1;
32761 DECL_EXTERNAL (new_fndecl) = 1;
32762 DECL_IS_NOVOPS (new_fndecl) = 1;
32763 TREE_READONLY (new_fndecl) = 1;
32764
32765 return new_fndecl;
32766 }
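/* Worked example (illustrative only), assuming the builtin's DECL_NAME is
   the usual "__builtin_..." identifier so that BNAME + 10 is the bare name:
     BUILT_IN_SINF (SFmode, n == 4): "vmls" + "sinf" -> "vmlssinf",
       trailing char replaced by '4' -> "vmlssin4", name[4] uppercased
       -> "vmlsSin4";
     BUILT_IN_SIN  (DFmode, n == 2): "vmld" + "sin" + "2" -> "vmldsin2"
       -> "vmldSin2";
   while BUILT_IN_LOGF / BUILT_IN_LOG are special-cased above to "vmlsLn4"
   and "vmldLn2".  */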
32767
32768 /* Handler for an ACML-style interface to
32769 a library with vectorized intrinsics. */
32770
32771 static tree
32772 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32773 {
32774 char name[20] = "__vr.._";
32775 tree fntype, new_fndecl, args;
32776 unsigned arity;
32777 const char *bname;
32778 enum machine_mode el_mode, in_mode;
32779 int n, in_n;
32780
32781 /* The ACML is 64-bit only and suitable for unsafe math only, as
32782 it does not correctly support parts of IEEE arithmetic with the
32783 required precision, such as denormals. */
32784 if (!TARGET_64BIT
32785 || !flag_unsafe_math_optimizations)
32786 return NULL_TREE;
32787
32788 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32789 n = TYPE_VECTOR_SUBPARTS (type_out);
32790 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32791 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32792 if (el_mode != in_mode
32793 || n != in_n)
32794 return NULL_TREE;
32795
32796 switch (fn)
32797 {
32798 case BUILT_IN_SIN:
32799 case BUILT_IN_COS:
32800 case BUILT_IN_EXP:
32801 case BUILT_IN_LOG:
32802 case BUILT_IN_LOG2:
32803 case BUILT_IN_LOG10:
32804 name[4] = 'd';
32805 name[5] = '2';
32806 if (el_mode != DFmode
32807 || n != 2)
32808 return NULL_TREE;
32809 break;
32810
32811 case BUILT_IN_SINF:
32812 case BUILT_IN_COSF:
32813 case BUILT_IN_EXPF:
32814 case BUILT_IN_POWF:
32815 case BUILT_IN_LOGF:
32816 case BUILT_IN_LOG2F:
32817 case BUILT_IN_LOG10F:
32818 name[4] = 's';
32819 name[5] = '4';
32820 if (el_mode != SFmode
32821 || n != 4)
32822 return NULL_TREE;
32823 break;
32824
32825 default:
32826 return NULL_TREE;
32827 }
32828
32829 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32830 sprintf (name + 7, "%s", bname+10);
32831
32832 arity = 0;
32833 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32834 args;
32835 args = TREE_CHAIN (args))
32836 arity++;
32837
32838 if (arity == 1)
32839 fntype = build_function_type_list (type_out, type_in, NULL);
32840 else
32841 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32842
32843 /* Build a function declaration for the vectorized function. */
32844 new_fndecl = build_decl (BUILTINS_LOCATION,
32845 FUNCTION_DECL, get_identifier (name), fntype);
32846 TREE_PUBLIC (new_fndecl) = 1;
32847 DECL_EXTERNAL (new_fndecl) = 1;
32848 DECL_IS_NOVOPS (new_fndecl) = 1;
32849 TREE_READONLY (new_fndecl) = 1;
32850
32851 return new_fndecl;
32852 }
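/* Worked example (illustrative only), again assuming BNAME + 10 is the bare
   builtin name:
     BUILT_IN_SINF: name[4] = 's', name[5] = '4' -> "__vrs4_" + "sinf"
                    -> "__vrs4_sinf";
     BUILT_IN_LOG:  name[4] = 'd', name[5] = '2' -> "__vrd2_" + "log"
                    -> "__vrd2_log".  */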
32853
32854 /* Returns a decl of a function that implements gather load with
32855 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
32856 Return NULL_TREE if it is not available. */
32857
32858 static tree
32859 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32860 const_tree index_type, int scale)
32861 {
32862 bool si;
32863 enum ix86_builtins code;
32864
32865 if (! TARGET_AVX2)
32866 return NULL_TREE;
32867
32868 if ((TREE_CODE (index_type) != INTEGER_TYPE
32869 && !POINTER_TYPE_P (index_type))
32870 || (TYPE_MODE (index_type) != SImode
32871 && TYPE_MODE (index_type) != DImode))
32872 return NULL_TREE;
32873
32874 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32875 return NULL_TREE;
32876
32877 /* v*gather* insn sign extends index to pointer mode. */
32878 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32879 && TYPE_UNSIGNED (index_type))
32880 return NULL_TREE;
32881
32882 if (scale <= 0
32883 || scale > 8
32884 || (scale & (scale - 1)) != 0)
32885 return NULL_TREE;
32886
32887 si = TYPE_MODE (index_type) == SImode;
32888 switch (TYPE_MODE (mem_vectype))
32889 {
32890 case V2DFmode:
32891 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32892 break;
32893 case V4DFmode:
32894 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32895 break;
32896 case V2DImode:
32897 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32898 break;
32899 case V4DImode:
32900 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32901 break;
32902 case V4SFmode:
32903 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32904 break;
32905 case V8SFmode:
32906 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32907 break;
32908 case V4SImode:
32909 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32910 break;
32911 case V8SImode:
32912 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32913 break;
32914 default:
32915 return NULL_TREE;
32916 }
32917
32918 return ix86_builtins[code];
32919 }
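/* Worked example (illustrative only): for a V4DFmode gather with a DImode
   index type, SI is false and the switch above selects
   IX86_BUILTIN_GATHERDIV4DF; with an SImode index it selects
   IX86_BUILTIN_GATHERALTSIV4DF instead.  In either case SCALE must be one
   of 1, 2, 4 or 8 (positive, at most 8, a power of two), and an unsigned
   index narrower than a pointer is rejected because the insn sign-extends
   the index.  */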
32920
32921 /* Returns a code for a target-specific builtin that implements
32922 reciprocal of the function, or NULL_TREE if not available. */
32923
32924 static tree
32925 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32926 bool sqrt ATTRIBUTE_UNUSED)
32927 {
32928 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32929 && flag_finite_math_only && !flag_trapping_math
32930 && flag_unsafe_math_optimizations))
32931 return NULL_TREE;
32932
32933 if (md_fn)
32934 /* Machine dependent builtins. */
32935 switch (fn)
32936 {
32937 /* Vectorized version of sqrt to rsqrt conversion. */
32938 case IX86_BUILTIN_SQRTPS_NR:
32939 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
32940
32941 case IX86_BUILTIN_SQRTPS_NR256:
32942 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
32943
32944 default:
32945 return NULL_TREE;
32946 }
32947 else
32948 /* Normal builtins. */
32949 switch (fn)
32950 {
32951 /* Sqrt to rsqrt conversion. */
32952 case BUILT_IN_SQRTF:
32953 return ix86_builtins[IX86_BUILTIN_RSQRTF];
32954
32955 default:
32956 return NULL_TREE;
32957 }
32958 }
32959 \f
32960 /* Helper for avx_vpermilps256_operand et al. This is also used by
32961 the expansion functions to turn the parallel back into a mask.
32962 The return value is 0 for no match and the imm8+1 for a match. */
32963
32964 int
32965 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32966 {
32967 unsigned i, nelt = GET_MODE_NUNITS (mode);
32968 unsigned mask = 0;
32969 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
32970
32971 if (XVECLEN (par, 0) != (int) nelt)
32972 return 0;
32973
32974 /* Validate that all of the elements are constants, and not totally
32975 out of range. Copy the data into an integral array to make the
32976 subsequent checks easier. */
32977 for (i = 0; i < nelt; ++i)
32978 {
32979 rtx er = XVECEXP (par, 0, i);
32980 unsigned HOST_WIDE_INT ei;
32981
32982 if (!CONST_INT_P (er))
32983 return 0;
32984 ei = INTVAL (er);
32985 if (ei >= nelt)
32986 return 0;
32987 ipar[i] = ei;
32988 }
32989
32990 switch (mode)
32991 {
32992 case V4DFmode:
32993 /* In the 256-bit DFmode case, we can only move elements within
32994 a 128-bit lane. */
32995 for (i = 0; i < 2; ++i)
32996 {
32997 if (ipar[i] >= 2)
32998 return 0;
32999 mask |= ipar[i] << i;
33000 }
33001 for (i = 2; i < 4; ++i)
33002 {
33003 if (ipar[i] < 2)
33004 return 0;
33005 mask |= (ipar[i] - 2) << i;
33006 }
33007 break;
33008
33009 case V8SFmode:
33010 /* In the 256-bit SFmode case, we have full freedom of movement
33011 within the low 128-bit lane, but the high 128-bit lane must
33012 mirror the exact same pattern. */
33013 for (i = 0; i < 4; ++i)
33014 if (ipar[i] + 4 != ipar[i + 4])
33015 return 0;
33016 nelt = 4;
33017 /* FALLTHRU */
33018
33019 case V2DFmode:
33020 case V4SFmode:
33021 /* In the 128-bit case, we have full freedom in the placement of
33022 the elements from the source operand. */
33023 for (i = 0; i < nelt; ++i)
33024 mask |= ipar[i] << (i * (nelt / 2));
33025 break;
33026
33027 default:
33028 gcc_unreachable ();
33029 }
33030
33031 /* Make sure success has a non-zero value by adding one. */
33032 return mask + 1;
33033 }
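#if 0
/* Illustrative sketch (not compiled): recompute the V8SFmode mask for the
   within-lane swap selector { 1, 0, 3, 2, 5, 4, 7, 6 } the same way the
   V8SFmode/V4SFmode path above does, two bits per element once the high
   lane has been checked to mirror the low lane.  */
static unsigned
example_vpermilps_mask (void)
{
  static const unsigned char sel[4] = { 1, 0, 3, 2 };
  unsigned i, mask = 0;
  for (i = 0; i < 4; ++i)
    mask |= sel[i] << (i * 2);
  return mask; /* 0xb1; avx_vpermilp_parallel returns 0xb1 + 1 = 0xb2.  */
}
#endif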
33034
33035 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33036 the expansion functions to turn the parallel back into a mask.
33037 The return value is 0 for no match and the imm8+1 for a match. */
33038
33039 int
33040 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33041 {
33042 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33043 unsigned mask = 0;
33044 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33045
33046 if (XVECLEN (par, 0) != (int) nelt)
33047 return 0;
33048
33049 /* Validate that all of the elements are constants, and not totally
33050 out of range. Copy the data into an integral array to make the
33051 subsequent checks easier. */
33052 for (i = 0; i < nelt; ++i)
33053 {
33054 rtx er = XVECEXP (par, 0, i);
33055 unsigned HOST_WIDE_INT ei;
33056
33057 if (!CONST_INT_P (er))
33058 return 0;
33059 ei = INTVAL (er);
33060 if (ei >= 2 * nelt)
33061 return 0;
33062 ipar[i] = ei;
33063 }
33064
33065 /* Validate that each half of the permute consists of consecutive elements. */
33066 for (i = 0; i < nelt2 - 1; ++i)
33067 if (ipar[i] + 1 != ipar[i + 1])
33068 return 0;
33069 for (i = nelt2; i < nelt - 1; ++i)
33070 if (ipar[i] + 1 != ipar[i + 1])
33071 return 0;
33072
33073 /* Reconstruct the mask. */
33074 for (i = 0; i < 2; ++i)
33075 {
33076 unsigned e = ipar[i * nelt2];
33077 if (e % nelt2)
33078 return 0;
33079 e /= nelt2;
33080 mask |= e << (i * 4);
33081 }
33082
33083 /* Make sure success has a non-zero value by adding one. */
33084 return mask + 1;
33085 }
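/* Worked example (illustrative only): for V8SFmode the parallel
   { 8, 9, 10, 11, 4, 5, 6, 7 } passes the half-contiguity checks above, and
   the mask is rebuilt from the first element of each half: 8 / 4 = 2 goes
   into bits 0-3 and 4 / 4 = 1 into bits 4-7, giving imm8 0x12 (low result
   lane taken from the second operand's low half, high result lane from the
   first operand's high half); the function returns 0x12 + 1 = 0x13.  */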
33086 \f
33087 /* Store OPERAND to the memory after reload is completed. This means
33088 that we can't easily use assign_stack_local. */
33089 rtx
33090 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33091 {
33092 rtx result;
33093
33094 gcc_assert (reload_completed);
33095 if (ix86_using_red_zone ())
33096 {
33097 result = gen_rtx_MEM (mode,
33098 gen_rtx_PLUS (Pmode,
33099 stack_pointer_rtx,
33100 GEN_INT (-RED_ZONE_SIZE)));
33101 emit_move_insn (result, operand);
33102 }
33103 else if (TARGET_64BIT)
33104 {
33105 switch (mode)
33106 {
33107 case HImode:
33108 case SImode:
33109 operand = gen_lowpart (DImode, operand);
33110 /* FALLTHRU */
33111 case DImode:
33112 emit_insn (
33113 gen_rtx_SET (VOIDmode,
33114 gen_rtx_MEM (DImode,
33115 gen_rtx_PRE_DEC (DImode,
33116 stack_pointer_rtx)),
33117 operand));
33118 break;
33119 default:
33120 gcc_unreachable ();
33121 }
33122 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33123 }
33124 else
33125 {
33126 switch (mode)
33127 {
33128 case DImode:
33129 {
33130 rtx operands[2];
33131 split_double_mode (mode, &operand, 1, operands, operands + 1);
33132 emit_insn (
33133 gen_rtx_SET (VOIDmode,
33134 gen_rtx_MEM (SImode,
33135 gen_rtx_PRE_DEC (Pmode,
33136 stack_pointer_rtx)),
33137 operands[1]));
33138 emit_insn (
33139 gen_rtx_SET (VOIDmode,
33140 gen_rtx_MEM (SImode,
33141 gen_rtx_PRE_DEC (Pmode,
33142 stack_pointer_rtx)),
33143 operands[0]));
33144 }
33145 break;
33146 case HImode:
33147 /* Store HImodes as SImodes. */
33148 operand = gen_lowpart (SImode, operand);
33149 /* FALLTHRU */
33150 case SImode:
33151 emit_insn (
33152 gen_rtx_SET (VOIDmode,
33153 gen_rtx_MEM (GET_MODE (operand),
33154 gen_rtx_PRE_DEC (SImode,
33155 stack_pointer_rtx)),
33156 operand));
33157 break;
33158 default:
33159 gcc_unreachable ();
33160 }
33161 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33162 }
33163 return result;
33164 }
33165
33166 /* Free operand from the memory. */
33167 void
33168 ix86_free_from_memory (enum machine_mode mode)
33169 {
33170 if (!ix86_using_red_zone ())
33171 {
33172 int size;
33173
33174 if (mode == DImode || TARGET_64BIT)
33175 size = 8;
33176 else
33177 size = 4;
33178 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33179 to a pop or add instruction if registers are available. */
33180 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33181 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33182 GEN_INT (size))));
33183 }
33184 }
33185
33186 /* Return a register priority for hard reg REGNO. */
33187 static int
33188 ix86_register_priority (int hard_regno)
33189 {
33190 /* ebp and r13 as a base always want a displacement, and r12 as a
33191 base always wants an index, so discourage their use in an
33192 address. */
33193 if (hard_regno == R12_REG || hard_regno == R13_REG)
33194 return 0;
33195 if (hard_regno == BP_REG)
33196 return 1;
33197 /* New x86-64 int registers result in bigger code size. Discourage
33198 them. */
33199 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33200 return 2;
33201 /* New x86-64 SSE registers result in bigger code size. Discourage
33202 them. */
33203 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33204 return 2;
33205 /* Usage of AX register results in smaller code. Prefer it. */
33206 if (hard_regno == 0)
33207 return 4;
33208 return 3;
33209 }
33210
33211 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33212
33213 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33214 QImode must go into class Q_REGS.
33215 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33216 movdf to do mem-to-mem moves through integer regs. */
33217
33218 static reg_class_t
33219 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33220 {
33221 enum machine_mode mode = GET_MODE (x);
33222
33223 /* We're only allowed to return a subclass of CLASS. Many of the
33224 following checks fail for NO_REGS, so eliminate that early. */
33225 if (regclass == NO_REGS)
33226 return NO_REGS;
33227
33228 /* All classes can load zeros. */
33229 if (x == CONST0_RTX (mode))
33230 return regclass;
33231
33232 /* Force constants into memory if we are loading a (nonzero) constant into
33233 an MMX or SSE register. This is because there are no MMX/SSE instructions
33234 to load from a constant. */
33235 if (CONSTANT_P (x)
33236 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33237 return NO_REGS;
33238
33239 /* Prefer SSE regs only, if we can use them for math. */
33240 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33241 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33242
33243 /* Floating-point constants need more complex checks. */
33244 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33245 {
33246 /* General regs can load everything. */
33247 if (reg_class_subset_p (regclass, GENERAL_REGS))
33248 return regclass;
33249
33250 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33251 zero above. We only want to wind up preferring 80387 registers if
33252 we plan on doing computation with them. */
33253 if (TARGET_80387
33254 && standard_80387_constant_p (x) > 0)
33255 {
33256 /* Limit class to non-sse. */
33257 if (regclass == FLOAT_SSE_REGS)
33258 return FLOAT_REGS;
33259 if (regclass == FP_TOP_SSE_REGS)
33260 return FP_TOP_REG;
33261 if (regclass == FP_SECOND_SSE_REGS)
33262 return FP_SECOND_REG;
33263 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33264 return regclass;
33265 }
33266
33267 return NO_REGS;
33268 }
33269
33270 /* Generally when we see PLUS here, it's the function invariant
33271 (plus soft-fp const_int). Which can only be computed into general
33272 regs. */
33273 if (GET_CODE (x) == PLUS)
33274 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33275
33276 /* QImode constants are easy to load, but non-constant QImode data
33277 must go into Q_REGS. */
33278 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33279 {
33280 if (reg_class_subset_p (regclass, Q_REGS))
33281 return regclass;
33282 if (reg_class_subset_p (Q_REGS, regclass))
33283 return Q_REGS;
33284 return NO_REGS;
33285 }
33286
33287 return regclass;
33288 }
33289
33290 /* Discourage putting floating-point values in SSE registers unless
33291 SSE math is being used, and likewise for the 387 registers. */
33292 static reg_class_t
33293 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33294 {
33295 enum machine_mode mode = GET_MODE (x);
33296
33297 /* Restrict the output reload class to the register bank that we are doing
33298 math on. If we would like not to return a subset of CLASS, reject this
33299 alternative: if reload cannot do this, it will still use its choice. */
33300 mode = GET_MODE (x);
33301 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33302 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33303
33304 if (X87_FLOAT_MODE_P (mode))
33305 {
33306 if (regclass == FP_TOP_SSE_REGS)
33307 return FP_TOP_REG;
33308 else if (regclass == FP_SECOND_SSE_REGS)
33309 return FP_SECOND_REG;
33310 else
33311 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33312 }
33313
33314 return regclass;
33315 }
33316
33317 static reg_class_t
33318 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33319 enum machine_mode mode, secondary_reload_info *sri)
33320 {
33321 /* Double-word spills from general registers to non-offsettable memory
33322 references (zero-extended addresses) require special handling. */
33323 if (TARGET_64BIT
33324 && MEM_P (x)
33325 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33326 && rclass == GENERAL_REGS
33327 && !offsettable_memref_p (x))
33328 {
33329 sri->icode = (in_p
33330 ? CODE_FOR_reload_noff_load
33331 : CODE_FOR_reload_noff_store);
33332 /* Add the cost of moving address to a temporary. */
33333 sri->extra_cost = 1;
33334
33335 return NO_REGS;
33336 }
33337
33338 /* QImode spills from non-QI registers require
33339 an intermediate register on 32-bit targets. */
33340 if (!TARGET_64BIT
33341 && !in_p && mode == QImode
33342 && (rclass == GENERAL_REGS
33343 || rclass == LEGACY_REGS
33344 || rclass == NON_Q_REGS
33345 || rclass == SIREG
33346 || rclass == DIREG
33347 || rclass == INDEX_REGS))
33348 {
33349 int regno;
33350
33351 if (REG_P (x))
33352 regno = REGNO (x);
33353 else
33354 regno = -1;
33355
33356 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33357 regno = true_regnum (x);
33358
33359 /* Return Q_REGS if the operand is in memory. */
33360 if (regno == -1)
33361 return Q_REGS;
33362 }
33363
33364 /* This condition handles the corner case where an expression involving
33365 pointers gets vectorized. We're trying to use the address of a
33366 stack slot as a vector initializer.
33367
33368 (set (reg:V2DI 74 [ vect_cst_.2 ])
33369 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33370
33371 Eventually frame gets turned into sp+offset like this:
33372
33373 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33374 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33375 (const_int 392 [0x188]))))
33376
33377 That later gets turned into:
33378
33379 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33380 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33381 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33382
33383 We'll have the following reload recorded:
33384
33385 Reload 0: reload_in (DI) =
33386 (plus:DI (reg/f:DI 7 sp)
33387 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33388 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33389 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33390 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33391 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33392 reload_reg_rtx: (reg:V2DI 22 xmm1)
33393
33394 Which isn't going to work since SSE instructions can't handle scalar
33395 additions. Returning GENERAL_REGS forces the addition into an integer
33396 register and reload can handle subsequent reloads without problems. */
33397
33398 if (in_p && GET_CODE (x) == PLUS
33399 && SSE_CLASS_P (rclass)
33400 && SCALAR_INT_MODE_P (mode))
33401 return GENERAL_REGS;
33402
33403 return NO_REGS;
33404 }
33405
33406 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33407
33408 static bool
33409 ix86_class_likely_spilled_p (reg_class_t rclass)
33410 {
33411 switch (rclass)
33412 {
33413 case AREG:
33414 case DREG:
33415 case CREG:
33416 case BREG:
33417 case AD_REGS:
33418 case SIREG:
33419 case DIREG:
33420 case SSE_FIRST_REG:
33421 case FP_TOP_REG:
33422 case FP_SECOND_REG:
33423 return true;
33424
33425 default:
33426 break;
33427 }
33428
33429 return false;
33430 }
33431
33432 /* If we are copying between general and FP registers, we need a memory
33433 location. The same is true for SSE and MMX registers.
33434
33435 To optimize register_move_cost performance, allow inline variant.
33436
33437 The macro can't work reliably when one of the CLASSES is a class containing
33438 registers from multiple units (SSE, MMX, integer). We avoid this by never
33439 combining those units in single alternative in the machine description.
33440 Ensure that this constraint holds to avoid unexpected surprises.
33441
33442 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33443 enforce these sanity checks. */
33444
33445 static inline bool
33446 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33447 enum machine_mode mode, int strict)
33448 {
33449 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33450 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33451 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33452 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33453 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33454 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33455 {
33456 gcc_assert (!strict || lra_in_progress);
33457 return true;
33458 }
33459
33460 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33461 return true;
33462
33463 /* ??? This is a lie. We do have moves between mmx/general and between
33464 mmx/sse2. But by saying we need secondary memory we discourage the
33465 register allocator from using the mmx registers unless needed. */
33466 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33467 return true;
33468
33469 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33470 {
33471 /* SSE1 doesn't have any direct moves from other classes. */
33472 if (!TARGET_SSE2)
33473 return true;
33474
33475 /* If the target says that inter-unit moves are more expensive
33476 than moving through memory, then don't generate them. */
33477 if (!TARGET_INTER_UNIT_MOVES)
33478 return true;
33479
33480 /* Between SSE and general, we have moves no larger than word size. */
33481 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33482 return true;
33483 }
33484
33485 return false;
33486 }
33487
33488 bool
33489 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33490 enum machine_mode mode, int strict)
33491 {
33492 return inline_secondary_memory_needed (class1, class2, mode, strict);
33493 }
33494
33495 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33496
33497 On the 80386, this is the size of MODE in words,
33498 except in the FP regs, where a single reg is always enough. */
33499
33500 static unsigned char
33501 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33502 {
33503 if (MAYBE_INTEGER_CLASS_P (rclass))
33504 {
33505 if (mode == XFmode)
33506 return (TARGET_64BIT ? 2 : 3);
33507 else if (mode == XCmode)
33508 return (TARGET_64BIT ? 4 : 6);
33509 else
33510 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33511 }
33512 else
33513 {
33514 if (COMPLEX_MODE_P (mode))
33515 return 2;
33516 else
33517 return 1;
33518 }
33519 }
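/* Worked example (illustrative only): for an integer class, XFmode takes
   3 word-sized registers on 32-bit targets (12 bytes) and 2 on 64-bit,
   XCmode twice that, and DImode on a 32-bit target takes (8 + 3) / 4 = 2;
   for non-integer classes a single register always suffices, or two for
   complex modes.  */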
33520
33521 /* Return true if the registers in CLASS cannot represent the change from
33522 modes FROM to TO. */
33523
33524 bool
33525 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33526 enum reg_class regclass)
33527 {
33528 if (from == to)
33529 return false;
33530
33531 /* x87 registers can't do subreg at all, as all values are reformatted
33532 to extended precision. */
33533 if (MAYBE_FLOAT_CLASS_P (regclass))
33534 return true;
33535
33536 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33537 {
33538 /* Vector registers do not support QI or HImode loads. If we don't
33539 disallow a change to these modes, reload will assume it's ok to
33540 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33541 the vec_dupv4hi pattern. */
33542 if (GET_MODE_SIZE (from) < 4)
33543 return true;
33544
33545 /* Vector registers do not support subreg with nonzero offsets, which
33546 are otherwise valid for integer registers. Since we can't see
33547 whether we have a nonzero offset from here, prohibit all
33548 nonparadoxical subregs changing size. */
33549 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33550 return true;
33551 }
33552
33553 return false;
33554 }
33555
33556 /* Return the cost of moving data of mode M between a
33557 register and memory. A value of 2 is the default; this cost is
33558 relative to those in `REGISTER_MOVE_COST'.
33559
33560 This function is used extensively by register_move_cost, which is used to
33561 build tables at startup. Make it inline in this case.
33562 When IN is 2, return the maximum of the in and out move costs.
33563
33564 If moving between registers and memory is more expensive than
33565 between two registers, you should define this macro to express the
33566 relative cost.
33567
33568 Also model the increased cost of moving QImode registers in
33569 non-Q_REGS classes.
33570 */
33571 static inline int
33572 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33573 int in)
33574 {
33575 int cost;
33576 if (FLOAT_CLASS_P (regclass))
33577 {
33578 int index;
33579 switch (mode)
33580 {
33581 case SFmode:
33582 index = 0;
33583 break;
33584 case DFmode:
33585 index = 1;
33586 break;
33587 case XFmode:
33588 index = 2;
33589 break;
33590 default:
33591 return 100;
33592 }
33593 if (in == 2)
33594 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33595 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33596 }
33597 if (SSE_CLASS_P (regclass))
33598 {
33599 int index;
33600 switch (GET_MODE_SIZE (mode))
33601 {
33602 case 4:
33603 index = 0;
33604 break;
33605 case 8:
33606 index = 1;
33607 break;
33608 case 16:
33609 index = 2;
33610 break;
33611 default:
33612 return 100;
33613 }
33614 if (in == 2)
33615 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33616 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33617 }
33618 if (MMX_CLASS_P (regclass))
33619 {
33620 int index;
33621 switch (GET_MODE_SIZE (mode))
33622 {
33623 case 4:
33624 index = 0;
33625 break;
33626 case 8:
33627 index = 1;
33628 break;
33629 default:
33630 return 100;
33631 }
33632 if (in == 2)
33633 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33634 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33635 }
33636 switch (GET_MODE_SIZE (mode))
33637 {
33638 case 1:
33639 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33640 {
33641 if (!in)
33642 return ix86_cost->int_store[0];
33643 if (TARGET_PARTIAL_REG_DEPENDENCY
33644 && optimize_function_for_speed_p (cfun))
33645 cost = ix86_cost->movzbl_load;
33646 else
33647 cost = ix86_cost->int_load[0];
33648 if (in == 2)
33649 return MAX (cost, ix86_cost->int_store[0]);
33650 return cost;
33651 }
33652 else
33653 {
33654 if (in == 2)
33655 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33656 if (in)
33657 return ix86_cost->movzbl_load;
33658 else
33659 return ix86_cost->int_store[0] + 4;
33660 }
33661 break;
33662 case 2:
33663 if (in == 2)
33664 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33665 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33666 default:
33667 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
33668 if (mode == TFmode)
33669 mode = XFmode;
33670 if (in == 2)
33671 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33672 else if (in)
33673 cost = ix86_cost->int_load[2];
33674 else
33675 cost = ix86_cost->int_store[2];
33676 return (cost * (((int) GET_MODE_SIZE (mode)
33677 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33678 }
33679 }
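/* Worked example (illustrative only): for DImode in GENERAL_REGS with
   IN == 2 the default case applies, so the per-piece cost is
   MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]), scaled by the
   number of word-sized pieces, (8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD,
   i.e. two pieces on a 32-bit target and one on a 64-bit target.  */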
33680
33681 static int
33682 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33683 bool in)
33684 {
33685 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33686 }
33687
33688
33689 /* Return the cost of moving data from a register in class CLASS1 to
33690 one in class CLASS2.
33691
33692 It is not required that the cost always equal 2 when FROM is the same as TO;
33693 on some machines it is expensive to move between registers if they are not
33694 general registers. */
33695
33696 static int
33697 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33698 reg_class_t class2_i)
33699 {
33700 enum reg_class class1 = (enum reg_class) class1_i;
33701 enum reg_class class2 = (enum reg_class) class2_i;
33702
33703 /* In case we require secondary memory, compute the cost of the store followed
33704 by the load. In order to avoid bad register allocation choices, we need
33705 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33706
33707 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33708 {
33709 int cost = 1;
33710
33711 cost += inline_memory_move_cost (mode, class1, 2);
33712 cost += inline_memory_move_cost (mode, class2, 2);
33713
33714 /* In the case of copying from a general purpose register we may emit
33715 multiple stores followed by a single load, causing a memory size
33716 mismatch stall. Count this as an arbitrarily high cost of 20. */
33717 if (targetm.class_max_nregs (class1, mode)
33718 > targetm.class_max_nregs (class2, mode))
33719 cost += 20;
33720
33721 /* In the case of FP/MMX moves, the registers actually overlap, and we
33722 have to switch modes in order to treat them differently. */
33723 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33724 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33725 cost += 20;
33726
33727 return cost;
33728 }
33729
33730 /* Moves between SSE/MMX and integer unit are expensive. */
33731 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33732 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33733
33734 /* ??? By keeping the returned value relatively high, we limit the number
33735 of moves between integer and MMX/SSE registers for all targets.
33736 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
33737 where integer modes in MMX/SSE registers are not tieable
33738 because of missing QImode and HImode moves to, from or between
33739 MMX/SSE registers. */
33740 return MAX (8, ix86_cost->mmxsse_to_integer);
33741
33742 if (MAYBE_FLOAT_CLASS_P (class1))
33743 return ix86_cost->fp_move;
33744 if (MAYBE_SSE_CLASS_P (class1))
33745 return ix86_cost->sse_move;
33746 if (MAYBE_MMX_CLASS_P (class1))
33747 return ix86_cost->mmx_move;
33748 return 2;
33749 }
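/* Worked example (illustrative only): moving DImode between GENERAL_REGS
   and SSE_REGS on a 32-bit target needs secondary memory (the mode is
   wider than a word), so the cost is 1 plus the IN == 2 memory move cost
   for each class, plus 20 because the general side needs two registers
   while the SSE side needs one, modelling the store/store/load size
   mismatch described above.  */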
33750
33751 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33752 MODE. */
33753
33754 bool
33755 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33756 {
33757 /* The flags register, and only the flags register, can hold CCmode values. */
33758 if (CC_REGNO_P (regno))
33759 return GET_MODE_CLASS (mode) == MODE_CC;
33760 if (GET_MODE_CLASS (mode) == MODE_CC
33761 || GET_MODE_CLASS (mode) == MODE_RANDOM
33762 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33763 return false;
33764 if (STACK_REGNO_P (regno))
33765 return VALID_FP_MODE_P (mode);
33766 if (SSE_REGNO_P (regno))
33767 {
33768 /* We implement the move patterns for all vector modes into and
33769 out of SSE registers, even when no operation instructions
33770 are available. OImode move is available only when AVX is
33771 enabled. */
33772 return ((TARGET_AVX && mode == OImode)
33773 || VALID_AVX256_REG_MODE (mode)
33774 || VALID_SSE_REG_MODE (mode)
33775 || VALID_SSE2_REG_MODE (mode)
33776 || VALID_MMX_REG_MODE (mode)
33777 || VALID_MMX_REG_MODE_3DNOW (mode));
33778 }
33779 if (MMX_REGNO_P (regno))
33780 {
33781 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33782 so if the register is available at all, then we can move data of
33783 the given mode into or out of it. */
33784 return (VALID_MMX_REG_MODE (mode)
33785 || VALID_MMX_REG_MODE_3DNOW (mode));
33786 }
33787
33788 if (mode == QImode)
33789 {
33790 /* Take care for QImode values - they can be in non-QI regs,
33791 but then they do cause partial register stalls. */
33792 if (TARGET_64BIT || QI_REGNO_P (regno))
33793 return true;
33794 if (!TARGET_PARTIAL_REG_STALL)
33795 return true;
33796 return !can_create_pseudo_p ();
33797 }
33798 /* We handle both integer and floats in the general purpose registers. */
33799 else if (VALID_INT_MODE_P (mode))
33800 return true;
33801 else if (VALID_FP_MODE_P (mode))
33802 return true;
33803 else if (VALID_DFP_MODE_P (mode))
33804 return true;
33805 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33806 on to use that value in smaller contexts, this can easily force a
33807 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33808 supporting DImode, allow it. */
33809 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33810 return true;
33811
33812 return false;
33813 }
33814
33815 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33816 tieable integer mode. */
33817
33818 static bool
33819 ix86_tieable_integer_mode_p (enum machine_mode mode)
33820 {
33821 switch (mode)
33822 {
33823 case HImode:
33824 case SImode:
33825 return true;
33826
33827 case QImode:
33828 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33829
33830 case DImode:
33831 return TARGET_64BIT;
33832
33833 default:
33834 return false;
33835 }
33836 }
33837
33838 /* Return true if MODE1 is accessible in a register that can hold MODE2
33839 without copying. That is, all register classes that can hold MODE2
33840 can also hold MODE1. */
33841
33842 bool
33843 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33844 {
33845 if (mode1 == mode2)
33846 return true;
33847
33848 if (ix86_tieable_integer_mode_p (mode1)
33849 && ix86_tieable_integer_mode_p (mode2))
33850 return true;
33851
33852 /* MODE2 being XFmode implies fp stack or general regs, which means we
33853 can tie any smaller floating point modes to it. Note that we do not
33854 tie this with TFmode. */
33855 if (mode2 == XFmode)
33856 return mode1 == SFmode || mode1 == DFmode;
33857
33858 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33859 that we can tie it with SFmode. */
33860 if (mode2 == DFmode)
33861 return mode1 == SFmode;
33862
33863 /* If MODE2 is only appropriate for an SSE register, then tie with
33864 any other mode acceptable to SSE registers. */
33865 if (GET_MODE_SIZE (mode2) == 32
33866 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33867 return (GET_MODE_SIZE (mode1) == 32
33868 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33869 if (GET_MODE_SIZE (mode2) == 16
33870 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33871 return (GET_MODE_SIZE (mode1) == 16
33872 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33873
33874 /* If MODE2 is appropriate for an MMX register, then tie
33875 with any other mode acceptable to MMX registers. */
33876 if (GET_MODE_SIZE (mode2) == 8
33877 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33878 return (GET_MODE_SIZE (mode1) == 8
33879 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33880
33881 return false;
33882 }
33883
33884 /* Return the cost of moving between two registers of mode MODE. */
33885
33886 static int
33887 ix86_set_reg_reg_cost (enum machine_mode mode)
33888 {
33889 unsigned int units = UNITS_PER_WORD;
33890
33891 switch (GET_MODE_CLASS (mode))
33892 {
33893 default:
33894 break;
33895
33896 case MODE_CC:
33897 units = GET_MODE_SIZE (CCmode);
33898 break;
33899
33900 case MODE_FLOAT:
33901 if ((TARGET_SSE && mode == TFmode)
33902 || (TARGET_80387 && mode == XFmode)
33903 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33904 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33905 units = GET_MODE_SIZE (mode);
33906 break;
33907
33908 case MODE_COMPLEX_FLOAT:
33909 if ((TARGET_SSE && mode == TCmode)
33910 || (TARGET_80387 && mode == XCmode)
33911 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33912 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33913 units = GET_MODE_SIZE (mode);
33914 break;
33915
33916 case MODE_VECTOR_INT:
33917 case MODE_VECTOR_FLOAT:
33918 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33919 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33920 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33921 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33922 units = GET_MODE_SIZE (mode);
33923 }
33924
33925 /* Return the cost of moving between two registers of mode MODE,
33926 assuming that the move will be in pieces of at most UNITS bytes. */
33927 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
33928 }
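/* Worked example (illustrative only): V4SFmode on an SSE target moves in
   one 16-byte unit, so the cost is COSTS_N_INSNS (1); DImode on a 32-bit
   target falls back to UNITS_PER_WORD-sized pieces and costs
   COSTS_N_INSNS (2).  */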
33929
33930 /* Compute a (partial) cost for rtx X. Return true if the complete
33931 cost has been computed, and false if subexpressions should be
33932 scanned. In either case, *TOTAL contains the cost result. */
33933
33934 static bool
33935 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
33936 bool speed)
33937 {
33938 enum rtx_code code = (enum rtx_code) code_i;
33939 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
33940 enum machine_mode mode = GET_MODE (x);
33941 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
33942
33943 switch (code)
33944 {
33945 case SET:
33946 if (register_operand (SET_DEST (x), VOIDmode)
33947 && reg_or_0_operand (SET_SRC (x), VOIDmode))
33948 {
33949 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
33950 return true;
33951 }
33952 return false;
33953
33954 case CONST_INT:
33955 case CONST:
33956 case LABEL_REF:
33957 case SYMBOL_REF:
33958 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33959 *total = 3;
33960 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33961 *total = 2;
33962 else if (flag_pic && SYMBOLIC_CONST (x)
33963 && (!TARGET_64BIT
32964 || (GET_CODE (x) != LABEL_REF
33965 && (GET_CODE (x) != SYMBOL_REF
33966 || !SYMBOL_REF_LOCAL_P (x)))))
33967 *total = 1;
33968 else
33969 *total = 0;
33970 return true;
33971
33972 case CONST_DOUBLE:
33973 if (mode == VOIDmode)
33974 {
33975 *total = 0;
33976 return true;
33977 }
33978 switch (standard_80387_constant_p (x))
33979 {
33980 case 1: /* 0.0 */
33981 *total = 1;
33982 return true;
33983 default: /* Other constants */
33984 *total = 2;
33985 return true;
33986 case 0:
33987 case -1:
33988 break;
33989 }
33990 if (SSE_FLOAT_MODE_P (mode))
33991 {
33992 case CONST_VECTOR:
33993 switch (standard_sse_constant_p (x))
33994 {
33995 case 0:
33996 break;
33997 case 1: /* 0: xor eliminates false dependency */
33998 *total = 0;
33999 return true;
34000 default: /* -1: cmp contains false dependency */
34001 *total = 1;
34002 return true;
34003 }
34004 }
34005 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34006 it'll probably end up. Add a penalty for size. */
34007 *total = (COSTS_N_INSNS (1)
34008 + (flag_pic != 0 && !TARGET_64BIT)
34009 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34010 return true;
34011
34012 case ZERO_EXTEND:
34013 /* The zero extension is often completely free on x86_64, so make
34014 it as cheap as possible. */
34015 if (TARGET_64BIT && mode == DImode
34016 && GET_MODE (XEXP (x, 0)) == SImode)
34017 *total = 1;
34018 else if (TARGET_ZERO_EXTEND_WITH_AND)
34019 *total = cost->add;
34020 else
34021 *total = cost->movzx;
34022 return false;
34023
34024 case SIGN_EXTEND:
34025 *total = cost->movsx;
34026 return false;
34027
34028 case ASHIFT:
34029 if (SCALAR_INT_MODE_P (mode)
34030 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34031 && CONST_INT_P (XEXP (x, 1)))
34032 {
34033 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34034 if (value == 1)
34035 {
34036 *total = cost->add;
34037 return false;
34038 }
34039 if ((value == 2 || value == 3)
34040 && cost->lea <= cost->shift_const)
34041 {
34042 *total = cost->lea;
34043 return false;
34044 }
34045 }
34046 /* FALLTHRU */
34047
34048 case ROTATE:
34049 case ASHIFTRT:
34050 case LSHIFTRT:
34051 case ROTATERT:
34052 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34053 {
34054 /* ??? Should be SSE vector operation cost. */
34055 /* At least for published AMD latencies, this really is the same
34056 as the latency for a simple fpu operation like fabs. */
34057 /* V*QImode is emulated with 1-11 insns. */
34058 if (mode == V16QImode || mode == V32QImode)
34059 {
34060 int count = 11;
34061 if (TARGET_XOP && mode == V16QImode)
34062 {
34063 /* For XOP we use vpshab, which requires a broadcast of the
34064 value to the variable shift insn. For constants this
34065 means a V16Q const in mem; even when we can perform the
34066 shift with one insn, set the cost to prefer paddb. */
34067 if (CONSTANT_P (XEXP (x, 1)))
34068 {
34069 *total = (cost->fabs
34070 + rtx_cost (XEXP (x, 0), code, 0, speed)
34071 + (speed ? 2 : COSTS_N_BYTES (16)));
34072 return true;
34073 }
34074 count = 3;
34075 }
34076 else if (TARGET_SSSE3)
34077 count = 7;
34078 *total = cost->fabs * count;
34079 }
34080 else
34081 *total = cost->fabs;
34082 }
34083 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34084 {
34085 if (CONST_INT_P (XEXP (x, 1)))
34086 {
34087 if (INTVAL (XEXP (x, 1)) > 32)
34088 *total = cost->shift_const + COSTS_N_INSNS (2);
34089 else
34090 *total = cost->shift_const * 2;
34091 }
34092 else
34093 {
34094 if (GET_CODE (XEXP (x, 1)) == AND)
34095 *total = cost->shift_var * 2;
34096 else
34097 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34098 }
34099 }
34100 else
34101 {
34102 if (CONST_INT_P (XEXP (x, 1)))
34103 *total = cost->shift_const;
34104 else
34105 *total = cost->shift_var;
34106 }
34107 return false;
34108
34109 case FMA:
34110 {
34111 rtx sub;
34112
34113 gcc_assert (FLOAT_MODE_P (mode));
34114 gcc_assert (TARGET_FMA || TARGET_FMA4);
34115
34116 /* ??? SSE scalar/vector cost should be used here. */
34117 /* ??? Bald assumption that fma has the same cost as fmul. */
34118 *total = cost->fmul;
34119 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34120
34121 /* A negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34122 sub = XEXP (x, 0);
34123 if (GET_CODE (sub) == NEG)
34124 sub = XEXP (sub, 0);
34125 *total += rtx_cost (sub, FMA, 0, speed);
34126
34127 sub = XEXP (x, 2);
34128 if (GET_CODE (sub) == NEG)
34129 sub = XEXP (sub, 0);
34130 *total += rtx_cost (sub, FMA, 2, speed);
34131 return true;
34132 }
34133
34134 case MULT:
34135 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34136 {
34137 /* ??? SSE scalar cost should be used here. */
34138 *total = cost->fmul;
34139 return false;
34140 }
34141 else if (X87_FLOAT_MODE_P (mode))
34142 {
34143 *total = cost->fmul;
34144 return false;
34145 }
34146 else if (FLOAT_MODE_P (mode))
34147 {
34148 /* ??? SSE vector cost should be used here. */
34149 *total = cost->fmul;
34150 return false;
34151 }
34152 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34153 {
34154 /* V*QImode is emulated with 7-13 insns. */
34155 if (mode == V16QImode || mode == V32QImode)
34156 {
34157 int extra = 11;
34158 if (TARGET_XOP && mode == V16QImode)
34159 extra = 5;
34160 else if (TARGET_SSSE3)
34161 extra = 6;
34162 *total = cost->fmul * 2 + cost->fabs * extra;
34163 }
34164 /* V*DImode is emulated with 5-8 insns. */
34165 else if (mode == V2DImode || mode == V4DImode)
34166 {
34167 if (TARGET_XOP && mode == V2DImode)
34168 *total = cost->fmul * 2 + cost->fabs * 3;
34169 else
34170 *total = cost->fmul * 3 + cost->fabs * 5;
34171 }
34172 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34173 insns, including two PMULUDQ. */
34174 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34175 *total = cost->fmul * 2 + cost->fabs * 5;
34176 else
34177 *total = cost->fmul;
34178 return false;
34179 }
34180 else
34181 {
34182 rtx op0 = XEXP (x, 0);
34183 rtx op1 = XEXP (x, 1);
34184 int nbits;
34185 if (CONST_INT_P (XEXP (x, 1)))
34186 {
34187 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34188 for (nbits = 0; value != 0; value &= value - 1)
34189 nbits++;
34190 }
34191 else
34192 /* This is arbitrary. */
34193 nbits = 7;
34194
34195 /* Compute costs correctly for widening multiplication. */
34196 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34197 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34198 == GET_MODE_SIZE (mode))
34199 {
34200 int is_mulwiden = 0;
34201 enum machine_mode inner_mode = GET_MODE (op0);
34202
34203 if (GET_CODE (op0) == GET_CODE (op1))
34204 is_mulwiden = 1, op1 = XEXP (op1, 0);
34205 else if (CONST_INT_P (op1))
34206 {
34207 if (GET_CODE (op0) == SIGN_EXTEND)
34208 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34209 == INTVAL (op1);
34210 else
34211 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34212 }
34213
34214 if (is_mulwiden)
34215 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34216 }
34217
34218 *total = (cost->mult_init[MODE_INDEX (mode)]
34219 + nbits * cost->mult_bit
34220 + rtx_cost (op0, outer_code, opno, speed)
34221 + rtx_cost (op1, outer_code, opno, speed));
34222
34223 return true;
34224 }
34225
34226 case DIV:
34227 case UDIV:
34228 case MOD:
34229 case UMOD:
34230 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34231 /* ??? SSE cost should be used here. */
34232 *total = cost->fdiv;
34233 else if (X87_FLOAT_MODE_P (mode))
34234 *total = cost->fdiv;
34235 else if (FLOAT_MODE_P (mode))
34236 /* ??? SSE vector cost should be used here. */
34237 *total = cost->fdiv;
34238 else
34239 *total = cost->divide[MODE_INDEX (mode)];
34240 return false;
34241
34242 case PLUS:
34243 if (GET_MODE_CLASS (mode) == MODE_INT
34244 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34245 {
34246 if (GET_CODE (XEXP (x, 0)) == PLUS
34247 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34248 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34249 && CONSTANT_P (XEXP (x, 1)))
34250 {
34251 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34252 if (val == 2 || val == 4 || val == 8)
34253 {
34254 *total = cost->lea;
34255 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34256 outer_code, opno, speed);
34257 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34258 outer_code, opno, speed);
34259 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34260 return true;
34261 }
34262 }
34263 else if (GET_CODE (XEXP (x, 0)) == MULT
34264 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34265 {
34266 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34267 if (val == 2 || val == 4 || val == 8)
34268 {
34269 *total = cost->lea;
34270 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34271 outer_code, opno, speed);
34272 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34273 return true;
34274 }
34275 }
34276 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34277 {
34278 *total = cost->lea;
34279 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34280 outer_code, opno, speed);
34281 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34282 outer_code, opno, speed);
34283 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34284 return true;
34285 }
34286 }
34287 /* FALLTHRU */
34288
34289 case MINUS:
34290 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34291 {
34292 /* ??? SSE cost should be used here. */
34293 *total = cost->fadd;
34294 return false;
34295 }
34296 else if (X87_FLOAT_MODE_P (mode))
34297 {
34298 *total = cost->fadd;
34299 return false;
34300 }
34301 else if (FLOAT_MODE_P (mode))
34302 {
34303 /* ??? SSE vector cost should be used here. */
34304 *total = cost->fadd;
34305 return false;
34306 }
34307 /* FALLTHRU */
34308
34309 case AND:
34310 case IOR:
34311 case XOR:
34312 if (GET_MODE_CLASS (mode) == MODE_INT
34313 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34314 {
34315 *total = (cost->add * 2
34316 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34317 << (GET_MODE (XEXP (x, 0)) != DImode))
34318 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34319 << (GET_MODE (XEXP (x, 1)) != DImode)));
34320 return true;
34321 }
34322 /* FALLTHRU */
34323
34324 case NEG:
34325 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34326 {
34327 /* ??? SSE cost should be used here. */
34328 *total = cost->fchs;
34329 return false;
34330 }
34331 else if (X87_FLOAT_MODE_P (mode))
34332 {
34333 *total = cost->fchs;
34334 return false;
34335 }
34336 else if (FLOAT_MODE_P (mode))
34337 {
34338 /* ??? SSE vector cost should be used here. */
34339 *total = cost->fchs;
34340 return false;
34341 }
34342 /* FALLTHRU */
34343
34344 case NOT:
34345 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34346 {
34347 /* ??? Should be SSE vector operation cost. */
34348 /* At least for published AMD latencies, this really is the same
34349 as the latency for a simple fpu operation like fabs. */
34350 *total = cost->fabs;
34351 }
34352 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34353 *total = cost->add * 2;
34354 else
34355 *total = cost->add;
34356 return false;
34357
34358 case COMPARE:
34359 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34360 && XEXP (XEXP (x, 0), 1) == const1_rtx
34361 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34362 && XEXP (x, 1) == const0_rtx)
34363 {
34364 /* This kind of construct is implemented using test[bwl].
34365 Treat it as if we had an AND. */
34366 *total = (cost->add
34367 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34368 + rtx_cost (const1_rtx, outer_code, opno, speed));
34369 return true;
34370 }
34371 return false;
34372
34373 case FLOAT_EXTEND:
34374 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34375 *total = 0;
34376 return false;
34377
34378 case ABS:
34379 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34380 /* ??? SSE cost should be used here. */
34381 *total = cost->fabs;
34382 else if (X87_FLOAT_MODE_P (mode))
34383 *total = cost->fabs;
34384 else if (FLOAT_MODE_P (mode))
34385 /* ??? SSE vector cost should be used here. */
34386 *total = cost->fabs;
34387 return false;
34388
34389 case SQRT:
34390 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34391 /* ??? SSE cost should be used here. */
34392 *total = cost->fsqrt;
34393 else if (X87_FLOAT_MODE_P (mode))
34394 *total = cost->fsqrt;
34395 else if (FLOAT_MODE_P (mode))
34396 /* ??? SSE vector cost should be used here. */
34397 *total = cost->fsqrt;
34398 return false;
34399
34400 case UNSPEC:
34401 if (XINT (x, 1) == UNSPEC_TP)
34402 *total = 0;
34403 return false;
34404
34405 case VEC_SELECT:
34406 case VEC_CONCAT:
34407 case VEC_MERGE:
34408 case VEC_DUPLICATE:
34409 /* ??? Assume all of these vector manipulation patterns are
34410 recognizable, in which case they all have pretty much the
34411 same cost. */
34412 *total = cost->fabs;
34413 return true;
34414
34415 default:
34416 return false;
34417 }
34418 }
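#if 0
/* Illustrative sketch (not compiled): the NBITS loop in the MULT case above
   is a population count of the constant multiplier, used to scale
   cost->mult_bit on top of cost->mult_init[MODE_INDEX (mode)].  */
static int
example_mult_nbits (unsigned HOST_WIDE_INT value)
{
  int nbits;
  for (nbits = 0; value != 0; value &= value - 1)
    nbits++;
  return nbits; /* e.g. a multiply by 10 (binary 1010) counts 2 set bits.  */
}
#endif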
34419
34420 #if TARGET_MACHO
34421
34422 static int current_machopic_label_num;
34423
34424 /* Given a symbol name and its associated stub, write out the
34425 definition of the stub. */
34426
34427 void
34428 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34429 {
34430 unsigned int length;
34431 char *binder_name, *symbol_name, lazy_ptr_name[32];
34432 int label = ++current_machopic_label_num;
34433
34434 /* For 64-bit we shouldn't get here. */
34435 gcc_assert (!TARGET_64BIT);
34436
34437 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34438 symb = targetm.strip_name_encoding (symb);
34439
34440 length = strlen (stub);
34441 binder_name = XALLOCAVEC (char, length + 32);
34442 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34443
34444 length = strlen (symb);
34445 symbol_name = XALLOCAVEC (char, length + 32);
34446 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34447
34448 sprintf (lazy_ptr_name, "L%d$lz", label);
34449
34450 if (MACHOPIC_ATT_STUB)
34451 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34452 else if (MACHOPIC_PURE)
34453 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34454 else
34455 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34456
34457 fprintf (file, "%s:\n", stub);
34458 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34459
34460 if (MACHOPIC_ATT_STUB)
34461 {
34462 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34463 }
34464 else if (MACHOPIC_PURE)
34465 {
34466 /* PIC stub. */
34467 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34468 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34469 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34470 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34471 label, lazy_ptr_name, label);
34472 fprintf (file, "\tjmp\t*%%ecx\n");
34473 }
34474 else
34475 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34476
34477 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34478 it needs no stub-binding-helper. */
34479 if (MACHOPIC_ATT_STUB)
34480 return;
34481
34482 fprintf (file, "%s:\n", binder_name);
34483
34484 if (MACHOPIC_PURE)
34485 {
34486 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34487 fprintf (file, "\tpushl\t%%ecx\n");
34488 }
34489 else
34490 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34491
34492 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34493
34494 /* N.B. Keep the correspondence of these
34495 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34496 old-pic/new-pic/non-pic stubs; altering this will break
34497 compatibility with existing dylibs. */
34498 if (MACHOPIC_PURE)
34499 {
34500 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34501 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34502 }
34503 else
34504 /* 16-byte -mdynamic-no-pic stub. */
34505 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34506
34507 fprintf (file, "%s:\n", lazy_ptr_name);
34508 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34509 fprintf (file, ASM_LONG "%s\n", binder_name);
34510 }
34511 #endif /* TARGET_MACHO */
34512
34513 /* Order the registers for register allocator. */
34514
34515 void
34516 x86_order_regs_for_local_alloc (void)
34517 {
34518 int pos = 0;
34519 int i;
34520
34521 /* First allocate the local general purpose registers. */
34522 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34523 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34524 reg_alloc_order [pos++] = i;
34525
34526 /* Global general purpose registers. */
34527 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34528 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34529 reg_alloc_order [pos++] = i;
34530
34531 /* x87 registers come first in case we are doing FP math
34532 using them. */
34533 if (!TARGET_SSE_MATH)
34534 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34535 reg_alloc_order [pos++] = i;
34536
34537 /* SSE registers. */
34538 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34539 reg_alloc_order [pos++] = i;
34540 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34541 reg_alloc_order [pos++] = i;
34542
34543 /* x87 registers. */
34544 if (TARGET_SSE_MATH)
34545 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34546 reg_alloc_order [pos++] = i;
34547
34548 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34549 reg_alloc_order [pos++] = i;
34550
34551 /* Initialize the rest of the array, as we do not allocate some registers
34552 at all.  */
34553 while (pos < FIRST_PSEUDO_REGISTER)
34554 reg_alloc_order [pos++] = 0;
34555 }
34556
34557 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34558 in struct attribute_spec.handler.  */
34559 static tree
34560 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34561 tree args,
34562 int flags ATTRIBUTE_UNUSED,
34563 bool *no_add_attrs)
34564 {
34565 if (TREE_CODE (*node) != FUNCTION_TYPE
34566 && TREE_CODE (*node) != METHOD_TYPE
34567 && TREE_CODE (*node) != FIELD_DECL
34568 && TREE_CODE (*node) != TYPE_DECL)
34569 {
34570 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34571 name);
34572 *no_add_attrs = true;
34573 return NULL_TREE;
34574 }
34575 if (TARGET_64BIT)
34576 {
34577 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34578 name);
34579 *no_add_attrs = true;
34580 return NULL_TREE;
34581 }
34582 if (is_attribute_p ("callee_pop_aggregate_return", name))
34583 {
34584 tree cst;
34585
34586 cst = TREE_VALUE (args);
34587 if (TREE_CODE (cst) != INTEGER_CST)
34588 {
34589 warning (OPT_Wattributes,
34590 "%qE attribute requires an integer constant argument",
34591 name);
34592 *no_add_attrs = true;
34593 }
34594 else if (compare_tree_int (cst, 0) != 0
34595 && compare_tree_int (cst, 1) != 0)
34596 {
34597 warning (OPT_Wattributes,
34598 "argument to %qE attribute is neither zero, nor one",
34599 name);
34600 *no_add_attrs = true;
34601 }
34602
34603 return NULL_TREE;
34604 }
34605
34606 return NULL_TREE;
34607 }
34608
34609 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
34610 struct attribute_spec.handler. */
34611 static tree
34612 ix86_handle_abi_attribute (tree *node, tree name,
34613 tree args ATTRIBUTE_UNUSED,
34614 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34615 {
34616 if (TREE_CODE (*node) != FUNCTION_TYPE
34617 && TREE_CODE (*node) != METHOD_TYPE
34618 && TREE_CODE (*node) != FIELD_DECL
34619 && TREE_CODE (*node) != TYPE_DECL)
34620 {
34621 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34622 name);
34623 *no_add_attrs = true;
34624 return NULL_TREE;
34625 }
34626
34627 /* The ms_abi and sysv_abi attributes are mutually exclusive.  */
34628 if (is_attribute_p ("ms_abi", name))
34629 {
34630 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34631 {
34632 error ("ms_abi and sysv_abi attributes are not compatible");
34633 }
34634
34635 return NULL_TREE;
34636 }
34637 else if (is_attribute_p ("sysv_abi", name))
34638 {
34639 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34640 {
34641 error ("ms_abi and sysv_abi attributes are not compatible");
34642 }
34643
34644 return NULL_TREE;
34645 }
34646
34647 return NULL_TREE;
34648 }
34649
34650 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34651 struct attribute_spec.handler. */
34652 static tree
34653 ix86_handle_struct_attribute (tree *node, tree name,
34654 tree args ATTRIBUTE_UNUSED,
34655 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34656 {
34657 tree *type = NULL;
34658 if (DECL_P (*node))
34659 {
34660 if (TREE_CODE (*node) == TYPE_DECL)
34661 type = &TREE_TYPE (*node);
34662 }
34663 else
34664 type = node;
34665
34666 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34667 {
34668 warning (OPT_Wattributes, "%qE attribute ignored",
34669 name);
34670 *no_add_attrs = true;
34671 }
34672
34673 else if ((is_attribute_p ("ms_struct", name)
34674 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34675 || ((is_attribute_p ("gcc_struct", name)
34676 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34677 {
34678 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34679 name);
34680 *no_add_attrs = true;
34681 }
34682
34683 return NULL_TREE;
34684 }
34685
34686 static tree
34687 ix86_handle_fndecl_attribute (tree *node, tree name,
34688 tree args ATTRIBUTE_UNUSED,
34689 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34690 {
34691 if (TREE_CODE (*node) != FUNCTION_DECL)
34692 {
34693 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34694 name);
34695 *no_add_attrs = true;
34696 }
34697 return NULL_TREE;
34698 }
34699
34700 static bool
34701 ix86_ms_bitfield_layout_p (const_tree record_type)
34702 {
34703 return ((TARGET_MS_BITFIELD_LAYOUT
34704 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34705 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34706 }
34707
34708 /* Returns an expression indicating where the this parameter is
34709 located on entry to the FUNCTION. */
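/* For example, under the usual parameter-passing conventions this yields
   %rdi for the 64-bit SysV ABI (%rsi when the return value is passed in
   memory), %rcx or %rdx for the 64-bit MS ABI, %ecx for 32-bit fastcall
   and thiscall, and a stack slot such as 4(%esp) (8(%esp) for the
   aggregate-return case) for plain 32-bit calls.  */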
34710
34711 static rtx
34712 x86_this_parameter (tree function)
34713 {
34714 tree type = TREE_TYPE (function);
34715 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34716 int nregs;
34717
34718 if (TARGET_64BIT)
34719 {
34720 const int *parm_regs;
34721
34722 if (ix86_function_type_abi (type) == MS_ABI)
34723 parm_regs = x86_64_ms_abi_int_parameter_registers;
34724 else
34725 parm_regs = x86_64_int_parameter_registers;
34726 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34727 }
34728
34729 nregs = ix86_function_regparm (type, function);
34730
34731 if (nregs > 0 && !stdarg_p (type))
34732 {
34733 int regno;
34734 unsigned int ccvt = ix86_get_callcvt (type);
34735
34736 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34737 regno = aggr ? DX_REG : CX_REG;
34738 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34739 {
34740 regno = CX_REG;
34741 if (aggr)
34742 return gen_rtx_MEM (SImode,
34743 plus_constant (Pmode, stack_pointer_rtx, 4));
34744 }
34745 else
34746 {
34747 regno = AX_REG;
34748 if (aggr)
34749 {
34750 regno = DX_REG;
34751 if (nregs == 1)
34752 return gen_rtx_MEM (SImode,
34753 plus_constant (Pmode,
34754 stack_pointer_rtx, 4));
34755 }
34756 }
34757 return gen_rtx_REG (SImode, regno);
34758 }
34759
34760 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34761 aggr ? 8 : 4));
34762 }
34763
34764 /* Determine whether x86_output_mi_thunk can succeed. */
34765
34766 static bool
34767 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34768 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34769 HOST_WIDE_INT vcall_offset, const_tree function)
34770 {
34771 /* 64-bit can handle anything. */
34772 if (TARGET_64BIT)
34773 return true;
34774
34775 /* For 32-bit, everything's fine if we have one free register. */
34776 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34777 return true;
34778
34779 /* Need a free register for vcall_offset. */
34780 if (vcall_offset)
34781 return false;
34782
34783 /* Need a free register for GOT references. */
34784 if (flag_pic && !targetm.binds_local_p (function))
34785 return false;
34786
34787 /* Otherwise ok. */
34788 return true;
34789 }
34790
34791 /* Output the assembler code for a thunk function. THUNK_DECL is the
34792 declaration for the thunk function itself, FUNCTION is the decl for
34793 the target function. DELTA is an immediate constant offset to be
34794 added to THIS. If VCALL_OFFSET is nonzero, the word at
34795 *(*this + vcall_offset) should be added to THIS. */
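/* As an illustration, a 32-bit non-PIC thunk with DELTA == -12, zero
   VCALL_OFFSET and a locally bound target f comes out as roughly

	addl	$-12, 4(%esp)
	jmp	f

   with the exact location of THIS given by x86_this_parameter above.  */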
34796
34797 static void
34798 x86_output_mi_thunk (FILE *file,
34799 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34800 HOST_WIDE_INT vcall_offset, tree function)
34801 {
34802 rtx this_param = x86_this_parameter (function);
34803 rtx this_reg, tmp, fnaddr;
34804 unsigned int tmp_regno;
34805
34806 if (TARGET_64BIT)
34807 tmp_regno = R10_REG;
34808 else
34809 {
34810 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34811 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34812 tmp_regno = AX_REG;
34813 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34814 tmp_regno = DX_REG;
34815 else
34816 tmp_regno = CX_REG;
34817 }
34818
34819 emit_note (NOTE_INSN_PROLOGUE_END);
34820
34821 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34822 pull it in now and let DELTA benefit. */
34823 if (REG_P (this_param))
34824 this_reg = this_param;
34825 else if (vcall_offset)
34826 {
34827 /* Put the this parameter into %eax. */
34828 this_reg = gen_rtx_REG (Pmode, AX_REG);
34829 emit_move_insn (this_reg, this_param);
34830 }
34831 else
34832 this_reg = NULL_RTX;
34833
34834 /* Adjust the this parameter by a fixed constant. */
34835 if (delta)
34836 {
34837 rtx delta_rtx = GEN_INT (delta);
34838 rtx delta_dst = this_reg ? this_reg : this_param;
34839
34840 if (TARGET_64BIT)
34841 {
34842 if (!x86_64_general_operand (delta_rtx, Pmode))
34843 {
34844 tmp = gen_rtx_REG (Pmode, tmp_regno);
34845 emit_move_insn (tmp, delta_rtx);
34846 delta_rtx = tmp;
34847 }
34848 }
34849
34850 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34851 }
34852
34853 /* Adjust the this parameter by a value stored in the vtable. */
34854 if (vcall_offset)
34855 {
34856 rtx vcall_addr, vcall_mem, this_mem;
34857
34858 tmp = gen_rtx_REG (Pmode, tmp_regno);
34859
34860 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34861 if (Pmode != ptr_mode)
34862 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34863 emit_move_insn (tmp, this_mem);
34864
34865 /* Adjust the this parameter. */
34866 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34867 if (TARGET_64BIT
34868 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34869 {
34870 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34871 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34872 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34873 }
34874
34875 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34876 if (Pmode != ptr_mode)
34877 emit_insn (gen_addsi_1_zext (this_reg,
34878 gen_rtx_REG (ptr_mode,
34879 REGNO (this_reg)),
34880 vcall_mem));
34881 else
34882 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34883 }
34884
34885 /* If necessary, drop THIS back to its stack slot. */
34886 if (this_reg && this_reg != this_param)
34887 emit_move_insn (this_param, this_reg);
34888
34889 fnaddr = XEXP (DECL_RTL (function), 0);
34890 if (TARGET_64BIT)
34891 {
34892 if (!flag_pic || targetm.binds_local_p (function)
34893 || cfun->machine->call_abi == MS_ABI)
34894 ;
34895 else
34896 {
34897 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34898 tmp = gen_rtx_CONST (Pmode, tmp);
34899 fnaddr = gen_rtx_MEM (Pmode, tmp);
34900 }
34901 }
34902 else
34903 {
34904 if (!flag_pic || targetm.binds_local_p (function))
34905 ;
34906 #if TARGET_MACHO
34907 else if (TARGET_MACHO)
34908 {
34909 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34910 fnaddr = XEXP (fnaddr, 0);
34911 }
34912 #endif /* TARGET_MACHO */
34913 else
34914 {
34915 tmp = gen_rtx_REG (Pmode, CX_REG);
34916 output_set_got (tmp, NULL_RTX);
34917
34918 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34919 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34920 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34921 }
34922 }
34923
34924 /* Our sibling call patterns do not allow memories, because we have no
34925 predicate that can distinguish between frame and non-frame memory.
34926 For our purposes here, we can get away with (ab)using a jump pattern,
34927 because we're going to do no optimization. */
34928 if (MEM_P (fnaddr))
34929 emit_jump_insn (gen_indirect_jump (fnaddr));
34930 else
34931 {
34932 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
34933 fnaddr = legitimize_pic_address (fnaddr,
34934 gen_rtx_REG (Pmode, tmp_regno));
34935
34936 if (!sibcall_insn_operand (fnaddr, word_mode))
34937 {
34938 tmp = gen_rtx_REG (word_mode, tmp_regno);
34939 if (GET_MODE (fnaddr) != word_mode)
34940 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
34941 emit_move_insn (tmp, fnaddr);
34942 fnaddr = tmp;
34943 }
34944
34945 tmp = gen_rtx_MEM (QImode, fnaddr);
34946 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
34947 tmp = emit_call_insn (tmp);
34948 SIBLING_CALL_P (tmp) = 1;
34949 }
34950 emit_barrier ();
34951
34952 /* Emit just enough of rest_of_compilation to get the insns emitted.
34953 Note that use_thunk calls assemble_start_function et al. */
34954 tmp = get_insns ();
34955 shorten_branches (tmp);
34956 final_start_function (tmp, file, 1);
34957 final (tmp, file, 1);
34958 final_end_function ();
34959 }
34960
34961 static void
34962 x86_file_start (void)
34963 {
34964 default_file_start ();
34965 #if TARGET_MACHO
34966 darwin_file_start ();
34967 #endif
34968 if (X86_FILE_START_VERSION_DIRECTIVE)
34969 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34970 if (X86_FILE_START_FLTUSED)
34971 fputs ("\t.global\t__fltused\n", asm_out_file);
34972 if (ix86_asm_dialect == ASM_INTEL)
34973 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34974 }
34975
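/* In 32-bit mode without -malign-double, cap the alignment of double,
   long long and similar fields at 32 bits, matching the traditional
   i386 ABI; e.g. a "double" struct member is then 4-byte rather than
   8-byte aligned.  */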
34976 int
34977 x86_field_alignment (tree field, int computed)
34978 {
34979 enum machine_mode mode;
34980 tree type = TREE_TYPE (field);
34981
34982 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34983 return computed;
34984 mode = TYPE_MODE (strip_array_types (type));
34985 if (mode == DFmode || mode == DCmode
34986 || GET_MODE_CLASS (mode) == MODE_INT
34987 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34988 return MIN (32, computed);
34989 return computed;
34990 }
34991
34992 /* Output assembler code to FILE to increment profiler label # LABELNO
34993 for profiling a function entry. */
34994 void
34995 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34996 {
34997 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34998 : MCOUNT_NAME);
34999
35000 if (TARGET_64BIT)
35001 {
35002 #ifndef NO_PROFILE_COUNTERS
35003 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35004 #endif
35005
35006 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35007 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35008 else
35009 fprintf (file, "\tcall\t%s\n", mcount_name);
35010 }
35011 else if (flag_pic)
35012 {
35013 #ifndef NO_PROFILE_COUNTERS
35014 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35015 LPREFIX, labelno);
35016 #endif
35017 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35018 }
35019 else
35020 {
35021 #ifndef NO_PROFILE_COUNTERS
35022 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35023 LPREFIX, labelno);
35024 #endif
35025 fprintf (file, "\tcall\t%s\n", mcount_name);
35026 }
35027 }
35028
35029 /* We don't have exact information about the insn sizes, but we may assume
35030 quite safely that we are informed about all 1-byte insns and memory
35031 address sizes. This is enough to eliminate unnecessary padding in
35032 99% of cases. */
35033
35034 static int
35035 min_insn_size (rtx insn)
35036 {
35037 int l = 0, len;
35038
35039 if (!INSN_P (insn) || !active_insn_p (insn))
35040 return 0;
35041
35042 /* Discard the alignments we have emitted, and jump table data.  */
35043 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35044 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35045 return 0;
35046 if (JUMP_TABLE_DATA_P (insn))
35047 return 0;
35048
35049 /* Important case - calls are always 5 bytes.
35050 It is common to have many calls in a row.  */
35051 if (CALL_P (insn)
35052 && symbolic_reference_mentioned_p (PATTERN (insn))
35053 && !SIBLING_CALL_P (insn))
35054 return 5;
35055 len = get_attr_length (insn);
35056 if (len <= 1)
35057 return 1;
35058
35059 /* For normal instructions we rely on get_attr_length being exact,
35060 with a few exceptions. */
35061 if (!JUMP_P (insn))
35062 {
35063 enum attr_type type = get_attr_type (insn);
35064
35065 switch (type)
35066 {
35067 case TYPE_MULTI:
35068 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35069 || asm_noperands (PATTERN (insn)) >= 0)
35070 return 0;
35071 break;
35072 case TYPE_OTHER:
35073 case TYPE_FCMP:
35074 break;
35075 default:
35076 /* Otherwise trust get_attr_length. */
35077 return len;
35078 }
35079
35080 l = get_attr_length_address (insn);
35081 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35082 l = 4;
35083 }
35084 if (l)
35085 return 1+l;
35086 else
35087 return 2;
35088 }
35089
35090 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35091
35092 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16-byte
35093 window.  */
35094
35095 static void
35096 ix86_avoid_jump_mispredicts (void)
35097 {
35098 rtx insn, start = get_insns ();
35099 int nbytes = 0, njumps = 0;
35100 int isjump = 0;
35101
35102 /* Look for all minimal intervals of instructions containing 4 jumps.
35103 The intervals are bounded by START and INSN. NBYTES is the total
35104 size of instructions in the interval including INSN and not including
35105 START. When NBYTES is smaller than 16, it is possible that the end
35106 of START and the end of INSN land in the same 16-byte page.
35107
35108 The smallest offset in that page at which INSN can start is the case where
35109 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35110 We pad the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
35111 */
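/* For instance, if the three previous jumps plus INSN occupy only 14
   bytes, gen_pad below requests 15 - 14 + min_insn_size (INSN) bytes of
   padding in front of INSN, enough that the four jumps can no longer
   share a single 16-byte window in the worst-case placement.  */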
35112 for (insn = start; insn; insn = NEXT_INSN (insn))
35113 {
35114 int min_size;
35115
35116 if (LABEL_P (insn))
35117 {
35118 int align = label_to_alignment (insn);
35119 int max_skip = label_to_max_skip (insn);
35120
35121 if (max_skip > 15)
35122 max_skip = 15;
35123 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35124 already in the current 16-byte page, because otherwise
35125 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35126 bytes to reach a 16-byte boundary.  */
35127 if (align <= 0
35128 || (align <= 3 && max_skip != (1 << align) - 1))
35129 max_skip = 0;
35130 if (dump_file)
35131 fprintf (dump_file, "Label %i with max_skip %i\n",
35132 INSN_UID (insn), max_skip);
35133 if (max_skip)
35134 {
35135 while (nbytes + max_skip >= 16)
35136 {
35137 start = NEXT_INSN (start);
35138 if ((JUMP_P (start)
35139 && GET_CODE (PATTERN (start)) != ADDR_VEC
35140 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35141 || CALL_P (start))
35142 njumps--, isjump = 1;
35143 else
35144 isjump = 0;
35145 nbytes -= min_insn_size (start);
35146 }
35147 }
35148 continue;
35149 }
35150
35151 min_size = min_insn_size (insn);
35152 nbytes += min_size;
35153 if (dump_file)
35154 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35155 INSN_UID (insn), min_size);
35156 if ((JUMP_P (insn)
35157 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35158 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35159 || CALL_P (insn))
35160 njumps++;
35161 else
35162 continue;
35163
35164 while (njumps > 3)
35165 {
35166 start = NEXT_INSN (start);
35167 if ((JUMP_P (start)
35168 && GET_CODE (PATTERN (start)) != ADDR_VEC
35169 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35170 || CALL_P (start))
35171 njumps--, isjump = 1;
35172 else
35173 isjump = 0;
35174 nbytes -= min_insn_size (start);
35175 }
35176 gcc_assert (njumps >= 0);
35177 if (dump_file)
35178 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35179 INSN_UID (start), INSN_UID (insn), nbytes);
35180
35181 if (njumps == 3 && isjump && nbytes < 16)
35182 {
35183 int padsize = 15 - nbytes + min_insn_size (insn);
35184
35185 if (dump_file)
35186 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35187 INSN_UID (insn), padsize);
35188 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35189 }
35190 }
35191 }
35192 #endif
35193
35194 /* AMD Athlon works faster
35195 when RET is not the destination of a conditional jump or directly
35196 preceded by another jump instruction. We avoid the penalty by inserting
35197 a NOP just before such RET instructions.  */
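/* The replacement emitted below is the "long" return form (typically
   assembled as "rep ret"), which costs one extra byte but avoids the
   penalty described above; see simple_return_internal_long in
   i386.md.  */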
35198 static void
35199 ix86_pad_returns (void)
35200 {
35201 edge e;
35202 edge_iterator ei;
35203
35204 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35205 {
35206 basic_block bb = e->src;
35207 rtx ret = BB_END (bb);
35208 rtx prev;
35209 bool replace = false;
35210
35211 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35212 || optimize_bb_for_size_p (bb))
35213 continue;
35214 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35215 if (active_insn_p (prev) || LABEL_P (prev))
35216 break;
35217 if (prev && LABEL_P (prev))
35218 {
35219 edge e;
35220 edge_iterator ei;
35221
35222 FOR_EACH_EDGE (e, ei, bb->preds)
35223 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35224 && !(e->flags & EDGE_FALLTHRU))
35225 replace = true;
35226 }
35227 if (!replace)
35228 {
35229 prev = prev_active_insn (ret);
35230 if (prev
35231 && ((JUMP_P (prev) && any_condjump_p (prev))
35232 || CALL_P (prev)))
35233 replace = true;
35234 /* Empty functions get a branch mispredict even when
35235 the jump destination is not visible to us.  */
35236 if (!prev && !optimize_function_for_size_p (cfun))
35237 replace = true;
35238 }
35239 if (replace)
35240 {
35241 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35242 delete_insn (ret);
35243 }
35244 }
35245 }
35246
35247 /* Count the minimum number of instructions in BB. Return 4 if the
35248 number of instructions >= 4. */
35249
35250 static int
35251 ix86_count_insn_bb (basic_block bb)
35252 {
35253 rtx insn;
35254 int insn_count = 0;
35255
35256 /* Count number of instructions in this block. Return 4 if the number
35257 of instructions >= 4. */
35258 FOR_BB_INSNS (bb, insn)
35259 {
35260 /* This only happens in exit blocks.  */
35261 if (JUMP_P (insn)
35262 && ANY_RETURN_P (PATTERN (insn)))
35263 break;
35264
35265 if (NONDEBUG_INSN_P (insn)
35266 && GET_CODE (PATTERN (insn)) != USE
35267 && GET_CODE (PATTERN (insn)) != CLOBBER)
35268 {
35269 insn_count++;
35270 if (insn_count >= 4)
35271 return insn_count;
35272 }
35273 }
35274
35275 return insn_count;
35276 }
35277
35278
35279 /* Count the minimum number of instructions in code path in BB.
35280 Return 4 if the number of instructions >= 4. */
35281
35282 static int
35283 ix86_count_insn (basic_block bb)
35284 {
35285 edge e;
35286 edge_iterator ei;
35287 int min_prev_count;
35288
35289 /* Only bother counting instructions along paths with no
35290 more than 2 basic blocks between entry and exit. Given
35291 that BB has an edge to exit, determine if a predecessor
35292 of BB has an edge from entry. If so, compute the number
35293 of instructions in the predecessor block. If there
35294 happen to be multiple such blocks, compute the minimum. */
35295 min_prev_count = 4;
35296 FOR_EACH_EDGE (e, ei, bb->preds)
35297 {
35298 edge prev_e;
35299 edge_iterator prev_ei;
35300
35301 if (e->src == ENTRY_BLOCK_PTR)
35302 {
35303 min_prev_count = 0;
35304 break;
35305 }
35306 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35307 {
35308 if (prev_e->src == ENTRY_BLOCK_PTR)
35309 {
35310 int count = ix86_count_insn_bb (e->src);
35311 if (count < min_prev_count)
35312 min_prev_count = count;
35313 break;
35314 }
35315 }
35316 }
35317
35318 if (min_prev_count < 4)
35319 min_prev_count += ix86_count_insn_bb (bb);
35320
35321 return min_prev_count;
35322 }
35323
35324 /* Pad short function to 4 instructions. */
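/* For instance, a function body containing only two real instructions
   receives 2 * (4 - 2) == 4 NOPs just before the epilogue; pairs of
   NOPs are emitted because two NOPs are counted as a single instruction
   here (see gen_nops in i386.md). TARGET_PAD_SHORT_FUNCTION is an
   Atom-oriented tuning.  */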
35325
35326 static void
35327 ix86_pad_short_function (void)
35328 {
35329 edge e;
35330 edge_iterator ei;
35331
35332 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35333 {
35334 rtx ret = BB_END (e->src);
35335 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35336 {
35337 int insn_count = ix86_count_insn (e->src);
35338
35339 /* Pad short function. */
35340 if (insn_count < 4)
35341 {
35342 rtx insn = ret;
35343
35344 /* Find epilogue. */
35345 while (insn
35346 && (!NOTE_P (insn)
35347 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35348 insn = PREV_INSN (insn);
35349
35350 if (!insn)
35351 insn = ret;
35352
35353 /* Two NOPs count as one instruction. */
35354 insn_count = 2 * (4 - insn_count);
35355 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35356 }
35357 }
35358 }
35359 }
35360
35361 /* Implement machine specific optimizations. We implement padding of returns
35362 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
35363 static void
35364 ix86_reorg (void)
35365 {
35366 /* We are freeing block_for_insn in the toplev to keep compatibility
35367 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35368 compute_bb_for_insn ();
35369
35370 if (optimize && optimize_function_for_speed_p (cfun))
35371 {
35372 if (TARGET_PAD_SHORT_FUNCTION)
35373 ix86_pad_short_function ();
35374 else if (TARGET_PAD_RETURNS)
35375 ix86_pad_returns ();
35376 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35377 if (TARGET_FOUR_JUMP_LIMIT)
35378 ix86_avoid_jump_mispredicts ();
35379 #endif
35380 }
35381 }
35382
35383 /* Return nonzero when a QImode register that must be represented via a REX
35384 prefix is used.  */
35385 bool
35386 x86_extended_QIreg_mentioned_p (rtx insn)
35387 {
35388 int i;
35389 extract_insn_cached (insn);
35390 for (i = 0; i < recog_data.n_operands; i++)
35391 if (GENERAL_REG_P (recog_data.operand[i])
35392 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35393 return true;
35394 return false;
35395 }
35396
35397 /* Return nonzero when P points to a register encoded via a REX prefix.
35398 Called via for_each_rtx. */
35399 static int
35400 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35401 {
35402 unsigned int regno;
35403 if (!REG_P (*p))
35404 return 0;
35405 regno = REGNO (*p);
35406 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35407 }
35408
35409 /* Return true when INSN mentions a register that must be encoded using a
35410 REX prefix.  */
35411 bool
35412 x86_extended_reg_mentioned_p (rtx insn)
35413 {
35414 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35415 extended_reg_mentioned_1, NULL);
35416 }
35417
35418 /* If profitable, negate (without causing overflow) integer constant
35419 of mode MODE at location LOC. Return true in this case. */
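/* E.g. "addl $-4, %eax" is canonicalized to "subl $4, %eax" purely for
   readability, while "addl $128, %eax" becomes "subl $-128, %eax"
   because -128 still fits in a sign-extended 8-bit immediate whereas
   +128 needs a 32-bit one.  */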
35420 bool
35421 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35422 {
35423 HOST_WIDE_INT val;
35424
35425 if (!CONST_INT_P (*loc))
35426 return false;
35427
35428 switch (mode)
35429 {
35430 case DImode:
35431 /* DImode x86_64 constants must fit in 32 bits. */
35432 gcc_assert (x86_64_immediate_operand (*loc, mode));
35433
35434 mode = SImode;
35435 break;
35436
35437 case SImode:
35438 case HImode:
35439 case QImode:
35440 break;
35441
35442 default:
35443 gcc_unreachable ();
35444 }
35445
35446 /* Avoid overflows. */
35447 if (mode_signbit_p (mode, *loc))
35448 return false;
35449
35450 val = INTVAL (*loc);
35451
35452 /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
35453 Exception: -128 encodes smaller than 128, so keep -128 and negate +128 instead.  */
35454 if ((val < 0 && val != -128)
35455 || val == 128)
35456 {
35457 *loc = GEN_INT (-val);
35458 return true;
35459 }
35460
35461 return false;
35462 }
35463
35464 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35465 optabs would emit if we didn't have TFmode patterns. */
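/* The expansion below corresponds to this C sketch, for an unsigned
   input X converted to a floating-point result of type FP (FP is just
   a placeholder for the target mode):

     if ((signed) X >= 0)
       result = (FP) X;
     else
       {
	 half = (X >> 1) | (X & 1);
	 result = (FP) half + (FP) half;
       }

   The low bit is kept when halving so that the final rounding is not
   affected; converting the halved value and doubling it recovers the
   full magnitude.  */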
35466
35467 void
35468 x86_emit_floatuns (rtx operands[2])
35469 {
35470 rtx neglab, donelab, i0, i1, f0, in, out;
35471 enum machine_mode mode, inmode;
35472
35473 inmode = GET_MODE (operands[1]);
35474 gcc_assert (inmode == SImode || inmode == DImode);
35475
35476 out = operands[0];
35477 in = force_reg (inmode, operands[1]);
35478 mode = GET_MODE (out);
35479 neglab = gen_label_rtx ();
35480 donelab = gen_label_rtx ();
35481 f0 = gen_reg_rtx (mode);
35482
35483 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35484
35485 expand_float (out, in, 0);
35486
35487 emit_jump_insn (gen_jump (donelab));
35488 emit_barrier ();
35489
35490 emit_label (neglab);
35491
35492 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35493 1, OPTAB_DIRECT);
35494 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35495 1, OPTAB_DIRECT);
35496 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35497
35498 expand_float (f0, i0, 0);
35499
35500 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35501
35502 emit_label (donelab);
35503 }
35504 \f
35505 /* AVX2 does support 32-byte integer vector operations,
35506 thus the longest vector we are faced with is V32QImode. */
35507 #define MAX_VECT_LEN 32
35508
35509 struct expand_vec_perm_d
35510 {
35511 rtx target, op0, op1;
35512 unsigned char perm[MAX_VECT_LEN];
35513 enum machine_mode vmode;
35514 unsigned char nelt;
35515 bool one_operand_p;
35516 bool testing_p;
35517 };
35518
35519 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35520 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35521 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35522
35523 /* Get a vector mode of the same size as the original but with elements
35524 twice as wide. This is only guaranteed to apply to integral vectors. */
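/* E.g. V16QImode (16 x 8-bit) maps to V8HImode (8 x 16-bit) and
   V8HImode maps to V4SImode, all 16 bytes wide.  */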
35525
35526 static inline enum machine_mode
35527 get_mode_wider_vector (enum machine_mode o)
35528 {
35529 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35530 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35531 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35532 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35533 return n;
35534 }
35535
35536 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35537 with all elements equal to VAR. Return true if successful. */
35538
35539 static bool
35540 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35541 rtx target, rtx val)
35542 {
35543 bool ok;
35544
35545 switch (mode)
35546 {
35547 case V2SImode:
35548 case V2SFmode:
35549 if (!mmx_ok)
35550 return false;
35551 /* FALLTHRU */
35552
35553 case V4DFmode:
35554 case V4DImode:
35555 case V8SFmode:
35556 case V8SImode:
35557 case V2DFmode:
35558 case V2DImode:
35559 case V4SFmode:
35560 case V4SImode:
35561 {
35562 rtx insn, dup;
35563
35564 /* First attempt to recognize VAL as-is. */
35565 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35566 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35567 if (recog_memoized (insn) < 0)
35568 {
35569 rtx seq;
35570 /* If that fails, force VAL into a register. */
35571
35572 start_sequence ();
35573 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35574 seq = get_insns ();
35575 end_sequence ();
35576 if (seq)
35577 emit_insn_before (seq, insn);
35578
35579 ok = recog_memoized (insn) >= 0;
35580 gcc_assert (ok);
35581 }
35582 }
35583 return true;
35584
35585 case V4HImode:
35586 if (!mmx_ok)
35587 return false;
35588 if (TARGET_SSE || TARGET_3DNOW_A)
35589 {
35590 rtx x;
35591
35592 val = gen_lowpart (SImode, val);
35593 x = gen_rtx_TRUNCATE (HImode, val);
35594 x = gen_rtx_VEC_DUPLICATE (mode, x);
35595 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35596 return true;
35597 }
35598 goto widen;
35599
35600 case V8QImode:
35601 if (!mmx_ok)
35602 return false;
35603 goto widen;
35604
35605 case V8HImode:
35606 if (TARGET_SSE2)
35607 {
35608 struct expand_vec_perm_d dperm;
35609 rtx tmp1, tmp2;
35610
35611 permute:
35612 memset (&dperm, 0, sizeof (dperm));
35613 dperm.target = target;
35614 dperm.vmode = mode;
35615 dperm.nelt = GET_MODE_NUNITS (mode);
35616 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35617 dperm.one_operand_p = true;
35618
35619 /* Extend to SImode using a paradoxical SUBREG. */
35620 tmp1 = gen_reg_rtx (SImode);
35621 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35622
35623 /* Insert the SImode value as low element of a V4SImode vector. */
35624 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35625 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35626
35627 ok = (expand_vec_perm_1 (&dperm)
35628 || expand_vec_perm_broadcast_1 (&dperm));
35629 gcc_assert (ok);
35630 return ok;
35631 }
35632 goto widen;
35633
35634 case V16QImode:
35635 if (TARGET_SSE2)
35636 goto permute;
35637 goto widen;
35638
35639 widen:
35640 /* Replicate the value once into the next wider mode and recurse. */
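/* E.g. to splat a QImode value V into a vector of QImode elements,
   first form the HImode value (V << 8) | V and splat that into the
   vector of HImode elements of the same size, recursing again if that
   mode is not handled directly either.  */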
35641 {
35642 enum machine_mode smode, wsmode, wvmode;
35643 rtx x;
35644
35645 smode = GET_MODE_INNER (mode);
35646 wvmode = get_mode_wider_vector (mode);
35647 wsmode = GET_MODE_INNER (wvmode);
35648
35649 val = convert_modes (wsmode, smode, val, true);
35650 x = expand_simple_binop (wsmode, ASHIFT, val,
35651 GEN_INT (GET_MODE_BITSIZE (smode)),
35652 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35653 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35654
35655 x = gen_lowpart (wvmode, target);
35656 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35657 gcc_assert (ok);
35658 return ok;
35659 }
35660
35661 case V16HImode:
35662 case V32QImode:
35663 {
35664 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35665 rtx x = gen_reg_rtx (hvmode);
35666
35667 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35668 gcc_assert (ok);
35669
35670 x = gen_rtx_VEC_CONCAT (mode, x, x);
35671 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35672 }
35673 return true;
35674
35675 default:
35676 return false;
35677 }
35678 }
35679
35680 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35681 whose ONE_VAR element is VAR, and other elements are zero. Return true
35682 if successful. */
35683
35684 static bool
35685 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35686 rtx target, rtx var, int one_var)
35687 {
35688 enum machine_mode vsimode;
35689 rtx new_target;
35690 rtx x, tmp;
35691 bool use_vector_set = false;
35692
35693 switch (mode)
35694 {
35695 case V2DImode:
35696 /* For SSE4.1, we normally use vector set. But if the second
35697 element is zero and inter-unit moves are OK, we use movq
35698 instead. */
35699 use_vector_set = (TARGET_64BIT
35700 && TARGET_SSE4_1
35701 && !(TARGET_INTER_UNIT_MOVES
35702 && one_var == 0));
35703 break;
35704 case V16QImode:
35705 case V4SImode:
35706 case V4SFmode:
35707 use_vector_set = TARGET_SSE4_1;
35708 break;
35709 case V8HImode:
35710 use_vector_set = TARGET_SSE2;
35711 break;
35712 case V4HImode:
35713 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35714 break;
35715 case V32QImode:
35716 case V16HImode:
35717 case V8SImode:
35718 case V8SFmode:
35719 case V4DFmode:
35720 use_vector_set = TARGET_AVX;
35721 break;
35722 case V4DImode:
35723 /* Use ix86_expand_vector_set in 64-bit mode only.  */
35724 use_vector_set = TARGET_AVX && TARGET_64BIT;
35725 break;
35726 default:
35727 break;
35728 }
35729
35730 if (use_vector_set)
35731 {
35732 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35733 var = force_reg (GET_MODE_INNER (mode), var);
35734 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35735 return true;
35736 }
35737
35738 switch (mode)
35739 {
35740 case V2SFmode:
35741 case V2SImode:
35742 if (!mmx_ok)
35743 return false;
35744 /* FALLTHRU */
35745
35746 case V2DFmode:
35747 case V2DImode:
35748 if (one_var != 0)
35749 return false;
35750 var = force_reg (GET_MODE_INNER (mode), var);
35751 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35752 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35753 return true;
35754
35755 case V4SFmode:
35756 case V4SImode:
35757 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35758 new_target = gen_reg_rtx (mode);
35759 else
35760 new_target = target;
35761 var = force_reg (GET_MODE_INNER (mode), var);
35762 x = gen_rtx_VEC_DUPLICATE (mode, var);
35763 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35764 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35765 if (one_var != 0)
35766 {
35767 /* We need to shuffle the value to the correct position, so
35768 create a new pseudo to store the intermediate result. */
35769
35770 /* With SSE2, we can use the integer shuffle insns. */
35771 if (mode != V4SFmode && TARGET_SSE2)
35772 {
35773 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35774 const1_rtx,
35775 GEN_INT (one_var == 1 ? 0 : 1),
35776 GEN_INT (one_var == 2 ? 0 : 1),
35777 GEN_INT (one_var == 3 ? 0 : 1)));
35778 if (target != new_target)
35779 emit_move_insn (target, new_target);
35780 return true;
35781 }
35782
35783 /* Otherwise convert the intermediate result to V4SFmode and
35784 use the SSE1 shuffle instructions. */
35785 if (mode != V4SFmode)
35786 {
35787 tmp = gen_reg_rtx (V4SFmode);
35788 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35789 }
35790 else
35791 tmp = new_target;
35792
35793 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35794 const1_rtx,
35795 GEN_INT (one_var == 1 ? 0 : 1),
35796 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35797 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35798
35799 if (mode != V4SFmode)
35800 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35801 else if (tmp != target)
35802 emit_move_insn (target, tmp);
35803 }
35804 else if (target != new_target)
35805 emit_move_insn (target, new_target);
35806 return true;
35807
35808 case V8HImode:
35809 case V16QImode:
35810 vsimode = V4SImode;
35811 goto widen;
35812 case V4HImode:
35813 case V8QImode:
35814 if (!mmx_ok)
35815 return false;
35816 vsimode = V2SImode;
35817 goto widen;
35818 widen:
35819 if (one_var != 0)
35820 return false;
35821
35822 /* Zero extend the variable element to SImode and recurse. */
35823 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35824
35825 x = gen_reg_rtx (vsimode);
35826 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35827 var, one_var))
35828 gcc_unreachable ();
35829
35830 emit_move_insn (target, gen_lowpart (mode, x));
35831 return true;
35832
35833 default:
35834 return false;
35835 }
35836 }
35837
35838 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35839 consisting of the values in VALS. It is known that all elements
35840 except ONE_VAR are constants. Return true if successful. */
35841
35842 static bool
35843 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35844 rtx target, rtx vals, int one_var)
35845 {
35846 rtx var = XVECEXP (vals, 0, one_var);
35847 enum machine_mode wmode;
35848 rtx const_vec, x;
35849
35850 const_vec = copy_rtx (vals);
35851 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35852 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35853
35854 switch (mode)
35855 {
35856 case V2DFmode:
35857 case V2DImode:
35858 case V2SFmode:
35859 case V2SImode:
35860 /* For the two element vectors, it's just as easy to use
35861 the general case. */
35862 return false;
35863
35864 case V4DImode:
35865 /* Use ix86_expand_vector_set in 64-bit mode only.  */
35866 if (!TARGET_64BIT)
35867 return false;
35868 case V4DFmode:
35869 case V8SFmode:
35870 case V8SImode:
35871 case V16HImode:
35872 case V32QImode:
35873 case V4SFmode:
35874 case V4SImode:
35875 case V8HImode:
35876 case V4HImode:
35877 break;
35878
35879 case V16QImode:
35880 if (TARGET_SSE4_1)
35881 break;
35882 wmode = V8HImode;
35883 goto widen;
35884 case V8QImode:
35885 wmode = V4HImode;
35886 goto widen;
35887 widen:
35888 /* There's no way to set one QImode entry easily. Combine
35889 the variable value with its adjacent constant value, and
35890 promote to an HImode set. */
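/* E.g. for a V16QImode vector whose only variable element is at index 5,
   fold it with the constant at index 4 into the HImode value
   (var << 8) | const and perform a V8HImode vector set at index 2
   (== 5 >> 1).  */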
35891 x = XVECEXP (vals, 0, one_var ^ 1);
35892 if (one_var & 1)
35893 {
35894 var = convert_modes (HImode, QImode, var, true);
35895 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35896 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35897 x = GEN_INT (INTVAL (x) & 0xff);
35898 }
35899 else
35900 {
35901 var = convert_modes (HImode, QImode, var, true);
35902 x = gen_int_mode (INTVAL (x) << 8, HImode);
35903 }
35904 if (x != const0_rtx)
35905 var = expand_simple_binop (HImode, IOR, var, x, var,
35906 1, OPTAB_LIB_WIDEN);
35907
35908 x = gen_reg_rtx (wmode);
35909 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35910 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35911
35912 emit_move_insn (target, gen_lowpart (mode, x));
35913 return true;
35914
35915 default:
35916 return false;
35917 }
35918
35919 emit_move_insn (target, const_vec);
35920 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35921 return true;
35922 }
35923
35924 /* A subroutine of ix86_expand_vector_init_general. Use vector
35925 concatenate to handle the most general case: all values variable,
35926 and none identical. */
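/* E.g. a V8SFmode vector built from eight scalar values proceeds
   bottom-up: the scalars are paired into four V2SFmode vectors, those
   into two V4SFmode halves, and a final VEC_CONCAT yields the V8SFmode
   result.  */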
35927
35928 static void
35929 ix86_expand_vector_init_concat (enum machine_mode mode,
35930 rtx target, rtx *ops, int n)
35931 {
35932 enum machine_mode cmode, hmode = VOIDmode;
35933 rtx first[8], second[4];
35934 rtvec v;
35935 int i, j;
35936
35937 switch (n)
35938 {
35939 case 2:
35940 switch (mode)
35941 {
35942 case V8SImode:
35943 cmode = V4SImode;
35944 break;
35945 case V8SFmode:
35946 cmode = V4SFmode;
35947 break;
35948 case V4DImode:
35949 cmode = V2DImode;
35950 break;
35951 case V4DFmode:
35952 cmode = V2DFmode;
35953 break;
35954 case V4SImode:
35955 cmode = V2SImode;
35956 break;
35957 case V4SFmode:
35958 cmode = V2SFmode;
35959 break;
35960 case V2DImode:
35961 cmode = DImode;
35962 break;
35963 case V2SImode:
35964 cmode = SImode;
35965 break;
35966 case V2DFmode:
35967 cmode = DFmode;
35968 break;
35969 case V2SFmode:
35970 cmode = SFmode;
35971 break;
35972 default:
35973 gcc_unreachable ();
35974 }
35975
35976 if (!register_operand (ops[1], cmode))
35977 ops[1] = force_reg (cmode, ops[1]);
35978 if (!register_operand (ops[0], cmode))
35979 ops[0] = force_reg (cmode, ops[0]);
35980 emit_insn (gen_rtx_SET (VOIDmode, target,
35981 gen_rtx_VEC_CONCAT (mode, ops[0],
35982 ops[1])));
35983 break;
35984
35985 case 4:
35986 switch (mode)
35987 {
35988 case V4DImode:
35989 cmode = V2DImode;
35990 break;
35991 case V4DFmode:
35992 cmode = V2DFmode;
35993 break;
35994 case V4SImode:
35995 cmode = V2SImode;
35996 break;
35997 case V4SFmode:
35998 cmode = V2SFmode;
35999 break;
36000 default:
36001 gcc_unreachable ();
36002 }
36003 goto half;
36004
36005 case 8:
36006 switch (mode)
36007 {
36008 case V8SImode:
36009 cmode = V2SImode;
36010 hmode = V4SImode;
36011 break;
36012 case V8SFmode:
36013 cmode = V2SFmode;
36014 hmode = V4SFmode;
36015 break;
36016 default:
36017 gcc_unreachable ();
36018 }
36019 goto half;
36020
36021 half:
36022 /* FIXME: We process inputs backward to help RA. PR 36222. */
36023 i = n - 1;
36024 j = (n >> 1) - 1;
36025 for (; i > 0; i -= 2, j--)
36026 {
36027 first[j] = gen_reg_rtx (cmode);
36028 v = gen_rtvec (2, ops[i - 1], ops[i]);
36029 ix86_expand_vector_init (false, first[j],
36030 gen_rtx_PARALLEL (cmode, v));
36031 }
36032
36033 n >>= 1;
36034 if (n > 2)
36035 {
36036 gcc_assert (hmode != VOIDmode);
36037 for (i = j = 0; i < n; i += 2, j++)
36038 {
36039 second[j] = gen_reg_rtx (hmode);
36040 ix86_expand_vector_init_concat (hmode, second [j],
36041 &first [i], 2);
36042 }
36043 n >>= 1;
36044 ix86_expand_vector_init_concat (mode, target, second, n);
36045 }
36046 else
36047 ix86_expand_vector_init_concat (mode, target, first, n);
36048 break;
36049
36050 default:
36051 gcc_unreachable ();
36052 }
36053 }
36054
36055 /* A subroutine of ix86_expand_vector_init_general. Use vector
36056 interleave to handle the most general case: all values variable,
36057 and none identical. */
36058
36059 static void
36060 ix86_expand_vector_init_interleave (enum machine_mode mode,
36061 rtx target, rtx *ops, int n)
36062 {
36063 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36064 int i, j;
36065 rtx op0, op1;
36066 rtx (*gen_load_even) (rtx, rtx, rtx);
36067 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36068 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36069
36070 switch (mode)
36071 {
36072 case V8HImode:
36073 gen_load_even = gen_vec_setv8hi;
36074 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36075 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36076 inner_mode = HImode;
36077 first_imode = V4SImode;
36078 second_imode = V2DImode;
36079 third_imode = VOIDmode;
36080 break;
36081 case V16QImode:
36082 gen_load_even = gen_vec_setv16qi;
36083 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36084 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36085 inner_mode = QImode;
36086 first_imode = V8HImode;
36087 second_imode = V4SImode;
36088 third_imode = V2DImode;
36089 break;
36090 default:
36091 gcc_unreachable ();
36092 }
36093
36094 for (i = 0; i < n; i++)
36095 {
36096 /* Extend the odd element to SImode using a paradoxical SUBREG.  */
36097 op0 = gen_reg_rtx (SImode);
36098 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36099
36100 /* Insert the SImode value as the low element of a V4SImode vector.  */
36101 op1 = gen_reg_rtx (V4SImode);
36102 op0 = gen_rtx_VEC_MERGE (V4SImode,
36103 gen_rtx_VEC_DUPLICATE (V4SImode,
36104 op0),
36105 CONST0_RTX (V4SImode),
36106 const1_rtx);
36107 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36108
36109 /* Cast the V4SImode vector back to a vector in the original mode.  */
36110 op0 = gen_reg_rtx (mode);
36111 emit_move_insn (op0, gen_lowpart (mode, op1));
36112
36113 /* Load even elements into the second position.  */
36114 emit_insn (gen_load_even (op0,
36115 force_reg (inner_mode,
36116 ops [i + i + 1]),
36117 const1_rtx));
36118
36119 /* Cast vector to FIRST_IMODE vector. */
36120 ops[i] = gen_reg_rtx (first_imode);
36121 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36122 }
36123
36124 /* Interleave low FIRST_IMODE vectors. */
36125 for (i = j = 0; i < n; i += 2, j++)
36126 {
36127 op0 = gen_reg_rtx (first_imode);
36128 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36129
36130 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36131 ops[j] = gen_reg_rtx (second_imode);
36132 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36133 }
36134
36135 /* Interleave low SECOND_IMODE vectors. */
36136 switch (second_imode)
36137 {
36138 case V4SImode:
36139 for (i = j = 0; i < n / 2; i += 2, j++)
36140 {
36141 op0 = gen_reg_rtx (second_imode);
36142 emit_insn (gen_interleave_second_low (op0, ops[i],
36143 ops[i + 1]));
36144
36145 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36146 vector. */
36147 ops[j] = gen_reg_rtx (third_imode);
36148 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36149 }
36150 second_imode = V2DImode;
36151 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36152 /* FALLTHRU */
36153
36154 case V2DImode:
36155 op0 = gen_reg_rtx (second_imode);
36156 emit_insn (gen_interleave_second_low (op0, ops[0],
36157 ops[1]));
36158
36159 /* Cast the SECOND_IMODE vector back to a vector in the original
36160 mode.  */
36161 emit_insn (gen_rtx_SET (VOIDmode, target,
36162 gen_lowpart (mode, op0)));
36163 break;
36164
36165 default:
36166 gcc_unreachable ();
36167 }
36168 }
36169
36170 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36171 all values variable, and none identical. */
36172
36173 static void
36174 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36175 rtx target, rtx vals)
36176 {
36177 rtx ops[32], op0, op1;
36178 enum machine_mode half_mode = VOIDmode;
36179 int n, i;
36180
36181 switch (mode)
36182 {
36183 case V2SFmode:
36184 case V2SImode:
36185 if (!mmx_ok && !TARGET_SSE)
36186 break;
36187 /* FALLTHRU */
36188
36189 case V8SFmode:
36190 case V8SImode:
36191 case V4DFmode:
36192 case V4DImode:
36193 case V4SFmode:
36194 case V4SImode:
36195 case V2DFmode:
36196 case V2DImode:
36197 n = GET_MODE_NUNITS (mode);
36198 for (i = 0; i < n; i++)
36199 ops[i] = XVECEXP (vals, 0, i);
36200 ix86_expand_vector_init_concat (mode, target, ops, n);
36201 return;
36202
36203 case V32QImode:
36204 half_mode = V16QImode;
36205 goto half;
36206
36207 case V16HImode:
36208 half_mode = V8HImode;
36209 goto half;
36210
36211 half:
36212 n = GET_MODE_NUNITS (mode);
36213 for (i = 0; i < n; i++)
36214 ops[i] = XVECEXP (vals, 0, i);
36215 op0 = gen_reg_rtx (half_mode);
36216 op1 = gen_reg_rtx (half_mode);
36217 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36218 n >> 2);
36219 ix86_expand_vector_init_interleave (half_mode, op1,
36220 &ops [n >> 1], n >> 2);
36221 emit_insn (gen_rtx_SET (VOIDmode, target,
36222 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36223 return;
36224
36225 case V16QImode:
36226 if (!TARGET_SSE4_1)
36227 break;
36228 /* FALLTHRU */
36229
36230 case V8HImode:
36231 if (!TARGET_SSE2)
36232 break;
36233
36234 /* Don't use ix86_expand_vector_init_interleave if we can't
36235 move from GPR to SSE register directly. */
36236 if (!TARGET_INTER_UNIT_MOVES)
36237 break;
36238
36239 n = GET_MODE_NUNITS (mode);
36240 for (i = 0; i < n; i++)
36241 ops[i] = XVECEXP (vals, 0, i);
36242 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36243 return;
36244
36245 case V4HImode:
36246 case V8QImode:
36247 break;
36248
36249 default:
36250 gcc_unreachable ();
36251 }
36252
36253 {
36254 int i, j, n_elts, n_words, n_elt_per_word;
36255 enum machine_mode inner_mode;
36256 rtx words[4], shift;
36257
36258 inner_mode = GET_MODE_INNER (mode);
36259 n_elts = GET_MODE_NUNITS (mode);
36260 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36261 n_elt_per_word = n_elts / n_words;
36262 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36263
36264 for (i = 0; i < n_words; ++i)
36265 {
36266 rtx word = NULL_RTX;
36267
36268 for (j = 0; j < n_elt_per_word; ++j)
36269 {
36270 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36271 elt = convert_modes (word_mode, inner_mode, elt, true);
36272
36273 if (j == 0)
36274 word = elt;
36275 else
36276 {
36277 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36278 word, 1, OPTAB_LIB_WIDEN);
36279 word = expand_simple_binop (word_mode, IOR, word, elt,
36280 word, 1, OPTAB_LIB_WIDEN);
36281 }
36282 }
36283
36284 words[i] = word;
36285 }
36286
36287 if (n_words == 1)
36288 emit_move_insn (target, gen_lowpart (mode, words[0]));
36289 else if (n_words == 2)
36290 {
36291 rtx tmp = gen_reg_rtx (mode);
36292 emit_clobber (tmp);
36293 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36294 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36295 emit_move_insn (target, tmp);
36296 }
36297 else if (n_words == 4)
36298 {
36299 rtx tmp = gen_reg_rtx (V4SImode);
36300 gcc_assert (word_mode == SImode);
36301 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36302 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36303 emit_move_insn (target, gen_lowpart (mode, tmp));
36304 }
36305 else
36306 gcc_unreachable ();
36307 }
36308 }
36309
36310 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36311 instructions unless MMX_OK is true. */
36312
36313 void
36314 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36315 {
36316 enum machine_mode mode = GET_MODE (target);
36317 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36318 int n_elts = GET_MODE_NUNITS (mode);
36319 int n_var = 0, one_var = -1;
36320 bool all_same = true, all_const_zero = true;
36321 int i;
36322 rtx x;
36323
36324 for (i = 0; i < n_elts; ++i)
36325 {
36326 x = XVECEXP (vals, 0, i);
36327 if (!(CONST_INT_P (x)
36328 || GET_CODE (x) == CONST_DOUBLE
36329 || GET_CODE (x) == CONST_FIXED))
36330 n_var++, one_var = i;
36331 else if (x != CONST0_RTX (inner_mode))
36332 all_const_zero = false;
36333 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36334 all_same = false;
36335 }
36336
36337 /* Constants are best loaded from the constant pool. */
36338 if (n_var == 0)
36339 {
36340 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36341 return;
36342 }
36343
36344 /* If all values are identical, broadcast the value. */
36345 if (all_same
36346 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36347 XVECEXP (vals, 0, 0)))
36348 return;
36349
36350 /* Values where only one field is non-constant are best loaded from
36351 the pool and overwritten via move later. */
36352 if (n_var == 1)
36353 {
36354 if (all_const_zero
36355 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36356 XVECEXP (vals, 0, one_var),
36357 one_var))
36358 return;
36359
36360 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36361 return;
36362 }
36363
36364 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36365 }
36366
36367 void
36368 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36369 {
36370 enum machine_mode mode = GET_MODE (target);
36371 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36372 enum machine_mode half_mode;
36373 bool use_vec_merge = false;
36374 rtx tmp;
36375 static rtx (*gen_extract[6][2]) (rtx, rtx)
36376 = {
36377 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36378 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36379 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36380 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36381 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36382 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36383 };
36384 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36385 = {
36386 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36387 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36388 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36389 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36390 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36391 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36392 };
36393 int i, j, n;
36394
36395 switch (mode)
36396 {
36397 case V2SFmode:
36398 case V2SImode:
36399 if (mmx_ok)
36400 {
36401 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36402 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36403 if (elt == 0)
36404 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36405 else
36406 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36407 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36408 return;
36409 }
36410 break;
36411
36412 case V2DImode:
36413 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36414 if (use_vec_merge)
36415 break;
36416
36417 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36418 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36419 if (elt == 0)
36420 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36421 else
36422 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36423 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36424 return;
36425
36426 case V2DFmode:
36427 {
36428 rtx op0, op1;
36429
36430 /* For the two element vectors, we implement a VEC_CONCAT with
36431 the extraction of the other element. */
36432
36433 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36434 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36435
36436 if (elt == 0)
36437 op0 = val, op1 = tmp;
36438 else
36439 op0 = tmp, op1 = val;
36440
36441 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36442 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36443 }
36444 return;
36445
36446 case V4SFmode:
36447 use_vec_merge = TARGET_SSE4_1;
36448 if (use_vec_merge)
36449 break;
36450
36451 switch (elt)
36452 {
36453 case 0:
36454 use_vec_merge = true;
36455 break;
36456
36457 case 1:
36458 /* tmp = target = A B C D */
36459 tmp = copy_to_reg (target);
36460 /* target = A A B B */
36461 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36462 /* target = X A B B */
36463 ix86_expand_vector_set (false, target, val, 0);
36464 /* target = A X C D */
36465 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36466 const1_rtx, const0_rtx,
36467 GEN_INT (2+4), GEN_INT (3+4)));
36468 return;
36469
36470 case 2:
36471 /* tmp = target = A B C D */
36472 tmp = copy_to_reg (target);
36473 /* tmp = X B C D */
36474 ix86_expand_vector_set (false, tmp, val, 0);
36475 /* target = A B X D */
36476 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36477 const0_rtx, const1_rtx,
36478 GEN_INT (0+4), GEN_INT (3+4)));
36479 return;
36480
36481 case 3:
36482 /* tmp = target = A B C D */
36483 tmp = copy_to_reg (target);
36484 /* tmp = X B C D */
36485 ix86_expand_vector_set (false, tmp, val, 0);
36486 /* target = A B C X */
36487 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36488 const0_rtx, const1_rtx,
36489 GEN_INT (2+4), GEN_INT (0+4)));
36490 return;
36491
36492 default:
36493 gcc_unreachable ();
36494 }
36495 break;
36496
36497 case V4SImode:
36498 use_vec_merge = TARGET_SSE4_1;
36499 if (use_vec_merge)
36500 break;
36501
36502 /* Element 0 handled by vec_merge below. */
36503 if (elt == 0)
36504 {
36505 use_vec_merge = true;
36506 break;
36507 }
36508
36509 if (TARGET_SSE2)
36510 {
36511 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36512 store into element 0, then shuffle them back. */
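/* For example, with ELT == 2 the permutation below becomes {2, 1, 0, 3}:
the first pshufd brings element 2 into slot 0, the recursive call
stores VAL there, and repeating the same self-inverse permutation
moves everything back into place. */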
36513
36514 rtx order[4];
36515
36516 order[0] = GEN_INT (elt);
36517 order[1] = const1_rtx;
36518 order[2] = const2_rtx;
36519 order[3] = GEN_INT (3);
36520 order[elt] = const0_rtx;
36521
36522 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36523 order[1], order[2], order[3]));
36524
36525 ix86_expand_vector_set (false, target, val, 0);
36526
36527 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36528 order[1], order[2], order[3]));
36529 }
36530 else
36531 {
36532 /* For SSE1, we have to reuse the V4SF code. */
36533 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36534 gen_lowpart (SFmode, val), elt);
36535 }
36536 return;
36537
36538 case V8HImode:
36539 use_vec_merge = TARGET_SSE2;
36540 break;
36541 case V4HImode:
36542 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36543 break;
36544
36545 case V16QImode:
36546 use_vec_merge = TARGET_SSE4_1;
36547 break;
36548
36549 case V8QImode:
36550 break;
36551
36552 case V32QImode:
36553 half_mode = V16QImode;
36554 j = 0;
36555 n = 16;
36556 goto half;
36557
36558 case V16HImode:
36559 half_mode = V8HImode;
36560 j = 1;
36561 n = 8;
36562 goto half;
36563
36564 case V8SImode:
36565 half_mode = V4SImode;
36566 j = 2;
36567 n = 4;
36568 goto half;
36569
36570 case V4DImode:
36571 half_mode = V2DImode;
36572 j = 3;
36573 n = 2;
36574 goto half;
36575
36576 case V8SFmode:
36577 half_mode = V4SFmode;
36578 j = 4;
36579 n = 4;
36580 goto half;
36581
36582 case V4DFmode:
36583 half_mode = V2DFmode;
36584 j = 5;
36585 n = 2;
36586 goto half;
36587
36588 half:
36589 /* Compute offset. */
36590 i = elt / n;
36591 elt %= n;
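/* For example, setting element 5 of a V8SImode vector gives n == 4,
so i == 1 selects the high 128-bit half and elt becomes 1 within
that half; the half is extracted, updated and re-inserted below. */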
36592
36593 gcc_assert (i <= 1);
36594
36595 /* Extract the half. */
36596 tmp = gen_reg_rtx (half_mode);
36597 emit_insn (gen_extract[j][i] (tmp, target));
36598
36599 /* Put val in tmp at elt. */
36600 ix86_expand_vector_set (false, tmp, val, elt);
36601
36602 /* Put it back. */
36603 emit_insn (gen_insert[j][i] (target, target, tmp));
36604 return;
36605
36606 default:
36607 break;
36608 }
36609
36610 if (use_vec_merge)
36611 {
36612 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36613 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36614 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36615 }
36616 else
36617 {
36618 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36619
36620 emit_move_insn (mem, target);
36621
36622 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36623 emit_move_insn (tmp, val);
36624
36625 emit_move_insn (target, mem);
36626 }
36627 }
36628
36629 void
36630 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36631 {
36632 enum machine_mode mode = GET_MODE (vec);
36633 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36634 bool use_vec_extr = false;
36635 rtx tmp;
36636
36637 switch (mode)
36638 {
36639 case V2SImode:
36640 case V2SFmode:
36641 if (!mmx_ok)
36642 break;
36643 /* FALLTHRU */
36644
36645 case V2DFmode:
36646 case V2DImode:
36647 use_vec_extr = true;
36648 break;
36649
36650 case V4SFmode:
36651 use_vec_extr = TARGET_SSE4_1;
36652 if (use_vec_extr)
36653 break;
36654
36655 switch (elt)
36656 {
36657 case 0:
36658 tmp = vec;
36659 break;
36660
36661 case 1:
36662 case 3:
36663 tmp = gen_reg_rtx (mode);
36664 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36665 GEN_INT (elt), GEN_INT (elt),
36666 GEN_INT (elt+4), GEN_INT (elt+4)));
36667 break;
36668
36669 case 2:
36670 tmp = gen_reg_rtx (mode);
36671 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36672 break;
36673
36674 default:
36675 gcc_unreachable ();
36676 }
36677 vec = tmp;
36678 use_vec_extr = true;
36679 elt = 0;
36680 break;
36681
36682 case V4SImode:
36683 use_vec_extr = TARGET_SSE4_1;
36684 if (use_vec_extr)
36685 break;
36686
36687 if (TARGET_SSE2)
36688 {
36689 switch (elt)
36690 {
36691 case 0:
36692 tmp = vec;
36693 break;
36694
36695 case 1:
36696 case 3:
36697 tmp = gen_reg_rtx (mode);
36698 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36699 GEN_INT (elt), GEN_INT (elt),
36700 GEN_INT (elt), GEN_INT (elt)));
36701 break;
36702
36703 case 2:
36704 tmp = gen_reg_rtx (mode);
36705 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36706 break;
36707
36708 default:
36709 gcc_unreachable ();
36710 }
36711 vec = tmp;
36712 use_vec_extr = true;
36713 elt = 0;
36714 }
36715 else
36716 {
36717 /* For SSE1, we have to reuse the V4SF code. */
36718 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36719 gen_lowpart (V4SFmode, vec), elt);
36720 return;
36721 }
36722 break;
36723
36724 case V8HImode:
36725 use_vec_extr = TARGET_SSE2;
36726 break;
36727 case V4HImode:
36728 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36729 break;
36730
36731 case V16QImode:
36732 use_vec_extr = TARGET_SSE4_1;
36733 break;
36734
36735 case V8SFmode:
36736 if (TARGET_AVX)
36737 {
36738 tmp = gen_reg_rtx (V4SFmode);
36739 if (elt < 4)
36740 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36741 else
36742 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36743 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36744 return;
36745 }
36746 break;
36747
36748 case V4DFmode:
36749 if (TARGET_AVX)
36750 {
36751 tmp = gen_reg_rtx (V2DFmode);
36752 if (elt < 2)
36753 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36754 else
36755 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36756 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36757 return;
36758 }
36759 break;
36760
36761 case V32QImode:
36762 if (TARGET_AVX)
36763 {
36764 tmp = gen_reg_rtx (V16QImode);
36765 if (elt < 16)
36766 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36767 else
36768 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36769 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36770 return;
36771 }
36772 break;
36773
36774 case V16HImode:
36775 if (TARGET_AVX)
36776 {
36777 tmp = gen_reg_rtx (V8HImode);
36778 if (elt < 8)
36779 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36780 else
36781 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36782 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36783 return;
36784 }
36785 break;
36786
36787 case V8SImode:
36788 if (TARGET_AVX)
36789 {
36790 tmp = gen_reg_rtx (V4SImode);
36791 if (elt < 4)
36792 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36793 else
36794 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36795 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36796 return;
36797 }
36798 break;
36799
36800 case V4DImode:
36801 if (TARGET_AVX)
36802 {
36803 tmp = gen_reg_rtx (V2DImode);
36804 if (elt < 2)
36805 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36806 else
36807 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36808 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36809 return;
36810 }
36811 break;
36812
36813 case V8QImode:
36814 /* ??? Could extract the appropriate HImode element and shift. */
36815 default:
36816 break;
36817 }
36818
36819 if (use_vec_extr)
36820 {
36821 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36822 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36823
36824 /* Let the rtl optimizers know about the zero extension performed. */
36825 if (inner_mode == QImode || inner_mode == HImode)
36826 {
36827 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36828 target = gen_lowpart (SImode, target);
36829 }
36830
36831 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36832 }
36833 else
36834 {
36835 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36836
36837 emit_move_insn (mem, vec);
36838
36839 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36840 emit_move_insn (target, tmp);
36841 }
36842 }
36843
36844 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36845 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36846 The upper bits of DEST are undefined, though they shouldn't cause
36847 exceptions (some bits from src or all zeros are ok). */
36848
36849 static void
36850 emit_reduc_half (rtx dest, rtx src, int i)
36851 {
36852 rtx tem;
36853 switch (GET_MODE (src))
36854 {
36855 case V4SFmode:
36856 if (i == 128)
36857 tem = gen_sse_movhlps (dest, src, src);
36858 else
36859 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36860 GEN_INT (1 + 4), GEN_INT (1 + 4));
36861 break;
36862 case V2DFmode:
36863 tem = gen_vec_interleave_highv2df (dest, src, src);
36864 break;
36865 case V16QImode:
36866 case V8HImode:
36867 case V4SImode:
36868 case V2DImode:
36869 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36870 gen_lowpart (V1TImode, src),
36871 GEN_INT (i / 2));
36872 break;
36873 case V8SFmode:
36874 if (i == 256)
36875 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36876 else
36877 tem = gen_avx_shufps256 (dest, src, src,
36878 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36879 break;
36880 case V4DFmode:
36881 if (i == 256)
36882 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36883 else
36884 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36885 break;
36886 case V32QImode:
36887 case V16HImode:
36888 case V8SImode:
36889 case V4DImode:
36890 if (i == 256)
36891 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36892 gen_lowpart (V4DImode, src),
36893 gen_lowpart (V4DImode, src),
36894 const1_rtx);
36895 else
36896 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36897 gen_lowpart (V2TImode, src),
36898 GEN_INT (i / 2));
36899 break;
36900 default:
36901 gcc_unreachable ();
36902 }
36903 emit_insn (tem);
36904 }
36905
36906 /* Expand a vector reduction. FN is the binary pattern to reduce;
36907 DEST is the destination; IN is the input vector. */
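/* As a sketch of the loop below for V4SImode: the first iteration
(i == 128) shifts the vector right by 64 bits and combines it with
the original, the second (i == 64) shifts by 32 bits and combines
into DEST, so the reduction result ends up in element 0 of DEST
while the remaining elements are unspecified. */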
36908
36909 void
36910 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36911 {
36912 rtx half, dst, vec = in;
36913 enum machine_mode mode = GET_MODE (in);
36914 int i;
36915
36916 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
36917 if (TARGET_SSE4_1
36918 && mode == V8HImode
36919 && fn == gen_uminv8hi3)
36920 {
36921 emit_insn (gen_sse4_1_phminposuw (dest, in));
36922 return;
36923 }
36924
36925 for (i = GET_MODE_BITSIZE (mode);
36926 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36927 i >>= 1)
36928 {
36929 half = gen_reg_rtx (mode);
36930 emit_reduc_half (half, vec, i);
36931 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
36932 dst = dest;
36933 else
36934 dst = gen_reg_rtx (mode);
36935 emit_insn (fn (dst, half, vec));
36936 vec = dst;
36937 }
36938 }
36939 \f
36940 /* Target hook for scalar_mode_supported_p. */
36941 static bool
36942 ix86_scalar_mode_supported_p (enum machine_mode mode)
36943 {
36944 if (DECIMAL_FLOAT_MODE_P (mode))
36945 return default_decimal_float_supported_p ();
36946 else if (mode == TFmode)
36947 return true;
36948 else
36949 return default_scalar_mode_supported_p (mode);
36950 }
36951
36952 /* Implements target hook vector_mode_supported_p. */
36953 static bool
36954 ix86_vector_mode_supported_p (enum machine_mode mode)
36955 {
36956 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36957 return true;
36958 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36959 return true;
36960 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36961 return true;
36962 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36963 return true;
36964 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36965 return true;
36966 return false;
36967 }
36968
36969 /* Target hook for c_mode_for_suffix. */
36970 static enum machine_mode
36971 ix86_c_mode_for_suffix (char suffix)
36972 {
36973 if (suffix == 'q')
36974 return TFmode;
36975 if (suffix == 'w')
36976 return XFmode;
36977
36978 return VOIDmode;
36979 }
36980
36981 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36982
36983 We do this in the new i386 backend to maintain source compatibility
36984 with the old cc0-based compiler. */
36985
36986 static tree
36987 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36988 tree inputs ATTRIBUTE_UNUSED,
36989 tree clobbers)
36990 {
36991 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36992 clobbers);
36993 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36994 clobbers);
36995 return clobbers;
36996 }
36997
36998 /* Implements target vector targetm.asm.encode_section_info. */
36999
37000 static void ATTRIBUTE_UNUSED
37001 ix86_encode_section_info (tree decl, rtx rtl, int first)
37002 {
37003 default_encode_section_info (decl, rtl, first);
37004
37005 if (TREE_CODE (decl) == VAR_DECL
37006 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37007 && ix86_in_large_data_p (decl))
37008 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37009 }
37010
37011 /* Worker function for REVERSE_CONDITION. */
37012
37013 enum rtx_code
37014 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37015 {
37016 return (mode != CCFPmode && mode != CCFPUmode
37017 ? reverse_condition (code)
37018 : reverse_condition_maybe_unordered (code));
37019 }
37020
37021 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37022 to OPERANDS[0]. */
37023
37024 const char *
37025 output_387_reg_move (rtx insn, rtx *operands)
37026 {
37027 if (REG_P (operands[0]))
37028 {
37029 if (REG_P (operands[1])
37030 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37031 {
37032 if (REGNO (operands[0]) == FIRST_STACK_REG)
37033 return output_387_ffreep (operands, 0);
37034 return "fstp\t%y0";
37035 }
37036 if (STACK_TOP_P (operands[0]))
37037 return "fld%Z1\t%y1";
37038 return "fst\t%y0";
37039 }
37040 else if (MEM_P (operands[0]))
37041 {
37042 gcc_assert (REG_P (operands[1]));
37043 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37044 return "fstp%Z0\t%y0";
37045 else
37046 {
37047 /* There is no non-popping store to memory for XFmode.
37048 So if we need one, follow the store with a load. */
37049 if (GET_MODE (operands[0]) == XFmode)
37050 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37051 else
37052 return "fst%Z0\t%y0";
37053 }
37054 }
37055 else
37056 gcc_unreachable ();
37057 }
37058
37059 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37060 FP status register is set. */
37061
37062 void
37063 ix86_emit_fp_unordered_jump (rtx label)
37064 {
37065 rtx reg = gen_reg_rtx (HImode);
37066 rtx temp;
37067
37068 emit_insn (gen_x86_fnstsw_1 (reg));
37069
37070 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37071 {
37072 emit_insn (gen_x86_sahf_1 (reg));
37073
37074 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37075 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37076 }
37077 else
37078 {
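/* The 0x04 bit in the upper byte of the FP status word is the C2
condition flag loaded by the fnstsw above. */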
37079 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37080
37081 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37082 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37083 }
37084
37085 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37086 gen_rtx_LABEL_REF (VOIDmode, label),
37087 pc_rtx);
37088 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37089
37090 emit_jump_insn (temp);
37091 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37092 }
37093
37094 /* Output code to perform a log1p XFmode calculation. */
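/* A sketch of the strategy, assuming the usual x87 semantics: fyl2xp1
computes y * log2(x + 1) but is only specified for |x| smaller than
1 - sqrt(2)/2, which is the 0.2928932... threshold tested below.
Inside that range fyl2xp1 is used directly; outside it we fall back
to fyl2x on 1 + x. In both cases y is loaded as ln(2) via fldln2,
so the log2 result becomes a natural logarithm:
ln (1 + x) = ln (2) * log2 (1 + x). */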
37095
37096 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37097 {
37098 rtx label1 = gen_label_rtx ();
37099 rtx label2 = gen_label_rtx ();
37100
37101 rtx tmp = gen_reg_rtx (XFmode);
37102 rtx tmp2 = gen_reg_rtx (XFmode);
37103 rtx test;
37104
37105 emit_insn (gen_absxf2 (tmp, op1));
37106 test = gen_rtx_GE (VOIDmode, tmp,
37107 CONST_DOUBLE_FROM_REAL_VALUE (
37108 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37109 XFmode));
37110 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37111
37112 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37113 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37114 emit_jump (label2);
37115
37116 emit_label (label1);
37117 emit_move_insn (tmp, CONST1_RTX (XFmode));
37118 emit_insn (gen_addxf3 (tmp, op1, tmp));
37119 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37120 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37121
37122 emit_label (label2);
37123 }
37124
37125 /* Output x87 code to round OP1 and store the result in OP0. */
37126 void ix86_emit_i387_round (rtx op0, rtx op1)
37127 {
37128 enum machine_mode inmode = GET_MODE (op1);
37129 enum machine_mode outmode = GET_MODE (op0);
37130 rtx e1, e2, res, tmp, tmp1, half;
37131 rtx scratch = gen_reg_rtx (HImode);
37132 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37133 rtx jump_label = gen_label_rtx ();
37134 rtx insn;
37135 rtx (*gen_abs) (rtx, rtx);
37136 rtx (*gen_neg) (rtx, rtx);
37137
37138 switch (inmode)
37139 {
37140 case SFmode:
37141 gen_abs = gen_abssf2;
37142 break;
37143 case DFmode:
37144 gen_abs = gen_absdf2;
37145 break;
37146 case XFmode:
37147 gen_abs = gen_absxf2;
37148 break;
37149 default:
37150 gcc_unreachable ();
37151 }
37152
37153 switch (outmode)
37154 {
37155 case SFmode:
37156 gen_neg = gen_negsf2;
37157 break;
37158 case DFmode:
37159 gen_neg = gen_negdf2;
37160 break;
37161 case XFmode:
37162 gen_neg = gen_negxf2;
37163 break;
37164 case HImode:
37165 gen_neg = gen_neghi2;
37166 break;
37167 case SImode:
37168 gen_neg = gen_negsi2;
37169 break;
37170 case DImode:
37171 gen_neg = gen_negdi2;
37172 break;
37173 default:
37174 gcc_unreachable ();
37175 }
37176
37177 e1 = gen_reg_rtx (inmode);
37178 e2 = gen_reg_rtx (inmode);
37179 res = gen_reg_rtx (outmode);
37180
37181 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37182
37183 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
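/* For example, round(-2.5) = -1 * floor(2.5 + 0.5) = -3 and
round(2.3) = 1 * floor(2.3 + 0.5) = 2, i.e. halfway cases are
rounded away from zero as round() requires. */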
37184
37185 /* scratch = fxam(op1) */
37186 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37187 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37188 UNSPEC_FXAM)));
37189 /* e1 = fabs(op1) */
37190 emit_insn (gen_abs (e1, op1));
37191
37192 /* e2 = e1 + 0.5 */
37193 half = force_reg (inmode, half);
37194 emit_insn (gen_rtx_SET (VOIDmode, e2,
37195 gen_rtx_PLUS (inmode, e1, half)));
37196
37197 /* res = floor(e2) */
37198 if (inmode != XFmode)
37199 {
37200 tmp1 = gen_reg_rtx (XFmode);
37201
37202 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37203 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37204 }
37205 else
37206 tmp1 = e2;
37207
37208 switch (outmode)
37209 {
37210 case SFmode:
37211 case DFmode:
37212 {
37213 rtx tmp0 = gen_reg_rtx (XFmode);
37214
37215 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37216
37217 emit_insn (gen_rtx_SET (VOIDmode, res,
37218 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37219 UNSPEC_TRUNC_NOOP)));
37220 }
37221 break;
37222 case XFmode:
37223 emit_insn (gen_frndintxf2_floor (res, tmp1));
37224 break;
37225 case HImode:
37226 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37227 break;
37228 case SImode:
37229 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37230 break;
37231 case DImode:
37232 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37233 break;
37234 default:
37235 gcc_unreachable ();
37236 }
37237
37238 /* flags = signbit(a) */
37239 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37240
37241 /* if (flags) then res = -res */
37242 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37243 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37244 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37245 pc_rtx);
37246 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37247 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37248 JUMP_LABEL (insn) = jump_label;
37249
37250 emit_insn (gen_neg (res, res));
37251
37252 emit_label (jump_label);
37253 LABEL_NUSES (jump_label) = 1;
37254
37255 emit_move_insn (op0, res);
37256 }
37257
37258 /* Output code to perform a Newton-Raphson approximation of a single precision
37259 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37260
37261 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37262 {
37263 rtx x0, x1, e0, e1;
37264
37265 x0 = gen_reg_rtx (mode);
37266 e0 = gen_reg_rtx (mode);
37267 e1 = gen_reg_rtx (mode);
37268 x1 = gen_reg_rtx (mode);
37269
37270 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
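/* This is one Newton-Raphson step for 1/b written without a division:
with x0 = rcp(b) = (1 - e) / b for some small relative error e,
x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0 = (1 - e*e) / b,
so the error of the (roughly 12-bit) rcp estimate is squared before
the final multiply by a. */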
37271
37272 b = force_reg (mode, b);
37273
37274 /* x0 = rcp(b) estimate */
37275 emit_insn (gen_rtx_SET (VOIDmode, x0,
37276 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37277 UNSPEC_RCP)));
37278 /* e0 = x0 * b */
37279 emit_insn (gen_rtx_SET (VOIDmode, e0,
37280 gen_rtx_MULT (mode, x0, b)));
37281
37282 /* e0 = x0 * e0 */
37283 emit_insn (gen_rtx_SET (VOIDmode, e0,
37284 gen_rtx_MULT (mode, x0, e0)));
37285
37286 /* e1 = x0 + x0 */
37287 emit_insn (gen_rtx_SET (VOIDmode, e1,
37288 gen_rtx_PLUS (mode, x0, x0)));
37289
37290 /* x1 = e1 - e0 */
37291 emit_insn (gen_rtx_SET (VOIDmode, x1,
37292 gen_rtx_MINUS (mode, e1, e0)));
37293
37294 /* res = a * x1 */
37295 emit_insn (gen_rtx_SET (VOIDmode, res,
37296 gen_rtx_MULT (mode, a, x1)));
37297 }
37298
37299 /* Output code to perform a Newton-Raphson approximation of a
37300 single precision floating point [reciprocal] square root. */
37301
37302 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37303 bool recip)
37304 {
37305 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37306 REAL_VALUE_TYPE r;
37307
37308 x0 = gen_reg_rtx (mode);
37309 e0 = gen_reg_rtx (mode);
37310 e1 = gen_reg_rtx (mode);
37311 e2 = gen_reg_rtx (mode);
37312 e3 = gen_reg_rtx (mode);
37313
37314 real_from_integer (&r, VOIDmode, -3, -1, 0);
37315 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37316
37317 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37318 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37319
37320 if (VECTOR_MODE_P (mode))
37321 {
37322 mthree = ix86_build_const_vector (mode, true, mthree);
37323 mhalf = ix86_build_const_vector (mode, true, mhalf);
37324 }
37325
37326 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37327 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
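/* Both lines are one Newton-Raphson step for f(x) = 1/(x*x) - a:
with x0 = rsqrtss(a),
x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3),
which is the rsqrt form; multiplying by a (since a * x0 ~ sqrt(a))
gives the sqrt form. The temporaries e0..e3 below compute exactly
these products. */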
37328
37329 a = force_reg (mode, a);
37330
37331 /* x0 = rsqrt(a) estimate */
37332 emit_insn (gen_rtx_SET (VOIDmode, x0,
37333 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37334 UNSPEC_RSQRT)));
37335
37336 /* If a == 0.0, mask out the infinite rsqrt estimate so that sqrt(0.0) yields 0.0 rather than NaN. */
37337 if (!recip)
37338 {
37339 rtx zero, mask;
37340
37341 zero = gen_reg_rtx (mode);
37342 mask = gen_reg_rtx (mode);
37343
37344 zero = force_reg (mode, CONST0_RTX(mode));
37345 emit_insn (gen_rtx_SET (VOIDmode, mask,
37346 gen_rtx_NE (mode, zero, a)));
37347
37348 emit_insn (gen_rtx_SET (VOIDmode, x0,
37349 gen_rtx_AND (mode, x0, mask)));
37350 }
37351
37352 /* e0 = x0 * a */
37353 emit_insn (gen_rtx_SET (VOIDmode, e0,
37354 gen_rtx_MULT (mode, x0, a)));
37355 /* e1 = e0 * x0 */
37356 emit_insn (gen_rtx_SET (VOIDmode, e1,
37357 gen_rtx_MULT (mode, e0, x0)));
37358
37359 /* e2 = e1 - 3. */
37360 mthree = force_reg (mode, mthree);
37361 emit_insn (gen_rtx_SET (VOIDmode, e2,
37362 gen_rtx_PLUS (mode, e1, mthree)));
37363
37364 mhalf = force_reg (mode, mhalf);
37365 if (recip)
37366 /* e3 = -.5 * x0 */
37367 emit_insn (gen_rtx_SET (VOIDmode, e3,
37368 gen_rtx_MULT (mode, x0, mhalf)));
37369 else
37370 /* e3 = -.5 * e0 */
37371 emit_insn (gen_rtx_SET (VOIDmode, e3,
37372 gen_rtx_MULT (mode, e0, mhalf)));
37373 /* ret = e2 * e3 */
37374 emit_insn (gen_rtx_SET (VOIDmode, res,
37375 gen_rtx_MULT (mode, e2, e3)));
37376 }
37377
37378 #ifdef TARGET_SOLARIS
37379 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37380
37381 static void
37382 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37383 tree decl)
37384 {
37385 /* With Binutils 2.15, the "@unwind" marker must be specified on
37386 every occurrence of the ".eh_frame" section, not just the first
37387 one. */
37388 if (TARGET_64BIT
37389 && strcmp (name, ".eh_frame") == 0)
37390 {
37391 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37392 flags & SECTION_WRITE ? "aw" : "a");
37393 return;
37394 }
37395
37396 #ifndef USE_GAS
37397 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37398 {
37399 solaris_elf_asm_comdat_section (name, flags, decl);
37400 return;
37401 }
37402 #endif
37403
37404 default_elf_asm_named_section (name, flags, decl);
37405 }
37406 #endif /* TARGET_SOLARIS */
37407
37408 /* Return the mangling of TYPE if it is an extended fundamental type. */
37409
37410 static const char *
37411 ix86_mangle_type (const_tree type)
37412 {
37413 type = TYPE_MAIN_VARIANT (type);
37414
37415 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37416 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37417 return NULL;
37418
37419 switch (TYPE_MODE (type))
37420 {
37421 case TFmode:
37422 /* __float128 is "g". */
37423 return "g";
37424 case XFmode:
37425 /* "long double" or __float80 is "e". */
37426 return "e";
37427 default:
37428 return NULL;
37429 }
37430 }
37431
37432 /* For 32-bit code we can save PIC register setup by using the
37433 __stack_chk_fail_local hidden function instead of calling
37434 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37435 register, so it is better to call __stack_chk_fail directly. */
37436
37437 static tree ATTRIBUTE_UNUSED
37438 ix86_stack_protect_fail (void)
37439 {
37440 return TARGET_64BIT
37441 ? default_external_stack_protect_fail ()
37442 : default_hidden_stack_protect_fail ();
37443 }
37444
37445 /* Select a format to encode pointers in exception handling data. CODE
37446 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37447 true if the symbol may be affected by dynamic relocations.
37448
37449 ??? All x86 object file formats are capable of representing this.
37450 After all, the relocation needed is the same as for the call insn.
37451 Whether or not a particular assembler allows us to enter such, I
37452 guess we'll have to see. */
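/* For example, with -fpic on x86_64 and the default small PIC code
model this returns DW_EH_PE_pcrel | DW_EH_PE_sdata4, adding
DW_EH_PE_indirect for data references to global symbols, while
non-PIC small-model code gets plain DW_EH_PE_udata4. */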
37453 int
37454 asm_preferred_eh_data_format (int code, int global)
37455 {
37456 if (flag_pic)
37457 {
37458 int type = DW_EH_PE_sdata8;
37459 if (!TARGET_64BIT
37460 || ix86_cmodel == CM_SMALL_PIC
37461 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37462 type = DW_EH_PE_sdata4;
37463 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37464 }
37465 if (ix86_cmodel == CM_SMALL
37466 || (ix86_cmodel == CM_MEDIUM && code))
37467 return DW_EH_PE_udata4;
37468 return DW_EH_PE_absptr;
37469 }
37470 \f
37471 /* Expand copysign from SIGN to the positive value ABS_VALUE
37472 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37473 the sign-bit. */
37474 static void
37475 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37476 {
37477 enum machine_mode mode = GET_MODE (sign);
37478 rtx sgn = gen_reg_rtx (mode);
37479 if (mask == NULL_RTX)
37480 {
37481 enum machine_mode vmode;
37482
37483 if (mode == SFmode)
37484 vmode = V4SFmode;
37485 else if (mode == DFmode)
37486 vmode = V2DFmode;
37487 else
37488 vmode = mode;
37489
37490 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37491 if (!VECTOR_MODE_P (mode))
37492 {
37493 /* We need to generate a scalar mode mask in this case. */
37494 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37495 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37496 mask = gen_reg_rtx (mode);
37497 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37498 }
37499 }
37500 else
37501 mask = gen_rtx_NOT (mode, mask);
37502 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37503 gen_rtx_AND (mode, mask, sign)));
37504 emit_insn (gen_rtx_SET (VOIDmode, result,
37505 gen_rtx_IOR (mode, abs_value, sgn)));
37506 }
37507
37508 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37509 mask for masking out the sign-bit is stored in *SMASK, if that is
37510 non-null. */
37511 static rtx
37512 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37513 {
37514 enum machine_mode vmode, mode = GET_MODE (op0);
37515 rtx xa, mask;
37516
37517 xa = gen_reg_rtx (mode);
37518 if (mode == SFmode)
37519 vmode = V4SFmode;
37520 else if (mode == DFmode)
37521 vmode = V2DFmode;
37522 else
37523 vmode = mode;
37524 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37525 if (!VECTOR_MODE_P (mode))
37526 {
37527 /* We need to generate a scalar mode mask in this case. */
37528 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37529 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37530 mask = gen_reg_rtx (mode);
37531 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37532 }
37533 emit_insn (gen_rtx_SET (VOIDmode, xa,
37534 gen_rtx_AND (mode, op0, mask)));
37535
37536 if (smask)
37537 *smask = mask;
37538
37539 return xa;
37540 }
37541
37542 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37543 swapping the operands if SWAP_OPERANDS is true. The expanded
37544 code is a forward jump to a newly created label in case the
37545 comparison is true. The generated label rtx is returned. */
37546 static rtx
37547 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37548 bool swap_operands)
37549 {
37550 rtx label, tmp;
37551
37552 if (swap_operands)
37553 {
37554 tmp = op0;
37555 op0 = op1;
37556 op1 = tmp;
37557 }
37558
37559 label = gen_label_rtx ();
37560 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37561 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37562 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37563 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37564 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37565 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37566 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37567 JUMP_LABEL (tmp) = label;
37568
37569 return label;
37570 }
37571
37572 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37573 using comparison code CODE. Operands are swapped for the comparison if
37574 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
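/* The generated mask is all-ones where the comparison is true and
all-zeros elsewhere, so callers AND it with a constant such as 1.0
and add or subtract the result to get a branch-free adjustment (see
the floor/ceil/round expanders below). */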
37575 static rtx
37576 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37577 bool swap_operands)
37578 {
37579 rtx (*insn)(rtx, rtx, rtx, rtx);
37580 enum machine_mode mode = GET_MODE (op0);
37581 rtx mask = gen_reg_rtx (mode);
37582
37583 if (swap_operands)
37584 {
37585 rtx tmp = op0;
37586 op0 = op1;
37587 op1 = tmp;
37588 }
37589
37590 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37591
37592 emit_insn (insn (mask, op0, op1,
37593 gen_rtx_fmt_ee (code, mode, op0, op1)));
37594 return mask;
37595 }
37596
37597 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37598 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37599 static rtx
37600 ix86_gen_TWO52 (enum machine_mode mode)
37601 {
37602 REAL_VALUE_TYPE TWO52r;
37603 rtx TWO52;
37604
37605 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37606 TWO52 = const_double_from_real_value (TWO52r, mode);
37607 TWO52 = force_reg (mode, TWO52);
37608
37609 return TWO52;
37610 }
37611
37612 /* Expand SSE sequence for computing lround from OP1 storing
37613 into OP0. */
37614 void
37615 ix86_expand_lround (rtx op0, rtx op1)
37616 {
37617 /* C code for the stuff we're doing below:
37618 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37619 return (long)tmp;
37620 */
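/* nextafter (0.5, 0.0), the largest value below 0.5, is presumably used
instead of 0.5 itself because for an operand just below 0.5 adding an
exact 0.5 can round up to 1.0 under round-to-nearest-even (in double,
pred(0.5) + 0.5 = 1 - 2**-54, which rounds to 1.0), so lround would
return 1 instead of 0; with the slightly smaller addend the sum stays
representable below 1.0. */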
37621 enum machine_mode mode = GET_MODE (op1);
37622 const struct real_format *fmt;
37623 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37624 rtx adj;
37625
37626 /* load nextafter (0.5, 0.0) */
37627 fmt = REAL_MODE_FORMAT (mode);
37628 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37629 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37630
37631 /* adj = copysign (0.5, op1) */
37632 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37633 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37634
37635 /* adj = op1 + adj */
37636 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37637
37638 /* op0 = (imode)adj */
37639 expand_fix (op0, adj, 0);
37640 }
37641
37642 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
37643 into OPERAND0. */
37644 void
37645 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37646 {
37647 /* C code for the stuff we're doing below (for do_floor):
37648 xi = (long)op1;
37649 xi -= (double)xi > op1 ? 1 : 0;
37650 return xi;
37651 */
37652 enum machine_mode fmode = GET_MODE (op1);
37653 enum machine_mode imode = GET_MODE (op0);
37654 rtx ireg, freg, label, tmp;
37655
37656 /* reg = (long)op1 */
37657 ireg = gen_reg_rtx (imode);
37658 expand_fix (ireg, op1, 0);
37659
37660 /* freg = (double)reg */
37661 freg = gen_reg_rtx (fmode);
37662 expand_float (freg, ireg, 0);
37663
37664 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37665 label = ix86_expand_sse_compare_and_jump (UNLE,
37666 freg, op1, !do_floor);
37667 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37668 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37669 emit_move_insn (ireg, tmp);
37670
37671 emit_label (label);
37672 LABEL_NUSES (label) = 1;
37673
37674 emit_move_insn (op0, ireg);
37675 }
37676
37677 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37678 result in OPERAND0. */
37679 void
37680 ix86_expand_rint (rtx operand0, rtx operand1)
37681 {
37682 /* C code for the stuff we're doing below:
37683 xa = fabs (operand1);
37684 if (!isless (xa, 2**52))
37685 return operand1;
37686 xa = xa + 2**52 - 2**52;
37687 return copysign (xa, operand1);
37688 */
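/* The xa + 2**52 - 2**52 step works because for 0 <= xa < 2**52 the
addition pushes any fractional bits out of the 52-bit DFmode mantissa
(the SFmode variant uses 2**23 analogously), so the hardware rounds
xa to an integer in the current rounding mode and the subtraction
recovers that integer exactly; e.g. 3.3 + 2**52 rounds to 2**52 + 3,
giving 3.0 after the subtraction. Values with xa >= 2**52 are already
integral, which is why that case returns the operand unchanged. */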
37689 enum machine_mode mode = GET_MODE (operand0);
37690 rtx res, xa, label, TWO52, mask;
37691
37692 res = gen_reg_rtx (mode);
37693 emit_move_insn (res, operand1);
37694
37695 /* xa = abs (operand1) */
37696 xa = ix86_expand_sse_fabs (res, &mask);
37697
37698 /* if (!isless (xa, TWO52)) goto label; */
37699 TWO52 = ix86_gen_TWO52 (mode);
37700 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37701
37702 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37703 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37704
37705 ix86_sse_copysign_to_positive (res, xa, res, mask);
37706
37707 emit_label (label);
37708 LABEL_NUSES (label) = 1;
37709
37710 emit_move_insn (operand0, res);
37711 }
37712
37713 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37714 into OPERAND0. */
37715 void
37716 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37717 {
37718 /* C code for the stuff we expand below.
37719 double xa = fabs (x), x2;
37720 if (!isless (xa, TWO52))
37721 return x;
37722 xa = xa + TWO52 - TWO52;
37723 x2 = copysign (xa, x);
37724 Compensate. Floor:
37725 if (x2 > x)
37726 x2 -= 1;
37727 Compensate. Ceil:
37728 if (x2 < x)
37729 x2 -= -1;
37730 return x2;
37731 */
37732 enum machine_mode mode = GET_MODE (operand0);
37733 rtx xa, TWO52, tmp, label, one, res, mask;
37734
37735 TWO52 = ix86_gen_TWO52 (mode);
37736
37737 /* Temporary for holding the result, initialized to the input
37738 operand to ease control flow. */
37739 res = gen_reg_rtx (mode);
37740 emit_move_insn (res, operand1);
37741
37742 /* xa = abs (operand1) */
37743 xa = ix86_expand_sse_fabs (res, &mask);
37744
37745 /* if (!isless (xa, TWO52)) goto label; */
37746 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37747
37748 /* xa = xa + TWO52 - TWO52; */
37749 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37750 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37751
37752 /* xa = copysign (xa, operand1) */
37753 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37754
37755 /* generate 1.0 or -1.0 */
37756 one = force_reg (mode,
37757 const_double_from_real_value (do_floor
37758 ? dconst1 : dconstm1, mode));
37759
37760 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37761 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37762 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37763 gen_rtx_AND (mode, one, tmp)));
37764 /* We always need to subtract here to preserve signed zero. */
37765 tmp = expand_simple_binop (mode, MINUS,
37766 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37767 emit_move_insn (res, tmp);
37768
37769 emit_label (label);
37770 LABEL_NUSES (label) = 1;
37771
37772 emit_move_insn (operand0, res);
37773 }
37774
37775 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37776 into OPERAND0. */
37777 void
37778 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37779 {
37780 /* C code for the stuff we expand below.
37781 double xa = fabs (x), x2;
37782 if (!isless (xa, TWO52))
37783 return x;
37784 x2 = (double)(long)x;
37785 Compensate. Floor:
37786 if (x2 > x)
37787 x2 -= 1;
37788 Compensate. Ceil:
37789 if (x2 < x)
37790 x2 += 1;
37791 if (HONOR_SIGNED_ZEROS (mode))
37792 return copysign (x2, x);
37793 return x2;
37794 */
37795 enum machine_mode mode = GET_MODE (operand0);
37796 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37797
37798 TWO52 = ix86_gen_TWO52 (mode);
37799
37800 /* Temporary for holding the result, initialized to the input
37801 operand to ease control flow. */
37802 res = gen_reg_rtx (mode);
37803 emit_move_insn (res, operand1);
37804
37805 /* xa = abs (operand1) */
37806 xa = ix86_expand_sse_fabs (res, &mask);
37807
37808 /* if (!isless (xa, TWO52)) goto label; */
37809 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37810
37811 /* xa = (double)(long)x */
37812 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37813 expand_fix (xi, res, 0);
37814 expand_float (xa, xi, 0);
37815
37816 /* generate 1.0 */
37817 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37818
37819 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37820 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37821 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37822 gen_rtx_AND (mode, one, tmp)));
37823 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37824 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37825 emit_move_insn (res, tmp);
37826
37827 if (HONOR_SIGNED_ZEROS (mode))
37828 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37829
37830 emit_label (label);
37831 LABEL_NUSES (label) = 1;
37832
37833 emit_move_insn (operand0, res);
37834 }
37835
37836 /* Expand SSE sequence for computing round from OPERAND1 storing
37837 into OPERAND0. Sequence that works without relying on DImode truncation
37838 via cvttsd2siq, which is only available on 64-bit targets. */
37839 void
37840 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37841 {
37842 /* C code for the stuff we expand below.
37843 double xa = fabs (x), xa2, x2;
37844 if (!isless (xa, TWO52))
37845 return x;
37846 Using the absolute value and copying back sign makes
37847 -0.0 -> -0.0 correct.
37848 xa2 = xa + TWO52 - TWO52;
37849 Compensate.
37850 dxa = xa2 - xa;
37851 if (dxa <= -0.5)
37852 xa2 += 1;
37853 else if (dxa > 0.5)
37854 xa2 -= 1;
37855 x2 = copysign (xa2, x);
37856 return x2;
37857 */
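/* For example, for x = 2.5 the addition trick gives xa2 = 2.0 (round to
nearest even), so dxa = -0.5 and the dxa <= -0.5 compensation below
adds 1, yielding 3.0; halfway cases are thus rounded away from zero
as round() requires. */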
37858 enum machine_mode mode = GET_MODE (operand0);
37859 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37860
37861 TWO52 = ix86_gen_TWO52 (mode);
37862
37863 /* Temporary for holding the result, initialized to the input
37864 operand to ease control flow. */
37865 res = gen_reg_rtx (mode);
37866 emit_move_insn (res, operand1);
37867
37868 /* xa = abs (operand1) */
37869 xa = ix86_expand_sse_fabs (res, &mask);
37870
37871 /* if (!isless (xa, TWO52)) goto label; */
37872 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37873
37874 /* xa2 = xa + TWO52 - TWO52; */
37875 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37876 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37877
37878 /* dxa = xa2 - xa; */
37879 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37880
37881 /* generate 0.5, 1.0 and -0.5 */
37882 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37883 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37884 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37885 0, OPTAB_DIRECT);
37886
37887 /* Compensate. */
37888 tmp = gen_reg_rtx (mode);
37889 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37890 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37891 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37892 gen_rtx_AND (mode, one, tmp)));
37893 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37894 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37895 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37896 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37897 gen_rtx_AND (mode, one, tmp)));
37898 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37899
37900 /* res = copysign (xa2, operand1) */
37901 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37902
37903 emit_label (label);
37904 LABEL_NUSES (label) = 1;
37905
37906 emit_move_insn (operand0, res);
37907 }
37908
37909 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37910 into OPERAND0. */
37911 void
37912 ix86_expand_trunc (rtx operand0, rtx operand1)
37913 {
37914 /* C code for SSE variant we expand below.
37915 double xa = fabs (x), x2;
37916 if (!isless (xa, TWO52))
37917 return x;
37918 x2 = (double)(long)x;
37919 if (HONOR_SIGNED_ZEROS (mode))
37920 return copysign (x2, x);
37921 return x2;
37922 */
37923 enum machine_mode mode = GET_MODE (operand0);
37924 rtx xa, xi, TWO52, label, res, mask;
37925
37926 TWO52 = ix86_gen_TWO52 (mode);
37927
37928 /* Temporary for holding the result, initialized to the input
37929 operand to ease control flow. */
37930 res = gen_reg_rtx (mode);
37931 emit_move_insn (res, operand1);
37932
37933 /* xa = abs (operand1) */
37934 xa = ix86_expand_sse_fabs (res, &mask);
37935
37936 /* if (!isless (xa, TWO52)) goto label; */
37937 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37938
37939 /* x = (double)(long)x */
37940 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37941 expand_fix (xi, res, 0);
37942 expand_float (res, xi, 0);
37943
37944 if (HONOR_SIGNED_ZEROS (mode))
37945 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37946
37947 emit_label (label);
37948 LABEL_NUSES (label) = 1;
37949
37950 emit_move_insn (operand0, res);
37951 }
37952
37953 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37954 into OPERAND0. */
37955 void
37956 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37957 {
37958 enum machine_mode mode = GET_MODE (operand0);
37959 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37960
37961 /* C code for SSE variant we expand below.
37962 double xa = fabs (x), x2;
37963 if (!isless (xa, TWO52))
37964 return x;
37965 xa2 = xa + TWO52 - TWO52;
37966 Compensate:
37967 if (xa2 > xa)
37968 xa2 -= 1.0;
37969 x2 = copysign (xa2, x);
37970 return x2;
37971 */
37972
37973 TWO52 = ix86_gen_TWO52 (mode);
37974
37975 /* Temporary for holding the result, initialized to the input
37976 operand to ease control flow. */
37977 res = gen_reg_rtx (mode);
37978 emit_move_insn (res, operand1);
37979
37980 /* xa = abs (operand1) */
37981 xa = ix86_expand_sse_fabs (res, &smask);
37982
37983 /* if (!isless (xa, TWO52)) goto label; */
37984 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37985
37986 /* res = xa + TWO52 - TWO52; */
37987 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37988 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37989 emit_move_insn (res, tmp);
37990
37991 /* generate 1.0 */
37992 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37993
37994 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37995 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37996 emit_insn (gen_rtx_SET (VOIDmode, mask,
37997 gen_rtx_AND (mode, mask, one)));
37998 tmp = expand_simple_binop (mode, MINUS,
37999 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38000 emit_move_insn (res, tmp);
38001
38002 /* res = copysign (res, operand1) */
38003 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38004
38005 emit_label (label);
38006 LABEL_NUSES (label) = 1;
38007
38008 emit_move_insn (operand0, res);
38009 }
38010
38011 /* Expand SSE sequence for computing round from OPERAND1 storing
38012 into OPERAND0. */
38013 void
38014 ix86_expand_round (rtx operand0, rtx operand1)
38015 {
38016 /* C code for the stuff we're doing below:
38017 double xa = fabs (x);
38018 if (!isless (xa, TWO52))
38019 return x;
38020 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38021 return copysign (xa, x);
38022 */
38023 enum machine_mode mode = GET_MODE (operand0);
38024 rtx res, TWO52, xa, label, xi, half, mask;
38025 const struct real_format *fmt;
38026 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38027
38028 /* Temporary for holding the result, initialized to the input
38029 operand to ease control flow. */
38030 res = gen_reg_rtx (mode);
38031 emit_move_insn (res, operand1);
38032
38033 TWO52 = ix86_gen_TWO52 (mode);
38034 xa = ix86_expand_sse_fabs (res, &mask);
38035 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38036
38037 /* load nextafter (0.5, 0.0) */
38038 fmt = REAL_MODE_FORMAT (mode);
38039 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38040 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38041
38042 /* xa = xa + 0.5 */
38043 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38044 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38045
38046 /* xa = (double)(int64_t)xa */
38047 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38048 expand_fix (xi, xa, 0);
38049 expand_float (xa, xi, 0);
38050
38051 /* res = copysign (xa, operand1) */
38052 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38053
38054 emit_label (label);
38055 LABEL_NUSES (label) = 1;
38056
38057 emit_move_insn (operand0, res);
38058 }
38059
38060 /* Expand SSE sequence for computing round
38061 from OP1 storing into OP0 using sse4 round insn. */
38062 void
38063 ix86_expand_round_sse4 (rtx op0, rtx op1)
38064 {
38065 enum machine_mode mode = GET_MODE (op0);
38066 rtx e1, e2, res, half;
38067 const struct real_format *fmt;
38068 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38069 rtx (*gen_copysign) (rtx, rtx, rtx);
38070 rtx (*gen_round) (rtx, rtx, rtx);
38071
38072 switch (mode)
38073 {
38074 case SFmode:
38075 gen_copysign = gen_copysignsf3;
38076 gen_round = gen_sse4_1_roundsf2;
38077 break;
38078 case DFmode:
38079 gen_copysign = gen_copysigndf3;
38080 gen_round = gen_sse4_1_rounddf2;
38081 break;
38082 default:
38083 gcc_unreachable ();
38084 }
38085
38086 /* round (a) = trunc (a + copysign (0.5, a)) */
38087
38088 /* load nextafter (0.5, 0.0) */
38089 fmt = REAL_MODE_FORMAT (mode);
38090 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38091 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38092 half = const_double_from_real_value (pred_half, mode);
38093
38094 /* e1 = copysign (0.5, op1) */
38095 e1 = gen_reg_rtx (mode);
38096 emit_insn (gen_copysign (e1, half, op1));
38097
38098 /* e2 = op1 + e1 */
38099 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38100
38101 /* res = trunc (e2) */
38102 res = gen_reg_rtx (mode);
38103 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38104
38105 emit_move_insn (op0, res);
38106 }
38107 \f
38108
38109 /* Table of valid machine attributes. */
38110 static const struct attribute_spec ix86_attribute_table[] =
38111 {
38112 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38113 affects_type_identity } */
38114 /* Stdcall attribute says callee is responsible for popping arguments
38115 if they are not variable. */
38116 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38117 true },
38118 /* Fastcall attribute says callee is responsible for popping arguments
38119 if they are not variable. */
38120 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38121 true },
38122 /* Thiscall attribute says callee is responsible for popping arguments
38123 if they are not variable. */
38124 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38125 true },
38126 /* Cdecl attribute says the callee is a normal C declaration */
38127 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38128 true },
38129 /* Regparm attribute specifies how many integer arguments are to be
38130 passed in registers. */
38131 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38132 true },
38133 /* Sseregparm attribute says we are using x86_64 calling conventions
38134 for FP arguments. */
38135 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38136 true },
38137 /* The transactional memory builtins are implicitly regparm or fastcall
38138 depending on the ABI. Override the generic do-nothing attribute that
38139 these builtins were declared with. */
38140 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38141 true },
38142 /* force_align_arg_pointer says this function realigns the stack at entry. */
38143 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38144 false, true, true, ix86_handle_cconv_attribute, false },
38145 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38146 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38147 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38148 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38149 false },
38150 #endif
38151 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38152 false },
38153 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38154 false },
38155 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38156 SUBTARGET_ATTRIBUTE_TABLE,
38157 #endif
38158 /* ms_abi and sysv_abi calling convention function attributes. */
38159 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38160 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38161 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38162 false },
38163 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38164 ix86_handle_callee_pop_aggregate_return, true },
38165 /* End element. */
38166 { NULL, 0, 0, false, false, false, NULL, false }
38167 };
38168
38169 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38170 static int
38171 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38172 tree vectype,
38173 int misalign ATTRIBUTE_UNUSED)
38174 {
38175 unsigned elements;
38176
38177 switch (type_of_cost)
38178 {
38179 case scalar_stmt:
38180 return ix86_cost->scalar_stmt_cost;
38181
38182 case scalar_load:
38183 return ix86_cost->scalar_load_cost;
38184
38185 case scalar_store:
38186 return ix86_cost->scalar_store_cost;
38187
38188 case vector_stmt:
38189 return ix86_cost->vec_stmt_cost;
38190
38191 case vector_load:
38192 return ix86_cost->vec_align_load_cost;
38193
38194 case vector_store:
38195 return ix86_cost->vec_store_cost;
38196
38197 case vec_to_scalar:
38198 return ix86_cost->vec_to_scalar_cost;
38199
38200 case scalar_to_vec:
38201 return ix86_cost->scalar_to_vec_cost;
38202
38203 case unaligned_load:
38204 case unaligned_store:
38205 return ix86_cost->vec_unalign_load_cost;
38206
38207 case cond_branch_taken:
38208 return ix86_cost->cond_taken_branch_cost;
38209
38210 case cond_branch_not_taken:
38211 return ix86_cost->cond_not_taken_branch_cost;
38212
38213 case vec_perm:
38214 case vec_promote_demote:
38215 return ix86_cost->vec_stmt_cost;
38216
38217 case vec_construct:
38218 elements = TYPE_VECTOR_SUBPARTS (vectype);
38219 return elements / 2 + 1;
38220
38221 default:
38222 gcc_unreachable ();
38223 }
38224 }
38225
38226 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38227 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38228 insn every time. */
38229
38230 static GTY(()) rtx vselect_insn;
38231
38232 /* Initialize vselect_insn. */
38233
38234 static void
38235 init_vselect_insn (void)
38236 {
38237 unsigned i;
38238 rtx x;
38239
38240 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38241 for (i = 0; i < MAX_VECT_LEN; ++i)
38242 XVECEXP (x, 0, i) = const0_rtx;
38243 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38244 const0_rtx), x);
38245 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38246 start_sequence ();
38247 vselect_insn = emit_insn (x);
38248 end_sequence ();
38249 }
38250
38251 /* Construct (set target (vec_select op0 (parallel perm))) and
38252 return true if that's a valid instruction in the active ISA. */
38253
38254 static bool
38255 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38256 unsigned nelt, bool testing_p)
38257 {
38258 unsigned int i;
38259 rtx x, save_vconcat;
38260 int icode;
38261
38262 if (vselect_insn == NULL_RTX)
38263 init_vselect_insn ();
38264
38265 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38266 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38267 for (i = 0; i < nelt; ++i)
38268 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38269 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38270 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38271 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38272 SET_DEST (PATTERN (vselect_insn)) = target;
38273 icode = recog_memoized (vselect_insn);
38274
38275 if (icode >= 0 && !testing_p)
38276 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38277
38278 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38279 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38280 INSN_CODE (vselect_insn) = -1;
38281
38282 return icode >= 0;
38283 }
38284
38285 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38286
38287 static bool
38288 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38289 const unsigned char *perm, unsigned nelt,
38290 bool testing_p)
38291 {
38292 enum machine_mode v2mode;
38293 rtx x;
38294 bool ok;
38295
38296 if (vselect_insn == NULL_RTX)
38297 init_vselect_insn ();
38298
38299 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38300 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38301 PUT_MODE (x, v2mode);
38302 XEXP (x, 0) = op0;
38303 XEXP (x, 1) = op1;
38304 ok = expand_vselect (target, x, perm, nelt, testing_p);
38305 XEXP (x, 0) = const0_rtx;
38306 XEXP (x, 1) = const0_rtx;
38307 return ok;
38308 }
38309
38310 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38311 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
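/* For instance, a V8SFmode permutation {0, 9, 2, 11, 4, 13, 6, 15}
keeps every element in its own lane and takes the odd elements from
the second operand, so the loop below builds the immediate mask 0xaa
and a single vblendps suffices; byte and word shuffles that cannot be
expressed with an immediate fall back to [v]pblendvb with a constant
selector vector. */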
38312
38313 static bool
38314 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38315 {
38316 enum machine_mode vmode = d->vmode;
38317 unsigned i, mask, nelt = d->nelt;
38318 rtx target, op0, op1, x;
38319 rtx rperm[32], vperm;
38320
38321 if (d->one_operand_p)
38322 return false;
38323 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38324 ;
38325 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38326 ;
38327 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38328 ;
38329 else
38330 return false;
38331
38332 /* This is a blend, not a permute. Elements must stay in their
38333 respective lanes. */
38334 for (i = 0; i < nelt; ++i)
38335 {
38336 unsigned e = d->perm[i];
38337 if (!(e == i || e == i + nelt))
38338 return false;
38339 }
38340
38341 if (d->testing_p)
38342 return true;
38343
38344 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38345 decision should be extracted elsewhere, so that we only try that
38346 sequence once all budget==3 options have been tried. */
38347 target = d->target;
38348 op0 = d->op0;
38349 op1 = d->op1;
38350 mask = 0;
38351
38352 switch (vmode)
38353 {
38354 case V4DFmode:
38355 case V8SFmode:
38356 case V2DFmode:
38357 case V4SFmode:
38358 case V8HImode:
38359 case V8SImode:
38360 for (i = 0; i < nelt; ++i)
38361 mask |= (d->perm[i] >= nelt) << i;
38362 break;
38363
38364 case V2DImode:
38365 for (i = 0; i < 2; ++i)
38366 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38367 vmode = V8HImode;
38368 goto do_subreg;
38369
38370 case V4SImode:
38371 for (i = 0; i < 4; ++i)
38372 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38373 vmode = V8HImode;
38374 goto do_subreg;
38375
38376 case V16QImode:
38377 /* See if bytes move in pairs so we can use pblendw with
38378 an immediate argument, rather than pblendvb with a vector
38379 argument. */
38380 for (i = 0; i < 16; i += 2)
38381 if (d->perm[i] + 1 != d->perm[i + 1])
38382 {
38383 use_pblendvb:
38384 for (i = 0; i < nelt; ++i)
38385 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38386
38387 finish_pblendvb:
38388 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38389 vperm = force_reg (vmode, vperm);
38390
38391 if (GET_MODE_SIZE (vmode) == 16)
38392 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38393 else
38394 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38395 return true;
38396 }
38397
38398 for (i = 0; i < 8; ++i)
38399 mask |= (d->perm[i * 2] >= 16) << i;
38400 vmode = V8HImode;
38401 /* FALLTHRU */
38402
38403 do_subreg:
38404 target = gen_lowpart (vmode, target);
38405 op0 = gen_lowpart (vmode, op0);
38406 op1 = gen_lowpart (vmode, op1);
38407 break;
38408
38409 case V32QImode:
38410 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38411 for (i = 0; i < 32; i += 2)
38412 if (d->perm[i] + 1 != d->perm[i + 1])
38413 goto use_pblendvb;
38414 /* See if bytes move in quadruplets. If yes, vpblendd
38415 with immediate can be used. */
38416 for (i = 0; i < 32; i += 4)
38417 if (d->perm[i] + 2 != d->perm[i + 2])
38418 break;
38419 if (i < 32)
38420 {
38421 /* See if bytes move the same in both lanes. If yes,
38422 vpblendw with immediate can be used. */
38423 for (i = 0; i < 16; i += 2)
38424 if (d->perm[i] + 16 != d->perm[i + 16])
38425 goto use_pblendvb;
38426
38427 /* Use vpblendw. */
38428 for (i = 0; i < 16; ++i)
38429 mask |= (d->perm[i * 2] >= 32) << i;
38430 vmode = V16HImode;
38431 goto do_subreg;
38432 }
38433
38434 /* Use vpblendd. */
38435 for (i = 0; i < 8; ++i)
38436 mask |= (d->perm[i * 4] >= 32) << i;
38437 vmode = V8SImode;
38438 goto do_subreg;
38439
38440 case V16HImode:
38441 /* See if words move in pairs. If yes, vpblendd can be used. */
38442 for (i = 0; i < 16; i += 2)
38443 if (d->perm[i] + 1 != d->perm[i + 1])
38444 break;
38445 if (i < 16)
38446 {
38447 /* See if words move the same in both lanes. If not,
38448 vpblendvb must be used. */
38449 for (i = 0; i < 8; i++)
38450 if (d->perm[i] + 8 != d->perm[i + 8])
38451 {
38452 /* Use vpblendvb. */
38453 for (i = 0; i < 32; ++i)
38454 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38455
38456 vmode = V32QImode;
38457 nelt = 32;
38458 target = gen_lowpart (vmode, target);
38459 op0 = gen_lowpart (vmode, op0);
38460 op1 = gen_lowpart (vmode, op1);
38461 goto finish_pblendvb;
38462 }
38463
38464 /* Use vpblendw. */
38465 for (i = 0; i < 16; ++i)
38466 mask |= (d->perm[i] >= 16) << i;
38467 break;
38468 }
38469
38470 /* Use vpblendd. */
38471 for (i = 0; i < 8; ++i)
38472 mask |= (d->perm[i * 2] >= 16) << i;
38473 vmode = V8SImode;
38474 goto do_subreg;
38475
38476 case V4DImode:
38477 /* Use vpblendd. */
38478 for (i = 0; i < 4; ++i)
38479 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38480 vmode = V8SImode;
38481 goto do_subreg;
38482
38483 default:
38484 gcc_unreachable ();
38485 }
38486
38487 /* This matches five different patterns with the different modes. */
38488 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38489 x = gen_rtx_SET (VOIDmode, target, x);
38490 emit_insn (x);
38491
38492 return true;
38493 }
38494
38495 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38496 in terms of the variable form of vpermilps.
38497
38498 Note that we will have already failed the immediate input vpermilps,
38499 which requires that the high and low part shuffle be identical; the
38500 variable form doesn't require that. */
38501
38502 static bool
38503 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38504 {
38505 rtx rperm[8], vperm;
38506 unsigned i;
38507
38508 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38509 return false;
38510
38511 /* We can only permute within the 128-bit lane. */
38512 for (i = 0; i < 8; ++i)
38513 {
38514 unsigned e = d->perm[i];
38515 if (i < 4 ? e >= 4 : e < 4)
38516 return false;
38517 }
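  /* For instance (illustrative values): { 3, 2, 1, 0, 7, 6, 5, 4 } keeps
     every index inside its own 128-bit lane and is accepted, while
     { 4, 5, 6, 7, 0, 1, 2, 3 } crosses the lanes and is rejected above.  */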
38518
38519 if (d->testing_p)
38520 return true;
38521
38522 for (i = 0; i < 8; ++i)
38523 {
38524 unsigned e = d->perm[i];
38525
38526 /* Within each 128-bit lane, the elements of op0 are numbered
38527 from 0 and the elements of op1 are numbered from 4. */
38528 if (e >= 8 + 4)
38529 e -= 8;
38530 else if (e >= 4)
38531 e -= 4;
38532
38533 rperm[i] = GEN_INT (e);
38534 }
38535
38536 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38537 vperm = force_reg (V8SImode, vperm);
38538 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38539
38540 return true;
38541 }
38542
38543 /* Return true if permutation D can be performed as VMODE permutation
38544 instead. */
38545
38546 static bool
38547 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38548 {
38549 unsigned int i, j, chunk;
38550
38551 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38552 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38553 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38554 return false;
38555
38556 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38557 return true;
38558
38559 chunk = d->nelt / GET_MODE_NUNITS (vmode);
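  /* Example of the check below (values chosen for illustration): asking
     whether a V16QImode permutation can be done in V4SImode gives
     chunk == 4; the byte permutation { 4, 5, 6, 7, 0, 1, 2, 3,
     12, 13, 14, 15, 8, 9, 10, 11 } moves whole aligned 4-byte groups and
     passes (it is the V4SImode permutation { 1, 0, 3, 2 }), whereas a
     group starting at an unaligned index, or with non-consecutive bytes
     inside a group, is rejected.  */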
38560 for (i = 0; i < d->nelt; i += chunk)
38561 if (d->perm[i] & (chunk - 1))
38562 return false;
38563 else
38564 for (j = 1; j < chunk; ++j)
38565 if (d->perm[i] + j != d->perm[i + j])
38566 return false;
38567
38568 return true;
38569 }
38570
38571 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38572 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38573
38574 static bool
38575 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38576 {
38577 unsigned i, nelt, eltsz, mask;
38578 unsigned char perm[32];
38579 enum machine_mode vmode = V16QImode;
38580 rtx rperm[32], vperm, target, op0, op1;
38581
38582 nelt = d->nelt;
38583
38584 if (!d->one_operand_p)
38585 {
38586 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38587 {
38588 if (TARGET_AVX2
38589 && valid_perm_using_mode_p (V2TImode, d))
38590 {
38591 if (d->testing_p)
38592 return true;
38593
38594 /* Use vperm2i128 insn. The pattern uses
38595 V4DImode instead of V2TImode. */
38596 target = gen_lowpart (V4DImode, d->target);
38597 op0 = gen_lowpart (V4DImode, d->op0);
38598 op1 = gen_lowpart (V4DImode, d->op1);
38599 rperm[0]
38600 = GEN_INT ((d->perm[0] / (nelt / 2))
38601 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38602 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38603 return true;
38604 }
38605 return false;
38606 }
38607 }
38608 else
38609 {
38610 if (GET_MODE_SIZE (d->vmode) == 16)
38611 {
38612 if (!TARGET_SSSE3)
38613 return false;
38614 }
38615 else if (GET_MODE_SIZE (d->vmode) == 32)
38616 {
38617 if (!TARGET_AVX2)
38618 return false;
38619
38620 /* V4DImode should be already handled through
38621 expand_vselect by vpermq instruction. */
38622 gcc_assert (d->vmode != V4DImode);
38623
38624 vmode = V32QImode;
38625 if (d->vmode == V8SImode
38626 || d->vmode == V16HImode
38627 || d->vmode == V32QImode)
38628 {
38629 /* First see if vpermq can be used for
38630 V8SImode/V16HImode/V32QImode. */
38631 if (valid_perm_using_mode_p (V4DImode, d))
38632 {
38633 for (i = 0; i < 4; i++)
38634 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38635 if (d->testing_p)
38636 return true;
38637 return expand_vselect (gen_lowpart (V4DImode, d->target),
38638 gen_lowpart (V4DImode, d->op0),
38639 perm, 4, false);
38640 }
38641
38642 /* Next see if vpermd can be used. */
38643 if (valid_perm_using_mode_p (V8SImode, d))
38644 vmode = V8SImode;
38645 }
38646 /* Or if vpermps can be used. */
38647 else if (d->vmode == V8SFmode)
38648 vmode = V8SImode;
38649
38650 if (vmode == V32QImode)
38651 {
38652 /* vpshufb only shuffles within a 128-bit lane; it cannot
38653 move bytes between the lanes. */
38654 for (i = 0; i < nelt; ++i)
38655 if ((d->perm[i] ^ i) & (nelt / 2))
38656 return false;
38657 }
38658 }
38659 else
38660 return false;
38661 }
38662
38663 if (d->testing_p)
38664 return true;
38665
38666 if (vmode == V8SImode)
38667 for (i = 0; i < 8; ++i)
38668 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38669 else
38670 {
38671 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38672 if (!d->one_operand_p)
38673 mask = 2 * nelt - 1;
38674 else if (vmode == V16QImode)
38675 mask = nelt - 1;
38676 else
38677 mask = nelt / 2 - 1;
38678
38679 for (i = 0; i < nelt; ++i)
38680 {
38681 unsigned j, e = d->perm[i] & mask;
38682 for (j = 0; j < eltsz; ++j)
38683 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38684 }
38685 }
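  /* Worked example (hypothetical input): for a one-operand V8HImode
     permutation expanded through V16QImode, eltsz == 2 and mask == 7;
     if d->perm[2] == 5, the loop above stores byte indexes 10 and 11 at
     byte positions 4 and 5 of the pshufb control vector.  */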
38686
38687 vperm = gen_rtx_CONST_VECTOR (vmode,
38688 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38689 vperm = force_reg (vmode, vperm);
38690
38691 target = gen_lowpart (vmode, d->target);
38692 op0 = gen_lowpart (vmode, d->op0);
38693 if (d->one_operand_p)
38694 {
38695 if (vmode == V16QImode)
38696 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38697 else if (vmode == V32QImode)
38698 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38699 else if (vmode == V8SFmode)
38700 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38701 else
38702 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38703 }
38704 else
38705 {
38706 op1 = gen_lowpart (vmode, d->op1);
38707 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38708 }
38709
38710 return true;
38711 }
38712
38713 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38714 in a single instruction. */
38715
38716 static bool
38717 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38718 {
38719 unsigned i, nelt = d->nelt;
38720 unsigned char perm2[MAX_VECT_LEN];
38721
38722 /* Check plain VEC_SELECT first, because AVX has instructions that could
38723 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38724 input where SEL+CONCAT may not. */
38725 if (d->one_operand_p)
38726 {
38727 int mask = nelt - 1;
38728 bool identity_perm = true;
38729 bool broadcast_perm = true;
38730
38731 for (i = 0; i < nelt; i++)
38732 {
38733 perm2[i] = d->perm[i] & mask;
38734 if (perm2[i] != i)
38735 identity_perm = false;
38736 if (perm2[i])
38737 broadcast_perm = false;
38738 }
38739
38740 if (identity_perm)
38741 {
38742 if (!d->testing_p)
38743 emit_move_insn (d->target, d->op0);
38744 return true;
38745 }
38746 else if (broadcast_perm && TARGET_AVX2)
38747 {
38748 /* Use vpbroadcast{b,w,d}. */
38749 rtx (*gen) (rtx, rtx) = NULL;
38750 switch (d->vmode)
38751 {
38752 case V32QImode:
38753 gen = gen_avx2_pbroadcastv32qi_1;
38754 break;
38755 case V16HImode:
38756 gen = gen_avx2_pbroadcastv16hi_1;
38757 break;
38758 case V8SImode:
38759 gen = gen_avx2_pbroadcastv8si_1;
38760 break;
38761 case V16QImode:
38762 gen = gen_avx2_pbroadcastv16qi;
38763 break;
38764 case V8HImode:
38765 gen = gen_avx2_pbroadcastv8hi;
38766 break;
38767 case V8SFmode:
38768 gen = gen_avx2_vec_dupv8sf_1;
38769 break;
38770 /* For other modes, prefer the other shuffles this function creates. */
38771 default: break;
38772 }
38773 if (gen != NULL)
38774 {
38775 if (!d->testing_p)
38776 emit_insn (gen (d->target, d->op0));
38777 return true;
38778 }
38779 }
38780
38781 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38782 return true;
38783
38784 /* There are plenty of patterns in sse.md that are written for
38785 SEL+CONCAT and are not replicated for a single op. Perhaps
38786 that should be changed, to avoid the nastiness here. */
38787
38788 /* Recognize interleave style patterns, which means incrementing
38789 every other permutation operand. */
38790 for (i = 0; i < nelt; i += 2)
38791 {
38792 perm2[i] = d->perm[i] & mask;
38793 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38794 }
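      /* For instance (illustrative values): a one-operand V4SImode
	 permutation { 0, 0, 1, 1 } becomes perm2 == { 0, 4, 1, 5 } here,
	 which is exactly the interleave-low (punpckldq) of op0 with
	 itself, matched by the call below.  */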
38795 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38796 d->testing_p))
38797 return true;
38798
38799 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38800 if (nelt >= 4)
38801 {
38802 for (i = 0; i < nelt; i += 4)
38803 {
38804 perm2[i + 0] = d->perm[i + 0] & mask;
38805 perm2[i + 1] = d->perm[i + 1] & mask;
38806 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38807 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38808 }
38809
38810 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38811 d->testing_p))
38812 return true;
38813 }
38814 }
38815
38816 /* Finally, try the fully general two operand permute. */
38817 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38818 d->testing_p))
38819 return true;
38820
38821 /* Recognize interleave style patterns with reversed operands. */
38822 if (!d->one_operand_p)
38823 {
38824 for (i = 0; i < nelt; ++i)
38825 {
38826 unsigned e = d->perm[i];
38827 if (e >= nelt)
38828 e -= nelt;
38829 else
38830 e += nelt;
38831 perm2[i] = e;
38832 }
38833
38834 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38835 d->testing_p))
38836 return true;
38837 }
38838
38839 /* Try the SSE4.1 blend variable merge instructions. */
38840 if (expand_vec_perm_blend (d))
38841 return true;
38842
38843 /* Try one of the AVX vpermil variable permutations. */
38844 if (expand_vec_perm_vpermil (d))
38845 return true;
38846
38847 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38848 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38849 if (expand_vec_perm_pshufb (d))
38850 return true;
38851
38852 return false;
38853 }
38854
38855 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38856 in terms of a pair of pshuflw + pshufhw instructions. */
38857
38858 static bool
38859 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38860 {
38861 unsigned char perm2[MAX_VECT_LEN];
38862 unsigned i;
38863 bool ok;
38864
38865 if (d->vmode != V8HImode || !d->one_operand_p)
38866 return false;
38867
38868 /* The two permutations only operate in 64-bit lanes. */
38869 for (i = 0; i < 4; ++i)
38870 if (d->perm[i] >= 4)
38871 return false;
38872 for (i = 4; i < 8; ++i)
38873 if (d->perm[i] < 4)
38874 return false;
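  /* Example (illustrative values): { 3, 2, 1, 0, 7, 6, 5, 4 } passes both
     checks and is expanded below as a pshuflw reversing the low four words
     followed by a pshufhw reversing the high four.  */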
38875
38876 if (d->testing_p)
38877 return true;
38878
38879 /* Emit the pshuflw. */
38880 memcpy (perm2, d->perm, 4);
38881 for (i = 4; i < 8; ++i)
38882 perm2[i] = i;
38883 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38884 gcc_assert (ok);
38885
38886 /* Emit the pshufhw. */
38887 memcpy (perm2 + 4, d->perm + 4, 4);
38888 for (i = 0; i < 4; ++i)
38889 perm2[i] = i;
38890 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38891 gcc_assert (ok);
38892
38893 return true;
38894 }
38895
38896 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38897 the permutation using the SSSE3 palignr instruction. This succeeds
38898 when all of the elements in PERM fit within one vector and we merely
38899 need to shift them down so that a single vector permutation has a
38900 chance to succeed. */
38901
38902 static bool
38903 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38904 {
38905 unsigned i, nelt = d->nelt;
38906 unsigned min, max;
38907 bool in_order, ok;
38908 rtx shift;
38909
38910 /* Even with AVX, palignr only operates on 128-bit vectors. */
38911 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38912 return false;
38913
38914 min = nelt, max = 0;
38915 for (i = 0; i < nelt; ++i)
38916 {
38917 unsigned e = d->perm[i];
38918 if (e < min)
38919 min = e;
38920 if (e > max)
38921 max = e;
38922 }
38923 if (min == 0 || max - min >= nelt)
38924 return false;
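  /* Example (values chosen for illustration): with two V16QImode operands
     and d->perm == { 3, 4, ..., 18 } (sixteen consecutive indexes),
     min == 3 and max == 18, so a single palignr by 3 bytes of op1:op0
     leaves elements 3..18 in one register; the residual permutation is
     then the identity and the code below finishes immediately.  */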
38925
38926 /* Given that we have SSSE3, we know we'll be able to implement the
38927 single operand permutation after the palignr with pshufb. */
38928 if (d->testing_p)
38929 return true;
38930
38931 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
38932 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
38933 gen_lowpart (TImode, d->op1),
38934 gen_lowpart (TImode, d->op0), shift));
38935
38936 d->op0 = d->op1 = d->target;
38937 d->one_operand_p = true;
38938
38939 in_order = true;
38940 for (i = 0; i < nelt; ++i)
38941 {
38942 unsigned e = d->perm[i] - min;
38943 if (e != i)
38944 in_order = false;
38945 d->perm[i] = e;
38946 }
38947
38948 /* Test for the degenerate case where the alignment by itself
38949 produces the desired permutation. */
38950 if (in_order)
38951 return true;
38952
38953 ok = expand_vec_perm_1 (d);
38954 gcc_assert (ok);
38955
38956 return ok;
38957 }
38958
38959 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38960
38961 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38962 a two vector permutation into a single vector permutation by using
38963 an interleave operation to merge the vectors. */
38964
38965 static bool
38966 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38967 {
38968 struct expand_vec_perm_d dremap, dfinal;
38969 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38970 unsigned HOST_WIDE_INT contents;
38971 unsigned char remap[2 * MAX_VECT_LEN];
38972 rtx seq;
38973 bool ok, same_halves = false;
38974
38975 if (GET_MODE_SIZE (d->vmode) == 16)
38976 {
38977 if (d->one_operand_p)
38978 return false;
38979 }
38980 else if (GET_MODE_SIZE (d->vmode) == 32)
38981 {
38982 if (!TARGET_AVX)
38983 return false;
38984 /* For 32-byte modes allow even d->one_operand_p.
38985 The lack of cross-lane shuffling in some instructions
38986 might prevent a single insn shuffle. */
38987 dfinal = *d;
38988 dfinal.testing_p = true;
38989 /* If expand_vec_perm_interleave3 can expand this into
38990 a 3 insn sequence, give up and let it be expanded as
38991 a 3 insn sequence. While that is one insn longer,
38992 it doesn't need a memory operand, and in the common
38993 case where the interleave low and interleave high
38994 permutations of the same operands are adjacent, both
38995 together need only 4 insns after CSE. */
38996 if (expand_vec_perm_interleave3 (&dfinal))
38997 return false;
38998 }
38999 else
39000 return false;
39001
39002 /* Examine from whence the elements come. */
39003 contents = 0;
39004 for (i = 0; i < nelt; ++i)
39005 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39006
39007 memset (remap, 0xff, sizeof (remap));
39008 dremap = *d;
39009
39010 if (GET_MODE_SIZE (d->vmode) == 16)
39011 {
39012 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39013
39014 /* Split the two input vectors into 4 halves. */
39015 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39016 h2 = h1 << nelt2;
39017 h3 = h2 << nelt2;
39018 h4 = h3 << nelt2;
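      /* Worked example (illustrative values): for a two-operand V4SImode
	 permutation { 1, 5, 0, 4 }, contents == 0x33, which is covered by
	 h1 | h3 (both low halves), so the punpckl* case below is chosen:
	 dremap becomes { 0, 4, 1, 5 } (interleave low) and the final
	 shuffle computed later becomes { 2, 3, 0, 1 }.  */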
39019
39020 /* If the elements are all from the low halves, use interleave low;
39021 similarly for interleave high. If the elements are from mis-matched
39022 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39023 if ((contents & (h1 | h3)) == contents)
39024 {
39025 /* punpckl* */
39026 for (i = 0; i < nelt2; ++i)
39027 {
39028 remap[i] = i * 2;
39029 remap[i + nelt] = i * 2 + 1;
39030 dremap.perm[i * 2] = i;
39031 dremap.perm[i * 2 + 1] = i + nelt;
39032 }
39033 if (!TARGET_SSE2 && d->vmode == V4SImode)
39034 dremap.vmode = V4SFmode;
39035 }
39036 else if ((contents & (h2 | h4)) == contents)
39037 {
39038 /* punpckh* */
39039 for (i = 0; i < nelt2; ++i)
39040 {
39041 remap[i + nelt2] = i * 2;
39042 remap[i + nelt + nelt2] = i * 2 + 1;
39043 dremap.perm[i * 2] = i + nelt2;
39044 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39045 }
39046 if (!TARGET_SSE2 && d->vmode == V4SImode)
39047 dremap.vmode = V4SFmode;
39048 }
39049 else if ((contents & (h1 | h4)) == contents)
39050 {
39051 /* shufps */
39052 for (i = 0; i < nelt2; ++i)
39053 {
39054 remap[i] = i;
39055 remap[i + nelt + nelt2] = i + nelt2;
39056 dremap.perm[i] = i;
39057 dremap.perm[i + nelt2] = i + nelt + nelt2;
39058 }
39059 if (nelt != 4)
39060 {
39061 /* shufpd */
39062 dremap.vmode = V2DImode;
39063 dremap.nelt = 2;
39064 dremap.perm[0] = 0;
39065 dremap.perm[1] = 3;
39066 }
39067 }
39068 else if ((contents & (h2 | h3)) == contents)
39069 {
39070 /* shufps */
39071 for (i = 0; i < nelt2; ++i)
39072 {
39073 remap[i + nelt2] = i;
39074 remap[i + nelt] = i + nelt2;
39075 dremap.perm[i] = i + nelt2;
39076 dremap.perm[i + nelt2] = i + nelt;
39077 }
39078 if (nelt != 4)
39079 {
39080 /* shufpd */
39081 dremap.vmode = V2DImode;
39082 dremap.nelt = 2;
39083 dremap.perm[0] = 1;
39084 dremap.perm[1] = 2;
39085 }
39086 }
39087 else
39088 return false;
39089 }
39090 else
39091 {
39092 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39093 unsigned HOST_WIDE_INT q[8];
39094 unsigned int nonzero_halves[4];
39095
39096 /* Split the two input vectors into 8 quarters. */
39097 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39098 for (i = 1; i < 8; ++i)
39099 q[i] = q[0] << (nelt4 * i);
39100 for (i = 0; i < 4; ++i)
39101 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39102 {
39103 nonzero_halves[nzcnt] = i;
39104 ++nzcnt;
39105 }
39106
39107 if (nzcnt == 1)
39108 {
39109 gcc_assert (d->one_operand_p);
39110 nonzero_halves[1] = nonzero_halves[0];
39111 same_halves = true;
39112 }
39113 else if (d->one_operand_p)
39114 {
39115 gcc_assert (nonzero_halves[0] == 0);
39116 gcc_assert (nonzero_halves[1] == 1);
39117 }
39118
39119 if (nzcnt <= 2)
39120 {
39121 if (d->perm[0] / nelt2 == nonzero_halves[1])
39122 {
39123 /* Attempt to increase the likelihood that the dfinal
39124 shuffle will be intra-lane. */
39125 char tmph = nonzero_halves[0];
39126 nonzero_halves[0] = nonzero_halves[1];
39127 nonzero_halves[1] = tmph;
39128 }
39129
39130 /* vperm2f128 or vperm2i128. */
39131 for (i = 0; i < nelt2; ++i)
39132 {
39133 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39134 remap[i + nonzero_halves[0] * nelt2] = i;
39135 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39136 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39137 }
39138
39139 if (d->vmode != V8SFmode
39140 && d->vmode != V4DFmode
39141 && d->vmode != V8SImode)
39142 {
39143 dremap.vmode = V8SImode;
39144 dremap.nelt = 8;
39145 for (i = 0; i < 4; ++i)
39146 {
39147 dremap.perm[i] = i + nonzero_halves[0] * 4;
39148 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39149 }
39150 }
39151 }
39152 else if (d->one_operand_p)
39153 return false;
39154 else if (TARGET_AVX2
39155 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39156 {
39157 /* vpunpckl* */
39158 for (i = 0; i < nelt4; ++i)
39159 {
39160 remap[i] = i * 2;
39161 remap[i + nelt] = i * 2 + 1;
39162 remap[i + nelt2] = i * 2 + nelt2;
39163 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39164 dremap.perm[i * 2] = i;
39165 dremap.perm[i * 2 + 1] = i + nelt;
39166 dremap.perm[i * 2 + nelt2] = i + nelt2;
39167 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39168 }
39169 }
39170 else if (TARGET_AVX2
39171 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39172 {
39173 /* vpunpckh* */
39174 for (i = 0; i < nelt4; ++i)
39175 {
39176 remap[i + nelt4] = i * 2;
39177 remap[i + nelt + nelt4] = i * 2 + 1;
39178 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39179 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39180 dremap.perm[i * 2] = i + nelt4;
39181 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39182 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39183 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39184 }
39185 }
39186 else
39187 return false;
39188 }
39189
39190 /* Use the remapping array set up above to move the elements from their
39191 swizzled locations into their final destinations. */
39192 dfinal = *d;
39193 for (i = 0; i < nelt; ++i)
39194 {
39195 unsigned e = remap[d->perm[i]];
39196 gcc_assert (e < nelt);
39197 /* If same_halves is true, both halves of the remapped vector are the
39198 same. Avoid cross-lane accesses if possible. */
39199 if (same_halves && i >= nelt2)
39200 {
39201 gcc_assert (e < nelt2);
39202 dfinal.perm[i] = e + nelt2;
39203 }
39204 else
39205 dfinal.perm[i] = e;
39206 }
39207 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39208 dfinal.op1 = dfinal.op0;
39209 dfinal.one_operand_p = true;
39210 dremap.target = dfinal.op0;
39211
39212 /* Test if the final remap can be done with a single insn. For V4SFmode or
39213 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39214 start_sequence ();
39215 ok = expand_vec_perm_1 (&dfinal);
39216 seq = get_insns ();
39217 end_sequence ();
39218
39219 if (!ok)
39220 return false;
39221
39222 if (d->testing_p)
39223 return true;
39224
39225 if (dremap.vmode != dfinal.vmode)
39226 {
39227 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39228 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39229 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39230 }
39231
39232 ok = expand_vec_perm_1 (&dremap);
39233 gcc_assert (ok);
39234
39235 emit_insn (seq);
39236 return true;
39237 }
39238
39239 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39240 a single vector cross-lane permutation into vpermq followed
39241 by any of the single insn permutations. */
39242
39243 static bool
39244 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39245 {
39246 struct expand_vec_perm_d dremap, dfinal;
39247 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39248 unsigned contents[2];
39249 bool ok;
39250
39251 if (!(TARGET_AVX2
39252 && (d->vmode == V32QImode || d->vmode == V16HImode)
39253 && d->one_operand_p))
39254 return false;
39255
39256 contents[0] = 0;
39257 contents[1] = 0;
39258 for (i = 0; i < nelt2; ++i)
39259 {
39260 contents[0] |= 1u << (d->perm[i] / nelt4);
39261 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39262 }
39263
39264 for (i = 0; i < 2; ++i)
39265 {
39266 unsigned int cnt = 0;
39267 for (j = 0; j < 4; ++j)
39268 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39269 return false;
39270 }
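  /* For example (a hypothetical case): if the low half of the result only
     needs data from 64-bit quarters 0 and 2 of the input and the high half
     only from quarters 1 and 3, the V4DImode (vpermq) shuffle built below
     is given the permutation { 0, 2, 1, 3 } and the remaining shuffle is
     purely intra-lane.  */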
39271
39272 if (d->testing_p)
39273 return true;
39274
39275 dremap = *d;
39276 dremap.vmode = V4DImode;
39277 dremap.nelt = 4;
39278 dremap.target = gen_reg_rtx (V4DImode);
39279 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39280 dremap.op1 = dremap.op0;
39281 dremap.one_operand_p = true;
39282 for (i = 0; i < 2; ++i)
39283 {
39284 unsigned int cnt = 0;
39285 for (j = 0; j < 4; ++j)
39286 if ((contents[i] & (1u << j)) != 0)
39287 dremap.perm[2 * i + cnt++] = j;
39288 for (; cnt < 2; ++cnt)
39289 dremap.perm[2 * i + cnt] = 0;
39290 }
39291
39292 dfinal = *d;
39293 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39294 dfinal.op1 = dfinal.op0;
39295 dfinal.one_operand_p = true;
39296 for (i = 0, j = 0; i < nelt; ++i)
39297 {
39298 if (i == nelt2)
39299 j = 2;
39300 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39301 if ((d->perm[i] / nelt4) == dremap.perm[j])
39302 ;
39303 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39304 dfinal.perm[i] |= nelt4;
39305 else
39306 gcc_unreachable ();
39307 }
39308
39309 ok = expand_vec_perm_1 (&dremap);
39310 gcc_assert (ok);
39311
39312 ok = expand_vec_perm_1 (&dfinal);
39313 gcc_assert (ok);
39314
39315 return true;
39316 }
39317
39318 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39319 a vector permutation using two instructions, vperm2f128 resp.
39320 vperm2i128 followed by any single in-lane permutation. */
39321
39322 static bool
39323 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39324 {
39325 struct expand_vec_perm_d dfirst, dsecond;
39326 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39327 bool ok;
39328
39329 if (!TARGET_AVX
39330 || GET_MODE_SIZE (d->vmode) != 32
39331 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39332 return false;
39333
39334 dsecond = *d;
39335 dsecond.one_operand_p = false;
39336 dsecond.testing_p = true;
39337
39338 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39339 immediate. For perm < 16 the second permutation uses
39340 d->op0 as first operand, for perm >= 16 it uses d->op1
39341 as first operand. The second operand is the result of
39342 vperm2[fi]128. */
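  /* As an illustration of the encoding: the low two bits of PERM select
     the source lane of the result's low 128-bit lane and bits 2-3 select
     the source of its high lane, with lanes numbered
     { op0 low, op0 high, op1 low, op1 high }.  E.g. perm == 6 yields the
     vperm2[fi]128 immediate 0x12: low result lane from op1's low lane,
     high result lane from op0's high lane.  */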
39343 for (perm = 0; perm < 32; perm++)
39344 {
39345 /* Ignore permutations which do not move anything cross-lane. */
39346 if (perm < 16)
39347 {
39348 /* The second shuffle for e.g. V4DFmode has
39349 0123 and ABCD operands.
39350 Ignore AB23, as 23 is already in the second lane
39351 of the first operand. */
39352 if ((perm & 0xc) == (1 << 2)) continue;
39353 /* And 01CD, as 01 is in the first lane of the first
39354 operand. */
39355 if ((perm & 3) == 0) continue;
39356 /* And 4567, as then the vperm2[fi]128 doesn't change
39357 anything on the original 4567 second operand. */
39358 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39359 }
39360 else
39361 {
39362 /* The second shuffle for e.g. V4DFmode has
39363 4567 and ABCD operands.
39364 Ignore AB67, as 67 is already in the second lane
39365 of the first operand. */
39366 if ((perm & 0xc) == (3 << 2)) continue;
39367 /* And 45CD, as 45 is in the first lane of the first
39368 operand. */
39369 if ((perm & 3) == 2) continue;
39370 /* And 0123, as then the vperm2[fi]128 doesn't change
39371 anything on the original 0123 first operand. */
39372 if ((perm & 0xf) == (1 << 2)) continue;
39373 }
39374
39375 for (i = 0; i < nelt; i++)
39376 {
39377 j = d->perm[i] / nelt2;
39378 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39379 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39380 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39381 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39382 else
39383 break;
39384 }
39385
39386 if (i == nelt)
39387 {
39388 start_sequence ();
39389 ok = expand_vec_perm_1 (&dsecond);
39390 end_sequence ();
39391 }
39392 else
39393 ok = false;
39394
39395 if (ok)
39396 {
39397 if (d->testing_p)
39398 return true;
39399
39400 /* Found a usable second shuffle. dfirst will be
39401 vperm2f128 on d->op0 and d->op1. */
39402 dsecond.testing_p = false;
39403 dfirst = *d;
39404 dfirst.target = gen_reg_rtx (d->vmode);
39405 for (i = 0; i < nelt; i++)
39406 dfirst.perm[i] = (i & (nelt2 - 1))
39407 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39408
39409 ok = expand_vec_perm_1 (&dfirst);
39410 gcc_assert (ok);
39411
39412 /* And dsecond is some single insn shuffle, taking
39413 d->op0 and result of vperm2f128 (if perm < 16) or
39414 d->op1 and result of vperm2f128 (otherwise). */
39415 dsecond.op1 = dfirst.target;
39416 if (perm >= 16)
39417 dsecond.op0 = dfirst.op1;
39418
39419 ok = expand_vec_perm_1 (&dsecond);
39420 gcc_assert (ok);
39421
39422 return true;
39423 }
39424
39425 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39426 if (d->one_operand_p)
39427 return false;
39428 }
39429
39430 return false;
39431 }
39432
39433 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39434 a two vector permutation using 2 intra-lane interleave insns
39435 and cross-lane shuffle for 32-byte vectors. */
39436
39437 static bool
39438 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39439 {
39440 unsigned i, nelt;
39441 rtx (*gen) (rtx, rtx, rtx);
39442
39443 if (d->one_operand_p)
39444 return false;
39445 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39446 ;
39447 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39448 ;
39449 else
39450 return false;
39451
39452 nelt = d->nelt;
39453 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39454 return false;
39455 for (i = 0; i < nelt; i += 2)
39456 if (d->perm[i] != d->perm[0] + i / 2
39457 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39458 return false;
39459
39460 if (d->testing_p)
39461 return true;
39462
39463 switch (d->vmode)
39464 {
39465 case V32QImode:
39466 if (d->perm[0])
39467 gen = gen_vec_interleave_highv32qi;
39468 else
39469 gen = gen_vec_interleave_lowv32qi;
39470 break;
39471 case V16HImode:
39472 if (d->perm[0])
39473 gen = gen_vec_interleave_highv16hi;
39474 else
39475 gen = gen_vec_interleave_lowv16hi;
39476 break;
39477 case V8SImode:
39478 if (d->perm[0])
39479 gen = gen_vec_interleave_highv8si;
39480 else
39481 gen = gen_vec_interleave_lowv8si;
39482 break;
39483 case V4DImode:
39484 if (d->perm[0])
39485 gen = gen_vec_interleave_highv4di;
39486 else
39487 gen = gen_vec_interleave_lowv4di;
39488 break;
39489 case V8SFmode:
39490 if (d->perm[0])
39491 gen = gen_vec_interleave_highv8sf;
39492 else
39493 gen = gen_vec_interleave_lowv8sf;
39494 break;
39495 case V4DFmode:
39496 if (d->perm[0])
39497 gen = gen_vec_interleave_highv4df;
39498 else
39499 gen = gen_vec_interleave_lowv4df;
39500 break;
39501 default:
39502 gcc_unreachable ();
39503 }
39504
39505 emit_insn (gen (d->target, d->op0, d->op1));
39506 return true;
39507 }
39508
39509 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39510 a single vector permutation using a single intra-lane vector
39511 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39512 the non-swapped and swapped vectors together. */
39513
39514 static bool
39515 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39516 {
39517 struct expand_vec_perm_d dfirst, dsecond;
39518 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39519 rtx seq;
39520 bool ok;
39521 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39522
39523 if (!TARGET_AVX
39524 || TARGET_AVX2
39525 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39526 || !d->one_operand_p)
39527 return false;
39528
39529 dfirst = *d;
39530 for (i = 0; i < nelt; i++)
39531 dfirst.perm[i] = 0xff;
39532 for (i = 0, msk = 0; i < nelt; i++)
39533 {
39534 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39535 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39536 return false;
39537 dfirst.perm[j] = d->perm[i];
39538 if (j != i)
39539 msk |= (1 << i);
39540 }
39541 for (i = 0; i < nelt; i++)
39542 if (dfirst.perm[i] == 0xff)
39543 dfirst.perm[i] = i;
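  /* Worked example (illustrative values): for a V4DFmode permutation
     { 2, 1, 0, 3 }, only elements 0 and 2 need data from the other
     128-bit lane, so dfirst stays the identity, msk == 0x5, dsecond below
     is the lane swap { 2, 3, 0, 1 }, and the final vblendpd with
     immediate 0x5 takes the swapped vector in positions 0 and 2.  */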
39544
39545 if (!d->testing_p)
39546 dfirst.target = gen_reg_rtx (dfirst.vmode);
39547
39548 start_sequence ();
39549 ok = expand_vec_perm_1 (&dfirst);
39550 seq = get_insns ();
39551 end_sequence ();
39552
39553 if (!ok)
39554 return false;
39555
39556 if (d->testing_p)
39557 return true;
39558
39559 emit_insn (seq);
39560
39561 dsecond = *d;
39562 dsecond.op0 = dfirst.target;
39563 dsecond.op1 = dfirst.target;
39564 dsecond.one_operand_p = true;
39565 dsecond.target = gen_reg_rtx (dsecond.vmode);
39566 for (i = 0; i < nelt; i++)
39567 dsecond.perm[i] = i ^ nelt2;
39568
39569 ok = expand_vec_perm_1 (&dsecond);
39570 gcc_assert (ok);
39571
39572 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39573 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39574 return true;
39575 }
39576
39577 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39578 permutation using two vperm2f128, followed by a vshufpd insn blending
39579 the two vectors together. */
39580
39581 static bool
39582 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39583 {
39584 struct expand_vec_perm_d dfirst, dsecond, dthird;
39585 bool ok;
39586
39587 if (!TARGET_AVX || (d->vmode != V4DFmode))
39588 return false;
39589
39590 if (d->testing_p)
39591 return true;
39592
39593 dfirst = *d;
39594 dsecond = *d;
39595 dthird = *d;
39596
39597 dfirst.perm[0] = (d->perm[0] & ~1);
39598 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39599 dfirst.perm[2] = (d->perm[2] & ~1);
39600 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39601 dsecond.perm[0] = (d->perm[1] & ~1);
39602 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39603 dsecond.perm[2] = (d->perm[3] & ~1);
39604 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39605 dthird.perm[0] = (d->perm[0] % 2);
39606 dthird.perm[1] = (d->perm[1] % 2) + 4;
39607 dthird.perm[2] = (d->perm[2] % 2) + 2;
39608 dthird.perm[3] = (d->perm[3] % 2) + 6;
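  /* Worked example (values chosen for illustration): for
     d->perm == { 2, 1, 3, 0 }, the two intermediates hold the original
     elements { 2, 3, 2, 3 } and { 0, 1, 0, 1 }, and dthird becomes
     { 0, 5, 3, 6 }, a vshufpd-style blend of the two intermediates that
     reconstructs { 2, 1, 3, 0 }.  */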
39609
39610 dfirst.target = gen_reg_rtx (dfirst.vmode);
39611 dsecond.target = gen_reg_rtx (dsecond.vmode);
39612 dthird.op0 = dfirst.target;
39613 dthird.op1 = dsecond.target;
39614 dthird.one_operand_p = false;
39615
39616 canonicalize_perm (&dfirst);
39617 canonicalize_perm (&dsecond);
39618
39619 ok = expand_vec_perm_1 (&dfirst)
39620 && expand_vec_perm_1 (&dsecond)
39621 && expand_vec_perm_1 (&dthird);
39622
39623 gcc_assert (ok);
39624
39625 return true;
39626 }
39627
39628 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39629 permutation with two pshufb insns and an ior. We should have already
39630 failed all two instruction sequences. */
39631
39632 static bool
39633 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39634 {
39635 rtx rperm[2][16], vperm, l, h, op, m128;
39636 unsigned int i, nelt, eltsz;
39637
39638 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39639 return false;
39640 gcc_assert (!d->one_operand_p);
39641
39642 nelt = d->nelt;
39643 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39644
39645 /* Generate two permutation masks. If the required element is within
39646 the given vector it is shuffled into the proper lane. If the required
39647 element is in the other vector, force a zero into the lane by setting
39648 bit 7 in the permutation mask. */
39649 m128 = GEN_INT (-128);
39650 for (i = 0; i < nelt; ++i)
39651 {
39652 unsigned j, e = d->perm[i];
39653 unsigned which = (e >= nelt);
39654 if (e >= nelt)
39655 e -= nelt;
39656
39657 for (j = 0; j < eltsz; ++j)
39658 {
39659 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39660 rperm[1-which][i*eltsz + j] = m128;
39661 }
39662 }
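  /* Example (illustrative values): for a V8HImode extract-even
     permutation { 0, 2, 4, 6, 8, 10, 12, 14 } (eltsz == 2), the op0 mask
     gets byte indexes { 0, 1, 4, 5, 8, 9, 12, 13 } in its first eight
     slots and -128 elsewhere, while the op1 mask is the mirror image;
     the two pshufb results are then combined with por below.  */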
39663
39664 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39665 vperm = force_reg (V16QImode, vperm);
39666
39667 l = gen_reg_rtx (V16QImode);
39668 op = gen_lowpart (V16QImode, d->op0);
39669 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39670
39671 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39672 vperm = force_reg (V16QImode, vperm);
39673
39674 h = gen_reg_rtx (V16QImode);
39675 op = gen_lowpart (V16QImode, d->op1);
39676 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39677
39678 op = gen_lowpart (V16QImode, d->target);
39679 emit_insn (gen_iorv16qi3 (op, l, h));
39680
39681 return true;
39682 }
39683
39684 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
39685 with two vpshufb insns, vpermq and vpor. We should have already failed
39686 all two or three instruction sequences. */
39687
39688 static bool
39689 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39690 {
39691 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39692 unsigned int i, nelt, eltsz;
39693
39694 if (!TARGET_AVX2
39695 || !d->one_operand_p
39696 || (d->vmode != V32QImode && d->vmode != V16HImode))
39697 return false;
39698
39699 if (d->testing_p)
39700 return true;
39701
39702 nelt = d->nelt;
39703 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39704
39705 /* Generate two permutation masks. If the required element comes from
39706 the same 128-bit lane, it is shuffled in directly. If it comes from
39707 the other lane, force a zero by setting bit 7 in the permutation mask.
39708 The second mask has non-negative elements only where the element is
39709 requested from the other lane; such elements are also placed into the
39710 opposite lane, so that after swapping the two V2TImode halves of the
39711 vpshufb result they end up in the right position. */
39712 m128 = GEN_INT (-128);
39713 for (i = 0; i < nelt; ++i)
39714 {
39715 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39716 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39717
39718 for (j = 0; j < eltsz; ++j)
39719 {
39720 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39721 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39722 }
39723 }
39724
39725 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39726 vperm = force_reg (V32QImode, vperm);
39727
39728 h = gen_reg_rtx (V32QImode);
39729 op = gen_lowpart (V32QImode, d->op0);
39730 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39731
39732 /* Swap the 128-bit lanes of h into hp. */
39733 hp = gen_reg_rtx (V4DImode);
39734 op = gen_lowpart (V4DImode, h);
39735 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39736 const1_rtx));
39737
39738 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39739 vperm = force_reg (V32QImode, vperm);
39740
39741 l = gen_reg_rtx (V32QImode);
39742 op = gen_lowpart (V32QImode, d->op0);
39743 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39744
39745 op = gen_lowpart (V32QImode, d->target);
39746 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39747
39748 return true;
39749 }
39750
39751 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39752 and extract-odd permutations of two V32QImode or V16HImode operands
39753 with two vpshufb insns, vpor and vpermq. We should have already
39754 failed all two or three instruction sequences. */
39755
39756 static bool
39757 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39758 {
39759 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39760 unsigned int i, nelt, eltsz;
39761
39762 if (!TARGET_AVX2
39763 || d->one_operand_p
39764 || (d->vmode != V32QImode && d->vmode != V16HImode))
39765 return false;
39766
39767 for (i = 0; i < d->nelt; ++i)
39768 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39769 return false;
39770
39771 if (d->testing_p)
39772 return true;
39773
39774 nelt = d->nelt;
39775 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39776
39777 /* Generate two permutation masks. In the first permutation mask
39778 the first quarter will contain indexes for the first half
39779 of the op0, the second quarter will contain bit 7 set, third quarter
39780 will contain indexes for the second half of the op0 and the
39781 last quarter bit 7 set. In the second permutation mask
39782 the first quarter will contain bit 7 set, the second quarter
39783 indexes for the first half of the op1, the third quarter bit 7 set
39784 and last quarter indexes for the second half of the op1.
39785 I.e. the first mask e.g. for V32QImode extract even will be:
39786 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39787 (all values masked with 0xf except for -128) and second mask
39788 for extract even will be
39789 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39790 m128 = GEN_INT (-128);
39791 for (i = 0; i < nelt; ++i)
39792 {
39793 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39794 unsigned which = d->perm[i] >= nelt;
39795 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39796
39797 for (j = 0; j < eltsz; ++j)
39798 {
39799 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39800 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39801 }
39802 }
39803
39804 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39805 vperm = force_reg (V32QImode, vperm);
39806
39807 l = gen_reg_rtx (V32QImode);
39808 op = gen_lowpart (V32QImode, d->op0);
39809 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39810
39811 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39812 vperm = force_reg (V32QImode, vperm);
39813
39814 h = gen_reg_rtx (V32QImode);
39815 op = gen_lowpart (V32QImode, d->op1);
39816 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39817
39818 ior = gen_reg_rtx (V32QImode);
39819 emit_insn (gen_iorv32qi3 (ior, l, h));
39820
39821 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39822 op = gen_lowpart (V4DImode, d->target);
39823 ior = gen_lowpart (V4DImode, ior);
39824 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39825 const1_rtx, GEN_INT (3)));
39826
39827 return true;
39828 }
39829
39830 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
39831 and extract-odd permutations. */
39832
39833 static bool
39834 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39835 {
39836 rtx t1, t2, t3;
39837
39838 switch (d->vmode)
39839 {
39840 case V4DFmode:
39841 t1 = gen_reg_rtx (V4DFmode);
39842 t2 = gen_reg_rtx (V4DFmode);
39843
39844 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39845 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39846 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39847
39848 /* Now an unpck[lh]pd will produce the result required. */
39849 if (odd)
39850 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39851 else
39852 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39853 emit_insn (t3);
39854 break;
39855
39856 case V8SFmode:
39857 {
39858 int mask = odd ? 0xdd : 0x88;
39859
39860 t1 = gen_reg_rtx (V8SFmode);
39861 t2 = gen_reg_rtx (V8SFmode);
39862 t3 = gen_reg_rtx (V8SFmode);
39863
39864 /* Shuffle within the 128-bit lanes to produce:
39865 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39866 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39867 GEN_INT (mask)));
39868
39869 /* Shuffle the lanes around to produce:
39870 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39871 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39872 GEN_INT (0x3)));
39873
39874 /* Shuffle within the 128-bit lanes to produce:
39875 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39876 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39877
39878 /* Shuffle within the 128-bit lanes to produce:
39879 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39880 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39881
39882 /* Shuffle the lanes around to produce:
39883 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39884 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39885 GEN_INT (0x20)));
39886 }
39887 break;
39888
39889 case V2DFmode:
39890 case V4SFmode:
39891 case V2DImode:
39892 case V4SImode:
39893 /* These are always directly implementable by expand_vec_perm_1. */
39894 gcc_unreachable ();
39895
39896 case V8HImode:
39897 if (TARGET_SSSE3)
39898 return expand_vec_perm_pshufb2 (d);
39899 else
39900 {
39901 /* We need 2*log2(N)-1 operations to achieve odd/even
39902 with interleave. */
39903 t1 = gen_reg_rtx (V8HImode);
39904 t2 = gen_reg_rtx (V8HImode);
39905 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39906 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39907 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39908 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39909 if (odd)
39910 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39911 else
39912 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39913 emit_insn (t3);
39914 }
39915 break;
39916
39917 case V16QImode:
39918 if (TARGET_SSSE3)
39919 return expand_vec_perm_pshufb2 (d);
39920 else
39921 {
39922 t1 = gen_reg_rtx (V16QImode);
39923 t2 = gen_reg_rtx (V16QImode);
39924 t3 = gen_reg_rtx (V16QImode);
39925 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39926 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39927 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39928 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39929 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39930 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
39931 if (odd)
39932 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
39933 else
39934 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
39935 emit_insn (t3);
39936 }
39937 break;
39938
39939 case V16HImode:
39940 case V32QImode:
39941 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
39942
39943 case V4DImode:
39944 if (!TARGET_AVX2)
39945 {
39946 struct expand_vec_perm_d d_copy = *d;
39947 d_copy.vmode = V4DFmode;
39948 d_copy.target = gen_lowpart (V4DFmode, d->target);
39949 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
39950 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39951 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39952 }
39953
39954 t1 = gen_reg_rtx (V4DImode);
39955 t2 = gen_reg_rtx (V4DImode);
39956
39957 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39958 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39959 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39960
39961 /* Now a vpunpck[lh]qdq will produce the result required. */
39962 if (odd)
39963 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39964 else
39965 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39966 emit_insn (t3);
39967 break;
39968
39969 case V8SImode:
39970 if (!TARGET_AVX2)
39971 {
39972 struct expand_vec_perm_d d_copy = *d;
39973 d_copy.vmode = V8SFmode;
39974 d_copy.target = gen_lowpart (V8SFmode, d->target);
39975 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39976 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39977 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39978 }
39979
39980 t1 = gen_reg_rtx (V8SImode);
39981 t2 = gen_reg_rtx (V8SImode);
39982
39983 /* Shuffle the lanes around into
39984 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39985 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39986 gen_lowpart (V4DImode, d->op0),
39987 gen_lowpart (V4DImode, d->op1),
39988 GEN_INT (0x20)));
39989 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39990 gen_lowpart (V4DImode, d->op0),
39991 gen_lowpart (V4DImode, d->op1),
39992 GEN_INT (0x31)));
39993
39994 /* Swap the 2nd and 3rd position in each lane into
39995 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39996 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39997 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39998 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39999 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40000
40001 /* Now a vpunpck[lh]qdq will produce
40002 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40003 if (odd)
40004 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40005 gen_lowpart (V4DImode, t1),
40006 gen_lowpart (V4DImode, t2));
40007 else
40008 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40009 gen_lowpart (V4DImode, t1),
40010 gen_lowpart (V4DImode, t2));
40011 emit_insn (t3);
40012 break;
40013
40014 default:
40015 gcc_unreachable ();
40016 }
40017
40018 return true;
40019 }
40020
40021 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40022 extract-even and extract-odd permutations. */
40023
40024 static bool
40025 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40026 {
40027 unsigned i, odd, nelt = d->nelt;
40028
40029 odd = d->perm[0];
40030 if (odd != 0 && odd != 1)
40031 return false;
40032
40033 for (i = 1; i < nelt; ++i)
40034 if (d->perm[i] != 2 * i + odd)
40035 return false;
40036
40037 return expand_vec_perm_even_odd_1 (d, odd);
40038 }
40039
40040 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40041 permutations. We assume that expand_vec_perm_1 has already failed. */
40042
40043 static bool
40044 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40045 {
40046 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40047 enum machine_mode vmode = d->vmode;
40048 unsigned char perm2[4];
40049 rtx op0 = d->op0;
40050 bool ok;
40051
40052 switch (vmode)
40053 {
40054 case V4DFmode:
40055 case V8SFmode:
40056 /* These are special-cased in sse.md so that we can optionally
40057 use the vbroadcast instruction. They expand to two insns
40058 if the input happens to be in a register. */
40059 gcc_unreachable ();
40060
40061 case V2DFmode:
40062 case V2DImode:
40063 case V4SFmode:
40064 case V4SImode:
40065 /* These are always implementable using standard shuffle patterns. */
40066 gcc_unreachable ();
40067
40068 case V8HImode:
40069 case V16QImode:
40070 /* These can be implemented via interleave. We save one insn by
40071 stopping once we have promoted to V4SImode and then use pshufd. */
40072 do
40073 {
40074 rtx dest;
40075 rtx (*gen) (rtx, rtx, rtx)
40076 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40077 : gen_vec_interleave_lowv8hi;
40078
40079 if (elt >= nelt2)
40080 {
40081 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40082 : gen_vec_interleave_highv8hi;
40083 elt -= nelt2;
40084 }
40085 nelt2 /= 2;
40086
40087 dest = gen_reg_rtx (vmode);
40088 emit_insn (gen (dest, op0, op0));
40089 vmode = get_mode_wider_vector (vmode);
40090 op0 = gen_lowpart (vmode, dest);
40091 }
40092 while (vmode != V4SImode);
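      /* Example (illustrative values): broadcasting element 3 of a
	 V8HImode vector takes one interleave-low step, which leaves the
	 wanted value in SImode element 3 of the widened vector; the pshufd
	 emitted below with { 3, 3, 3, 3 } then finishes the broadcast.  */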
40093
40094 memset (perm2, elt, 4);
40095 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40096 d->testing_p);
40097 gcc_assert (ok);
40098 return true;
40099
40100 case V32QImode:
40101 case V16HImode:
40102 case V8SImode:
40103 case V4DImode:
40104 /* For AVX2 broadcasts of the first element vpbroadcast* or
40105 vpermq should be used by expand_vec_perm_1. */
40106 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40107 return false;
40108
40109 default:
40110 gcc_unreachable ();
40111 }
40112 }
40113
40114 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40115 broadcast permutations. */
40116
40117 static bool
40118 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40119 {
40120 unsigned i, elt, nelt = d->nelt;
40121
40122 if (!d->one_operand_p)
40123 return false;
40124
40125 elt = d->perm[0];
40126 for (i = 1; i < nelt; ++i)
40127 if (d->perm[i] != elt)
40128 return false;
40129
40130 return expand_vec_perm_broadcast_1 (d);
40131 }
40132
40133 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
40134 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40135 all the shorter instruction sequences. */
40136
40137 static bool
40138 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40139 {
40140 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40141 unsigned int i, nelt, eltsz;
40142 bool used[4];
40143
40144 if (!TARGET_AVX2
40145 || d->one_operand_p
40146 || (d->vmode != V32QImode && d->vmode != V16HImode))
40147 return false;
40148
40149 if (d->testing_p)
40150 return true;
40151
40152 nelt = d->nelt;
40153 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40154
40155 /* Generate 4 permutation masks, two for each operand. If the required
40156 element comes from the same 128-bit lane, it is shuffled in directly.
40157 If it comes from the other lane, force a zero by setting bit 7 in the
40158 permutation mask. The cross-lane masks have non-negative elements only
40159 where the element is requested from the other lane; such elements are
40160 also placed into the opposite lane, so that after swapping the two
40161 V2TImode halves of the vpshufb result they end up in the right position. */
40162 m128 = GEN_INT (-128);
40163 for (i = 0; i < 32; ++i)
40164 {
40165 rperm[0][i] = m128;
40166 rperm[1][i] = m128;
40167 rperm[2][i] = m128;
40168 rperm[3][i] = m128;
40169 }
40170 used[0] = false;
40171 used[1] = false;
40172 used[2] = false;
40173 used[3] = false;
40174 for (i = 0; i < nelt; ++i)
40175 {
40176 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40177 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40178 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40179
40180 for (j = 0; j < eltsz; ++j)
40181 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40182 used[which] = true;
40183 }
40184
40185 for (i = 0; i < 2; ++i)
40186 {
40187 if (!used[2 * i + 1])
40188 {
40189 h[i] = NULL_RTX;
40190 continue;
40191 }
40192 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40193 gen_rtvec_v (32, rperm[2 * i + 1]));
40194 vperm = force_reg (V32QImode, vperm);
40195 h[i] = gen_reg_rtx (V32QImode);
40196 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40197 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40198 }
40199
40200 /* Swap the 128-bit lanes of h[X]. */
40201 for (i = 0; i < 2; ++i)
40202 {
40203 if (h[i] == NULL_RTX)
40204 continue;
40205 op = gen_reg_rtx (V4DImode);
40206 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40207 const2_rtx, GEN_INT (3), const0_rtx,
40208 const1_rtx));
40209 h[i] = gen_lowpart (V32QImode, op);
40210 }
40211
40212 for (i = 0; i < 2; ++i)
40213 {
40214 if (!used[2 * i])
40215 {
40216 l[i] = NULL_RTX;
40217 continue;
40218 }
40219 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40220 vperm = force_reg (V32QImode, vperm);
40221 l[i] = gen_reg_rtx (V32QImode);
40222 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40223 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40224 }
40225
40226 for (i = 0; i < 2; ++i)
40227 {
40228 if (h[i] && l[i])
40229 {
40230 op = gen_reg_rtx (V32QImode);
40231 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40232 l[i] = op;
40233 }
40234 else if (h[i])
40235 l[i] = h[i];
40236 }
40237
40238 gcc_assert (l[0] && l[1]);
40239 op = gen_lowpart (V32QImode, d->target);
40240 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40241 return true;
40242 }
40243
40244 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40245 With all of the interface bits taken care of, perform the expansion
40246 in D and return true on success. */
40247
40248 static bool
40249 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40250 {
40251 /* Try a single instruction expansion. */
40252 if (expand_vec_perm_1 (d))
40253 return true;
40254
40255 /* Try sequences of two instructions. */
40256
40257 if (expand_vec_perm_pshuflw_pshufhw (d))
40258 return true;
40259
40260 if (expand_vec_perm_palignr (d))
40261 return true;
40262
40263 if (expand_vec_perm_interleave2 (d))
40264 return true;
40265
40266 if (expand_vec_perm_broadcast (d))
40267 return true;
40268
40269 if (expand_vec_perm_vpermq_perm_1 (d))
40270 return true;
40271
40272 if (expand_vec_perm_vperm2f128 (d))
40273 return true;
40274
40275 /* Try sequences of three instructions. */
40276
40277 if (expand_vec_perm_2vperm2f128_vshuf (d))
40278 return true;
40279
40280 if (expand_vec_perm_pshufb2 (d))
40281 return true;
40282
40283 if (expand_vec_perm_interleave3 (d))
40284 return true;
40285
40286 if (expand_vec_perm_vperm2f128_vblend (d))
40287 return true;
40288
40289 /* Try sequences of four instructions. */
40290
40291 if (expand_vec_perm_vpshufb2_vpermq (d))
40292 return true;
40293
40294 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40295 return true;
40296
40297 /* ??? Look for narrow permutations whose element orderings would
40298 allow the promotion to a wider mode. */
40299
40300 /* ??? Look for sequences of interleave or a wider permute that place
40301 the data into the correct lanes for a half-vector shuffle like
40302 pshuf[lh]w or vpermilps. */
40303
40304 /* ??? Look for sequences of interleave that produce the desired results.
40305 The combinatorics of punpck[lh] get pretty ugly... */
40306
40307 if (expand_vec_perm_even_odd (d))
40308 return true;
40309
40310 /* Even longer sequences. */
40311 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40312 return true;
40313
40314 return false;
40315 }
40316
40317 /* If a permutation only uses one operand, make it clear. Returns true
40318 if the permutation references both operands. */
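 /* For example, with nelt == 4 a selector of { 5, 6, 7, 4 } references only
    the second operand (which == 2); it is folded to { 1, 2, 3, 0 } applied
    to op0 after op1 is copied into op0.  */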
40319
40320 static bool
40321 canonicalize_perm (struct expand_vec_perm_d *d)
40322 {
40323 int i, which, nelt = d->nelt;
40324
40325 for (i = which = 0; i < nelt; ++i)
40326 which |= (d->perm[i] < nelt ? 1 : 2);
40327
40328 d->one_operand_p = true;
40329 switch (which)
40330 {
40331 default:
40332 gcc_unreachable();
40333
40334 case 3:
40335 if (!rtx_equal_p (d->op0, d->op1))
40336 {
40337 d->one_operand_p = false;
40338 break;
40339 }
40340 /* The elements of PERM do not suggest that only the first operand
40341 is used, but both operands are identical. Allow easier matching
40342 of the permutation by folding the permutation into the single
40343 input vector. */
40344 /* FALLTHRU */
40345
40346 case 2:
40347 for (i = 0; i < nelt; ++i)
40348 d->perm[i] &= nelt - 1;
40349 d->op0 = d->op1;
40350 break;
40351
40352 case 1:
40353 d->op1 = d->op0;
40354 break;
40355 }
40356
40357 return (which == 3);
40358 }
40359
40360 bool
40361 ix86_expand_vec_perm_const (rtx operands[4])
40362 {
40363 struct expand_vec_perm_d d;
40364 unsigned char perm[MAX_VECT_LEN];
40365 int i, nelt;
40366 bool two_args;
40367 rtx sel;
40368
40369 d.target = operands[0];
40370 d.op0 = operands[1];
40371 d.op1 = operands[2];
40372 sel = operands[3];
40373
40374 d.vmode = GET_MODE (d.target);
40375 gcc_assert (VECTOR_MODE_P (d.vmode));
40376 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40377 d.testing_p = false;
40378
40379 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40380 gcc_assert (XVECLEN (sel, 0) == nelt);
40381 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40382
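 /* Extract the selector, masking each index into the valid 0 .. 2*nelt-1
    range; PERM keeps an unfolded copy for the retry below.  */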
40383 for (i = 0; i < nelt; ++i)
40384 {
40385 rtx e = XVECEXP (sel, 0, i);
40386 int ei = INTVAL (e) & (2 * nelt - 1);
40387 d.perm[i] = ei;
40388 perm[i] = ei;
40389 }
40390
40391 two_args = canonicalize_perm (&d);
40392
40393 if (ix86_expand_vec_perm_const_1 (&d))
40394 return true;
40395
40396 /* If the selector says both arguments are needed, but the operands are the
40397 same, the above tried to expand with one_operand_p and flattened selector.
40398 If that didn't work, retry without one_operand_p; we succeeded with that
40399 during testing. */
40400 if (two_args && d.one_operand_p)
40401 {
40402 d.one_operand_p = false;
40403 memcpy (d.perm, perm, sizeof (perm));
40404 return ix86_expand_vec_perm_const_1 (&d);
40405 }
40406
40407 return false;
40408 }
40409
40410 /* Implement targetm.vectorize.vec_perm_const_ok. */
40411
40412 static bool
40413 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40414 const unsigned char *sel)
40415 {
40416 struct expand_vec_perm_d d;
40417 unsigned int i, nelt, which;
40418 bool ret;
40419
40420 d.vmode = vmode;
40421 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40422 d.testing_p = true;
40423
40424 /* Given sufficient ISA support we can just return true here
40425 for selected vector modes. */
40426 if (GET_MODE_SIZE (d.vmode) == 16)
40427 {
40428 /* All implementable with a single vpperm insn. */
40429 if (TARGET_XOP)
40430 return true;
40431 /* All implementable with 2 pshufb + 1 ior. */
40432 if (TARGET_SSSE3)
40433 return true;
40434 /* All implementable with shufpd or unpck[lh]pd. */
40435 if (d.nelt == 2)
40436 return true;
40437 }
40438
40439 /* Extract the values from the vector CST into the permutation
40440 array in D. */
40441 memcpy (d.perm, sel, nelt);
40442 for (i = which = 0; i < nelt; ++i)
40443 {
40444 unsigned char e = d.perm[i];
40445 gcc_assert (e < 2 * nelt);
40446 which |= (e < nelt ? 1 : 2);
40447 }
40448
40449 /* If all elements are from the second vector, fold them to the first. */
40450 if (which == 2)
40451 for (i = 0; i < nelt; ++i)
40452 d.perm[i] -= nelt;
40453
40454 /* Check whether the mask can be applied to the vector type. */
40455 d.one_operand_p = (which != 3);
40456
40457 /* Implementable with shufps or pshufd. */
40458 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40459 return true;
40460
40461 /* Otherwise we have to go through the motions and see if we can
40462 figure out how to generate the requested permutation. */
40463 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40464 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40465 if (!d.one_operand_p)
40466 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40467
40468 start_sequence ();
40469 ret = ix86_expand_vec_perm_const_1 (&d);
40470 end_sequence ();
40471
40472 return ret;
40473 }
40474
40475 void
40476 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40477 {
40478 struct expand_vec_perm_d d;
40479 unsigned i, nelt;
40480
40481 d.target = targ;
40482 d.op0 = op0;
40483 d.op1 = op1;
40484 d.vmode = GET_MODE (targ);
40485 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40486 d.one_operand_p = false;
40487 d.testing_p = false;
40488
40489 for (i = 0; i < nelt; ++i)
40490 d.perm[i] = i * 2 + odd;
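 /* E.g. for nelt == 4 and ODD == 1 this builds { 1, 3, 5, 7 }, the odd
    elements of the double-width concatenation of OP0 and OP1.  */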
40491
40492 /* We'll either be able to implement the permutation directly... */
40493 if (expand_vec_perm_1 (&d))
40494 return;
40495
40496 /* ... or we use the special-case patterns. */
40497 expand_vec_perm_even_odd_1 (&d, odd);
40498 }
40499
40500 static void
40501 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40502 {
40503 struct expand_vec_perm_d d;
40504 unsigned i, nelt, base;
40505 bool ok;
40506
40507 d.target = targ;
40508 d.op0 = op0;
40509 d.op1 = op1;
40510 d.vmode = GET_MODE (targ);
40511 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40512 d.one_operand_p = false;
40513 d.testing_p = false;
40514
40515 base = high_p ? nelt / 2 : 0;
40516 for (i = 0; i < nelt / 2; ++i)
40517 {
40518 d.perm[i * 2] = i + base;
40519 d.perm[i * 2 + 1] = i + base + nelt;
40520 }
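 /* E.g. for nelt == 4 and high_p == false this builds { 0, 4, 1, 5 },
    i.e. the interleave of the low halves of OP0 and OP1.  */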
40521
40522 /* Note that for AVX this isn't one instruction. */
40523 ok = ix86_expand_vec_perm_const_1 (&d);
40524 gcc_assert (ok);
40525 }
40526
40527
40528 /* Expand a vector operation CODE for a V*QImode in terms of the
40529 same operation on V*HImode. */
40530
40531 void
40532 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40533 {
40534 enum machine_mode qimode = GET_MODE (dest);
40535 enum machine_mode himode;
40536 rtx (*gen_il) (rtx, rtx, rtx);
40537 rtx (*gen_ih) (rtx, rtx, rtx);
40538 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40539 struct expand_vec_perm_d d;
40540 bool ok, full_interleave;
40541 bool uns_p = false;
40542 int i;
40543
40544 switch (qimode)
40545 {
40546 case V16QImode:
40547 himode = V8HImode;
40548 gen_il = gen_vec_interleave_lowv16qi;
40549 gen_ih = gen_vec_interleave_highv16qi;
40550 break;
40551 case V32QImode:
40552 himode = V16HImode;
40553 gen_il = gen_avx2_interleave_lowv32qi;
40554 gen_ih = gen_avx2_interleave_highv32qi;
40555 break;
40556 default:
40557 gcc_unreachable ();
40558 }
40559
40560 op2_l = op2_h = op2;
40561 switch (code)
40562 {
40563 case MULT:
40564 /* Unpack data such that we've got a source byte in each low byte of
40565 each word. We don't care what goes into the high byte of each word.
40566 Rather than trying to get zero in there, most convenient is to let
40567 it be a copy of the low byte. */
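 /* A quick check of why the copy is safe: each word then holds 257*a and
    257*b, and since 257 == 1 (mod 256) the low byte of the word product is
    still a*b mod 256, which is all we keep.  */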
40568 op2_l = gen_reg_rtx (qimode);
40569 op2_h = gen_reg_rtx (qimode);
40570 emit_insn (gen_il (op2_l, op2, op2));
40571 emit_insn (gen_ih (op2_h, op2, op2));
40572 /* FALLTHRU */
40573
40574 op1_l = gen_reg_rtx (qimode);
40575 op1_h = gen_reg_rtx (qimode);
40576 emit_insn (gen_il (op1_l, op1, op1));
40577 emit_insn (gen_ih (op1_h, op1, op1));
40578 full_interleave = qimode == V16QImode;
40579 break;
40580
40581 case ASHIFT:
40582 case LSHIFTRT:
40583 uns_p = true;
40584 /* FALLTHRU */
40585 case ASHIFTRT:
40586 op1_l = gen_reg_rtx (himode);
40587 op1_h = gen_reg_rtx (himode);
40588 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40589 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40590 full_interleave = true;
40591 break;
40592 default:
40593 gcc_unreachable ();
40594 }
40595
40596 /* Perform the operation. */
40597 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40598 1, OPTAB_DIRECT);
40599 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40600 1, OPTAB_DIRECT);
40601 gcc_assert (res_l && res_h);
40602
40603 /* Merge the data back into the right place. */
40604 d.target = dest;
40605 d.op0 = gen_lowpart (qimode, res_l);
40606 d.op1 = gen_lowpart (qimode, res_h);
40607 d.vmode = qimode;
40608 d.nelt = GET_MODE_NUNITS (qimode);
40609 d.one_operand_p = false;
40610 d.testing_p = false;
40611
40612 if (full_interleave)
40613 {
40614 /* For SSE2, we used a full interleave, so the desired
40615 results are in the even elements. */
40616 for (i = 0; i < 32; ++i)
40617 d.perm[i] = i * 2;
40618 }
40619 else
40620 {
40621 /* For AVX, the interleave used above was not cross-lane. So the
40622 extraction is of the even elements, but with the second and third
40623 quarters swapped. Happily, that is even one insn shorter than a plain even extraction. */
40624 for (i = 0; i < 32; ++i)
40625 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
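 /* Concretely, the result quarters become { op0 lane0 evens, op1 lane0 evens,
    op0 lane1 evens, op1 lane1 evens }, i.e. the plain even extraction with
    its middle two quarters exchanged.  */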
40626 }
40627
40628 ok = ix86_expand_vec_perm_const_1 (&d);
40629 gcc_assert (ok);
40630
40631 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40632 gen_rtx_fmt_ee (code, qimode, op1, op2));
40633 }
40634
40635 void
40636 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40637 bool uns_p, bool odd_p)
40638 {
40639 enum machine_mode mode = GET_MODE (op1);
40640 enum machine_mode wmode = GET_MODE (dest);
40641 rtx x;
40642
40643 /* We only play even/odd games with vectors of SImode. */
40644 gcc_assert (mode == V4SImode || mode == V8SImode);
40645
40646 /* If we're looking for the odd results, shift those members down to
40647 the even slots. For some cpus this is faster than a PSHUFD. */
40648 if (odd_p)
40649 {
40650 if (TARGET_XOP && mode == V4SImode)
40651 {
40652 x = force_reg (wmode, CONST0_RTX (wmode));
40653 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40654 return;
40655 }
40656
40657 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40658 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40659 x, NULL, 1, OPTAB_DIRECT);
40660 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40661 x, NULL, 1, OPTAB_DIRECT);
40662 op1 = gen_lowpart (mode, op1);
40663 op2 = gen_lowpart (mode, op2);
40664 }
40665
40666 if (mode == V8SImode)
40667 {
40668 if (uns_p)
40669 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40670 else
40671 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40672 }
40673 else if (uns_p)
40674 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40675 else if (TARGET_SSE4_1)
40676 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40677 else
40678 {
40679 rtx s1, s2, t0, t1, t2;
40680
40681 /* The easiest way to implement this without PMULDQ is to go through
40682 the motions as if we are performing a full 64-bit multiply. With
40683 the exception that we need to do less shuffling of the elements. */
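 /* A sketch of the arithmetic, with a and b the 32-bit inputs viewed as
    unsigned:  A * B == a*b - (A < 0 ? b << 32 : 0) - (B < 0 ? a << 32 : 0)
    (mod 2^64); the two correction terms are what the S1/S2 mask multiplies
    below contribute once shifted left by 32.  */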
40684
40685 /* Compute the sign-extension, aka highparts, of the two operands. */
40686 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40687 op1, pc_rtx, pc_rtx);
40688 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40689 op2, pc_rtx, pc_rtx);
40690
40691 /* Multiply LO(A) * HI(B), and vice-versa. */
40692 t1 = gen_reg_rtx (wmode);
40693 t2 = gen_reg_rtx (wmode);
40694 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40695 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40696
40697 /* Multiply LO(A) * LO(B). */
40698 t0 = gen_reg_rtx (wmode);
40699 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40700
40701 /* Combine and shift the highparts into place. */
40702 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40703 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40704 1, OPTAB_DIRECT);
40705
40706 /* Combine high and low parts. */
40707 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40708 return;
40709 }
40710 emit_insn (x);
40711 }
40712
40713 void
40714 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40715 bool uns_p, bool high_p)
40716 {
40717 enum machine_mode wmode = GET_MODE (dest);
40718 enum machine_mode mode = GET_MODE (op1);
40719 rtx t1, t2, t3, t4, mask;
40720
40721 switch (mode)
40722 {
40723 case V4SImode:
40724 t1 = gen_reg_rtx (mode);
40725 t2 = gen_reg_rtx (mode);
40726 if (TARGET_XOP && !uns_p)
40727 {
40728 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40729 shuffle the elements once so that all elements are in the right
40730 place for immediate use: { A C B D }. */
40731 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40732 const1_rtx, GEN_INT (3)));
40733 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40734 const1_rtx, GEN_INT (3)));
40735 }
40736 else
40737 {
40738 /* Put the elements into place for the multiply. */
40739 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40740 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40741 high_p = false;
40742 }
40743 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40744 break;
40745
40746 case V8SImode:
40747 /* Shuffle the elements between the lanes. After this we
40748 have { A B E F | C D G H } for each operand. */
40749 t1 = gen_reg_rtx (V4DImode);
40750 t2 = gen_reg_rtx (V4DImode);
40751 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40752 const0_rtx, const2_rtx,
40753 const1_rtx, GEN_INT (3)));
40754 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40755 const0_rtx, const2_rtx,
40756 const1_rtx, GEN_INT (3)));
40757
40758 /* Shuffle the elements within the lanes. After this we
40759 have { A A B B | C C D D } or { E E F F | G G H H }. */
40760 t3 = gen_reg_rtx (V8SImode);
40761 t4 = gen_reg_rtx (V8SImode);
40762 mask = GEN_INT (high_p
40763 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40764 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
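 /* For reference, the high_p mask evaluates to 0xfa and selects elements
    { 2, 2, 3, 3 } within each lane; the low mask is 0x50 and selects
    { 0, 0, 1, 1 }.  */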
40765 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40766 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40767
40768 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40769 break;
40770
40771 case V8HImode:
40772 case V16HImode:
40773 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40774 uns_p, OPTAB_DIRECT);
40775 t2 = expand_binop (mode,
40776 uns_p ? umul_highpart_optab : smul_highpart_optab,
40777 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40778 gcc_assert (t1 && t2);
40779
40780 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40781 break;
40782
40783 case V16QImode:
40784 case V32QImode:
40785 t1 = gen_reg_rtx (wmode);
40786 t2 = gen_reg_rtx (wmode);
40787 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40788 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40789
40790 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40791 break;
40792
40793 default:
40794 gcc_unreachable ();
40795 }
40796 }
40797
40798 void
40799 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40800 {
40801 rtx res_1, res_2;
40802
40803 res_1 = gen_reg_rtx (V4SImode);
40804 res_2 = gen_reg_rtx (V4SImode);
40805 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40806 op1, op2, true, false);
40807 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40808 op1, op2, true, true);
40809
40810 /* Move the results in element 2 down to element 1; we don't care
40811 what goes in elements 2 and 3. Then we can merge the parts
40812 back together with an interleave.
40813
40814 Note that two other sequences were tried:
40815 (1) Use interleaves at the start instead of psrldq, which allows
40816 us to use a single shufps to merge things back at the end.
40817 (2) Use shufps here to combine the two vectors, then pshufd to
40818 put the elements in the correct order.
40819 In both cases the cost of the reformatting stall was too high
40820 and the overall sequence slower. */
40821
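 /* Viewed as V4SI, res_1 holds { lo(A0*B0), hi, lo(A2*B2), hi } and res_2
    holds { lo(A1*B1), hi, lo(A3*B3), hi }.  The pshufd's pull elements 0
    and 2 into positions 0 and 1, and the interleave-low then yields
    { lo(A0*B0), lo(A1*B1), lo(A2*B2), lo(A3*B3) }.  */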
40822 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40823 const0_rtx, const0_rtx));
40824 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40825 const0_rtx, const0_rtx));
40826 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40827
40828 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40829 }
40830
40831 void
40832 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40833 {
40834 enum machine_mode mode = GET_MODE (op0);
40835 rtx t1, t2, t3, t4, t5, t6;
40836
40837 if (TARGET_XOP && mode == V2DImode)
40838 {
40839 /* op1: A,B,C,D, op2: E,F,G,H */
40840 op1 = gen_lowpart (V4SImode, op1);
40841 op2 = gen_lowpart (V4SImode, op2);
40842
40843 t1 = gen_reg_rtx (V4SImode);
40844 t2 = gen_reg_rtx (V4SImode);
40845 t3 = gen_reg_rtx (V2DImode);
40846 t4 = gen_reg_rtx (V2DImode);
40847
40848 /* t1: B,A,D,C */
40849 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40850 GEN_INT (1),
40851 GEN_INT (0),
40852 GEN_INT (3),
40853 GEN_INT (2)));
40854
40855 /* t2: (B*E),(A*F),(D*G),(C*H) */
40856 emit_insn (gen_mulv4si3 (t2, t1, op2));
40857
40858 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40859 emit_insn (gen_xop_phadddq (t3, t2));
40860
40861 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40862 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40863
40864 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40865 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40866 }
40867 else
40868 {
40869 enum machine_mode nmode;
40870 rtx (*umul) (rtx, rtx, rtx);
40871
40872 if (mode == V2DImode)
40873 {
40874 umul = gen_vec_widen_umult_even_v4si;
40875 nmode = V4SImode;
40876 }
40877 else if (mode == V4DImode)
40878 {
40879 umul = gen_vec_widen_umult_even_v8si;
40880 nmode = V8SImode;
40881 }
40882 else
40883 gcc_unreachable ();
40884
40885
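 /* A sketch of the schoolbook expansion used below, splitting each operand
    into 32-bit halves (op == hi * 2^32 + lo):
      op1 * op2 == lo1*lo2 + ((hi1*lo2 + hi2*lo1) << 32)   (mod 2^64).  */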
40886 /* Multiply low parts. */
40887 t1 = gen_reg_rtx (mode);
40888 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40889
40890 /* Shift input vectors right 32 bits so we can multiply high parts. */
40891 t6 = GEN_INT (32);
40892 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40893 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40894
40895 /* Multiply high parts by low parts. */
40896 t4 = gen_reg_rtx (mode);
40897 t5 = gen_reg_rtx (mode);
40898 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40899 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40900
40901 /* Combine and shift the highparts back. */
40902 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40903 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40904
40905 /* Combine high and low parts. */
40906 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40907 }
40908
40909 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40910 gen_rtx_MULT (mode, op1, op2));
40911 }
40912
40913 /* Expand an insert into a vector register through pinsr insn.
40914 Return true if successful. */
40915
40916 bool
40917 ix86_expand_pinsr (rtx *operands)
40918 {
40919 rtx dst = operands[0];
40920 rtx src = operands[3];
40921
40922 unsigned int size = INTVAL (operands[1]);
40923 unsigned int pos = INTVAL (operands[2]);
40924
40925 if (GET_CODE (dst) == SUBREG)
40926 {
40927 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40928 dst = SUBREG_REG (dst);
40929 }
40930
40931 if (GET_CODE (src) == SUBREG)
40932 src = SUBREG_REG (src);
40933
40934 switch (GET_MODE (dst))
40935 {
40936 case V16QImode:
40937 case V8HImode:
40938 case V4SImode:
40939 case V2DImode:
40940 {
40941 enum machine_mode srcmode, dstmode;
40942 rtx (*pinsr)(rtx, rtx, rtx, rtx);
40943
40944 srcmode = mode_for_size (size, MODE_INT, 0);
40945
40946 switch (srcmode)
40947 {
40948 case QImode:
40949 if (!TARGET_SSE4_1)
40950 return false;
40951 dstmode = V16QImode;
40952 pinsr = gen_sse4_1_pinsrb;
40953 break;
40954
40955 case HImode:
40956 if (!TARGET_SSE2)
40957 return false;
40958 dstmode = V8HImode;
40959 pinsr = gen_sse2_pinsrw;
40960 break;
40961
40962 case SImode:
40963 if (!TARGET_SSE4_1)
40964 return false;
40965 dstmode = V4SImode;
40966 pinsr = gen_sse4_1_pinsrd;
40967 break;
40968
40969 case DImode:
40970 gcc_assert (TARGET_64BIT);
40971 if (!TARGET_SSE4_1)
40972 return false;
40973 dstmode = V2DImode;
40974 pinsr = gen_sse4_1_pinsrq;
40975 break;
40976
40977 default:
40978 return false;
40979 }
40980
40981 dst = gen_lowpart (dstmode, dst);
40982 src = gen_lowpart (srcmode, src);
40983
40984 pos /= size;
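 /* POS is now an element index rather than a bit offset; the pinsr patterns
    used here take the selector as a one-hot mask, which is presumably why
    1 << pos is passed below.  */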
40985
40986 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40987 return true;
40988 }
40989
40990 default:
40991 return false;
40992 }
40993 }
40994 \f
40995 /* This function returns the calling abi specific va_list type node.
40996 It returns the FNDECL specific va_list type. */
40997
40998 static tree
40999 ix86_fn_abi_va_list (tree fndecl)
41000 {
41001 if (!TARGET_64BIT)
41002 return va_list_type_node;
41003 gcc_assert (fndecl != NULL_TREE);
41004
41005 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41006 return ms_va_list_type_node;
41007 else
41008 return sysv_va_list_type_node;
41009 }
41010
41011 /* Returns the canonical va_list type specified by TYPE. If there
41012 is no valid TYPE provided, it returns NULL_TREE. */
41013
41014 static tree
41015 ix86_canonical_va_list_type (tree type)
41016 {
41017 tree wtype, htype;
41018
41019 /* Resolve references and pointers to va_list type. */
41020 if (TREE_CODE (type) == MEM_REF)
41021 type = TREE_TYPE (type);
41022 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41023 type = TREE_TYPE (type);
41024 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41025 type = TREE_TYPE (type);
41026
41027 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41028 {
41029 wtype = va_list_type_node;
41030 gcc_assert (wtype != NULL_TREE);
41031 htype = type;
41032 if (TREE_CODE (wtype) == ARRAY_TYPE)
41033 {
41034 /* If va_list is an array type, the argument may have decayed
41035 to a pointer type, e.g. by being passed to another function.
41036 In that case, unwrap both types so that we can compare the
41037 underlying records. */
41038 if (TREE_CODE (htype) == ARRAY_TYPE
41039 || POINTER_TYPE_P (htype))
41040 {
41041 wtype = TREE_TYPE (wtype);
41042 htype = TREE_TYPE (htype);
41043 }
41044 }
41045 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41046 return va_list_type_node;
41047 wtype = sysv_va_list_type_node;
41048 gcc_assert (wtype != NULL_TREE);
41049 htype = type;
41050 if (TREE_CODE (wtype) == ARRAY_TYPE)
41051 {
41052 /* If va_list is an array type, the argument may have decayed
41053 to a pointer type, e.g. by being passed to another function.
41054 In that case, unwrap both types so that we can compare the
41055 underlying records. */
41056 if (TREE_CODE (htype) == ARRAY_TYPE
41057 || POINTER_TYPE_P (htype))
41058 {
41059 wtype = TREE_TYPE (wtype);
41060 htype = TREE_TYPE (htype);
41061 }
41062 }
41063 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41064 return sysv_va_list_type_node;
41065 wtype = ms_va_list_type_node;
41066 gcc_assert (wtype != NULL_TREE);
41067 htype = type;
41068 if (TREE_CODE (wtype) == ARRAY_TYPE)
41069 {
41070 /* If va_list is an array type, the argument may have decayed
41071 to a pointer type, e.g. by being passed to another function.
41072 In that case, unwrap both types so that we can compare the
41073 underlying records. */
41074 if (TREE_CODE (htype) == ARRAY_TYPE
41075 || POINTER_TYPE_P (htype))
41076 {
41077 wtype = TREE_TYPE (wtype);
41078 htype = TREE_TYPE (htype);
41079 }
41080 }
41081 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41082 return ms_va_list_type_node;
41083 return NULL_TREE;
41084 }
41085 return std_canonical_va_list_type (type);
41086 }
41087
41088 /* Iterate through the target-specific builtin types for va_list.
41089 IDX denotes the iterator, *PTREE is set to the result type of
41090 the va_list builtin, and *PNAME to its internal type.
41091 Returns zero if there is no element for this index, otherwise
41092 IDX should be increased upon the next call.
41093 Note, do not iterate a base builtin's name like __builtin_va_list.
41094 Used from c_common_nodes_and_builtins. */
41095
41096 static int
41097 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41098 {
41099 if (TARGET_64BIT)
41100 {
41101 switch (idx)
41102 {
41103 default:
41104 break;
41105
41106 case 0:
41107 *ptree = ms_va_list_type_node;
41108 *pname = "__builtin_ms_va_list";
41109 return 1;
41110
41111 case 1:
41112 *ptree = sysv_va_list_type_node;
41113 *pname = "__builtin_sysv_va_list";
41114 return 1;
41115 }
41116 }
41117
41118 return 0;
41119 }
41120
41121 #undef TARGET_SCHED_DISPATCH
41122 #define TARGET_SCHED_DISPATCH has_dispatch
41123 #undef TARGET_SCHED_DISPATCH_DO
41124 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41125 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41126 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41127 #undef TARGET_SCHED_REORDER
41128 #define TARGET_SCHED_REORDER ix86_sched_reorder
41129 #undef TARGET_SCHED_ADJUST_PRIORITY
41130 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41131 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41132 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41133
41134 /* The size of the dispatch window is the total number of bytes of
41135 object code allowed in a window. */
41136 #define DISPATCH_WINDOW_SIZE 16
41137
41138 /* Number of dispatch windows considered for scheduling. */
41139 #define MAX_DISPATCH_WINDOWS 3
41140
41141 /* Maximum number of instructions in a window. */
41142 #define MAX_INSN 4
41143
41144 /* Maximum number of immediate operands in a window. */
41145 #define MAX_IMM 4
41146
41147 /* Maximum number of immediate bits allowed in a window. */
41148 #define MAX_IMM_SIZE 128
41149
41150 /* Maximum number of 32 bit immediates allowed in a window. */
41151 #define MAX_IMM_32 4
41152
41153 /* Maximum number of 64 bit immediates allowed in a window. */
41154 #define MAX_IMM_64 2
41155
41156 /* Maximum total of loads or prefetches allowed in a window. */
41157 #define MAX_LOAD 2
41158
41159 /* Maximum total of stores allowed in a window. */
41160 #define MAX_STORE 1
41161
41162 #undef BIG
41163 #define BIG 100
41164
41165
41166 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41167 enum dispatch_group {
41168 disp_no_group = 0,
41169 disp_load,
41170 disp_store,
41171 disp_load_store,
41172 disp_prefetch,
41173 disp_imm,
41174 disp_imm_32,
41175 disp_imm_64,
41176 disp_branch,
41177 disp_cmp,
41178 disp_jcc,
41179 disp_last
41180 };
41181
41182 /* Number of allowable groups in a dispatch window. It is an array
41183 indexed by dispatch_group enum. 100 is used as a big number,
41184 because the number of these kinds of operations does not have any
41185 effect on the dispatch window, but we need them for other reasons in
41186 the table. */
41187 static unsigned int num_allowable_groups[disp_last] = {
41188 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41189 };
41190
41191 char group_name[disp_last + 1][16] = {
41192 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41193 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41194 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41195 };
41196
41197 /* Instruction path. */
41198 enum insn_path {
41199 no_path = 0,
41200 path_single, /* Single micro op. */
41201 path_double, /* Double micro op. */
41202 path_multi, /* Instructions with more than 2 micro ops. */
41203 last_path
41204 };
41205
41206 /* sched_insn_info defines a window to the instructions scheduled in
41207 the basic block. It contains a pointer to the insn_info table and
41208 the instruction scheduled.
41209
41210 Windows are allocated for each basic block and are linked
41211 together. */
41212 typedef struct sched_insn_info_s {
41213 rtx insn;
41214 enum dispatch_group group;
41215 enum insn_path path;
41216 int byte_len;
41217 int imm_bytes;
41218 } sched_insn_info;
41219
41220 /* Linked list of dispatch windows. This is a two way list of
41221 dispatch windows of a basic block. It contains information about
41222 the number of uops in the window and the total number of
41223 instructions and of bytes in the object code for this dispatch
41224 window. */
41225 typedef struct dispatch_windows_s {
41226 int num_insn; /* Number of insn in the window. */
41227 int num_uops; /* Number of uops in the window. */
41228 int window_size; /* Number of bytes in the window. */
41229 int window_num; /* Window number, either 0 or 1. */
41230 int num_imm; /* Number of immediates in an insn. */
41231 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41232 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41233 int imm_size; /* Total immediates in the window. */
41234 int num_loads; /* Total memory loads in the window. */
41235 int num_stores; /* Total memory stores in the window. */
41236 int violation; /* Violation exists in window. */
41237 sched_insn_info *window; /* Pointer to the window. */
41238 struct dispatch_windows_s *next;
41239 struct dispatch_windows_s *prev;
41240 } dispatch_windows;
41241
41242 /* Immediate values used in an insn. */
41243 typedef struct imm_info_s
41244 {
41245 int imm;
41246 int imm32;
41247 int imm64;
41248 } imm_info;
41249
41250 static dispatch_windows *dispatch_window_list;
41251 static dispatch_windows *dispatch_window_list1;
41252
41253 /* Get dispatch group of insn. */
41254
41255 static enum dispatch_group
41256 get_mem_group (rtx insn)
41257 {
41258 enum attr_memory memory;
41259
41260 if (INSN_CODE (insn) < 0)
41261 return disp_no_group;
41262 memory = get_attr_memory (insn);
41263 if (memory == MEMORY_STORE)
41264 return disp_store;
41265
41266 if (memory == MEMORY_LOAD)
41267 return disp_load;
41268
41269 if (memory == MEMORY_BOTH)
41270 return disp_load_store;
41271
41272 return disp_no_group;
41273 }
41274
41275 /* Return true if insn is a compare instruction. */
41276
41277 static bool
41278 is_cmp (rtx insn)
41279 {
41280 enum attr_type type;
41281
41282 type = get_attr_type (insn);
41283 return (type == TYPE_TEST
41284 || type == TYPE_ICMP
41285 || type == TYPE_FCMP
41286 || GET_CODE (PATTERN (insn)) == COMPARE);
41287 }
41288
41289 /* Return true if a dispatch violation is encountered. */
41290
41291 static bool
41292 dispatch_violation (void)
41293 {
41294 if (dispatch_window_list->next)
41295 return dispatch_window_list->next->violation;
41296 return dispatch_window_list->violation;
41297 }
41298
41299 /* Return true if insn is a branch instruction. */
41300
41301 static bool
41302 is_branch (rtx insn)
41303 {
41304 return (CALL_P (insn) || JUMP_P (insn));
41305 }
41306
41307 /* Return true if insn is a prefetch instruction. */
41308
41309 static bool
41310 is_prefetch (rtx insn)
41311 {
41312 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41313 }
41314
41315 /* This function initializes a dispatch window and the list container holding a
41316 pointer to the window. */
41317
41318 static void
41319 init_window (int window_num)
41320 {
41321 int i;
41322 dispatch_windows *new_list;
41323
41324 if (window_num == 0)
41325 new_list = dispatch_window_list;
41326 else
41327 new_list = dispatch_window_list1;
41328
41329 new_list->num_insn = 0;
41330 new_list->num_uops = 0;
41331 new_list->window_size = 0;
41332 new_list->next = NULL;
41333 new_list->prev = NULL;
41334 new_list->window_num = window_num;
41335 new_list->num_imm = 0;
41336 new_list->num_imm_32 = 0;
41337 new_list->num_imm_64 = 0;
41338 new_list->imm_size = 0;
41339 new_list->num_loads = 0;
41340 new_list->num_stores = 0;
41341 new_list->violation = false;
41342
41343 for (i = 0; i < MAX_INSN; i++)
41344 {
41345 new_list->window[i].insn = NULL;
41346 new_list->window[i].group = disp_no_group;
41347 new_list->window[i].path = no_path;
41348 new_list->window[i].byte_len = 0;
41349 new_list->window[i].imm_bytes = 0;
41350 }
41351 return;
41352 }
41353
41354 /* This function allocates and initializes a dispatch window and the
41355 list container holding a pointer to the window. */
41356
41357 static dispatch_windows *
41358 allocate_window (void)
41359 {
41360 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41361 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41362
41363 return new_list;
41364 }
41365
41366 /* This routine initializes the dispatch scheduling information. It
41367 initiates building dispatch scheduler tables and constructs the
41368 first dispatch window. */
41369
41370 static void
41371 init_dispatch_sched (void)
41372 {
41373 /* Allocate a dispatch list and a window. */
41374 dispatch_window_list = allocate_window ();
41375 dispatch_window_list1 = allocate_window ();
41376 init_window (0);
41377 init_window (1);
41378 }
41379
41380 /* This function returns true if a branch is detected. End of a basic block
41381 does not have to be a branch, but here we assume only branches end a
41382 window. */
41383
41384 static bool
41385 is_end_basic_block (enum dispatch_group group)
41386 {
41387 return group == disp_branch;
41388 }
41389
41390 /* This function is called when the end of a window processing is reached. */
41391
41392 static void
41393 process_end_window (void)
41394 {
41395 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41396 if (dispatch_window_list->next)
41397 {
41398 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41399 gcc_assert (dispatch_window_list->window_size
41400 + dispatch_window_list1->window_size <= 48);
41401 init_window (1);
41402 }
41403 init_window (0);
41404 }
41405
41406 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41407 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41408 for 48 bytes of instructions. Note that these windows are not dispatch
41409 windows of size DISPATCH_WINDOW_SIZE. */
41410
41411 static dispatch_windows *
41412 allocate_next_window (int window_num)
41413 {
41414 if (window_num == 0)
41415 {
41416 if (dispatch_window_list->next)
41417 init_window (1);
41418 init_window (0);
41419 return dispatch_window_list;
41420 }
41421
41422 dispatch_window_list->next = dispatch_window_list1;
41423 dispatch_window_list1->prev = dispatch_window_list;
41424
41425 return dispatch_window_list1;
41426 }
41427
41428 /* Increment the number of immediate operands of an instruction. */
41429
41430 static int
41431 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41432 {
41433 if (*in_rtx == 0)
41434 return 0;
41435
41436 switch (GET_CODE (*in_rtx))
41437 {
41438 case CONST:
41439 case SYMBOL_REF:
41440 case CONST_INT:
41441 (imm_values->imm)++;
41442 if (x86_64_immediate_operand (*in_rtx, SImode))
41443 (imm_values->imm32)++;
41444 else
41445 (imm_values->imm64)++;
41446 break;
41447
41448 case CONST_DOUBLE:
41449 (imm_values->imm)++;
41450 (imm_values->imm64)++;
41451 break;
41452
41453 case CODE_LABEL:
41454 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41455 {
41456 (imm_values->imm)++;
41457 (imm_values->imm32)++;
41458 }
41459 break;
41460
41461 default:
41462 break;
41463 }
41464
41465 return 0;
41466 }
41467
41468 /* Compute number of immediate operands of an instruction. */
41469
41470 static void
41471 find_constant (rtx in_rtx, imm_info *imm_values)
41472 {
41473 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41474 (rtx_function) find_constant_1, (void *) imm_values);
41475 }
41476
41477 /* Return total size of immediate operands of an instruction along with number
41478 of corresponding immediate-operands. It initializes its parameters to zero
41479 before calling FIND_CONSTANT.
41480 INSN is the input instruction. IMM is the total of immediates.
41481 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41482 bit immediates. */
41483
41484 static int
41485 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41486 {
41487 imm_info imm_values = {0, 0, 0};
41488
41489 find_constant (insn, &imm_values);
41490 *imm = imm_values.imm;
41491 *imm32 = imm_values.imm32;
41492 *imm64 = imm_values.imm64;
41493 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41494 }
41495
41496 /* This function indicates whether INSN has at least one immediate
41497 operand. */
41498
41499 static bool
41500 has_immediate (rtx insn)
41501 {
41502 int num_imm_operand;
41503 int num_imm32_operand;
41504 int num_imm64_operand;
41505
41506 if (insn)
41507 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41508 &num_imm64_operand);
41509 return false;
41510 }
41511
41512 /* Return single or double path for instructions. */
41513
41514 static enum insn_path
41515 get_insn_path (rtx insn)
41516 {
41517 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41518
41519 if ((int)path == 0)
41520 return path_single;
41521
41522 if ((int)path == 1)
41523 return path_double;
41524
41525 return path_multi;
41526 }
41527
41528 /* Return insn dispatch group. */
41529
41530 static enum dispatch_group
41531 get_insn_group (rtx insn)
41532 {
41533 enum dispatch_group group = get_mem_group (insn);
41534 if (group)
41535 return group;
41536
41537 if (is_branch (insn))
41538 return disp_branch;
41539
41540 if (is_cmp (insn))
41541 return disp_cmp;
41542
41543 if (has_immediate (insn))
41544 return disp_imm;
41545
41546 if (is_prefetch (insn))
41547 return disp_prefetch;
41548
41549 return disp_no_group;
41550 }
41551
41552 /* Count number of GROUP restricted instructions in a dispatch
41553 window WINDOW_LIST. */
41554
41555 static int
41556 count_num_restricted (rtx insn, dispatch_windows *window_list)
41557 {
41558 enum dispatch_group group = get_insn_group (insn);
41559 int imm_size;
41560 int num_imm_operand;
41561 int num_imm32_operand;
41562 int num_imm64_operand;
41563
41564 if (group == disp_no_group)
41565 return 0;
41566
41567 if (group == disp_imm)
41568 {
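 /* In words: the immediates of INSN must still fit the per-window budget:
    at most MAX_IMM immediate operands and MAX_IMM_SIZE bits in total, at
    most MAX_IMM_32 32-bit and MAX_IMM_64 64-bit immediates (a 64-bit
    immediate also consumes two 32-bit slots), plus a special case when a
    64-bit immediate would exactly exhaust the size budget.  */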
41569 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41570 &num_imm64_operand);
41571 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41572 || num_imm_operand + window_list->num_imm > MAX_IMM
41573 || (num_imm32_operand > 0
41574 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41575 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41576 || (num_imm64_operand > 0
41577 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41578 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41579 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41580 && num_imm64_operand > 0
41581 && ((window_list->num_imm_64 > 0
41582 && window_list->num_insn >= 2)
41583 || window_list->num_insn >= 3)))
41584 return BIG;
41585
41586 return 1;
41587 }
41588
41589 if ((group == disp_load_store
41590 && (window_list->num_loads >= MAX_LOAD
41591 || window_list->num_stores >= MAX_STORE))
41592 || ((group == disp_load
41593 || group == disp_prefetch)
41594 && window_list->num_loads >= MAX_LOAD)
41595 || (group == disp_store
41596 && window_list->num_stores >= MAX_STORE))
41597 return BIG;
41598
41599 return 1;
41600 }
41601
41602 /* This function returns true if insn satisfies dispatch rules on the
41603 last window scheduled. */
41604
41605 static bool
41606 fits_dispatch_window (rtx insn)
41607 {
41608 dispatch_windows *window_list = dispatch_window_list;
41609 dispatch_windows *window_list_next = dispatch_window_list->next;
41610 unsigned int num_restrict;
41611 enum dispatch_group group = get_insn_group (insn);
41612 enum insn_path path = get_insn_path (insn);
41613 int sum;
41614
41615 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41616 instructions should be given the lowest priority in the
41617 scheduling process in Haifa scheduler to make sure they will be
41618 scheduled in the same dispatch window as the reference to them. */
41619 if (group == disp_jcc || group == disp_cmp)
41620 return false;
41621
41622 /* Check nonrestricted. */
41623 if (group == disp_no_group || group == disp_branch)
41624 return true;
41625
41626 /* Get last dispatch window. */
41627 if (window_list_next)
41628 window_list = window_list_next;
41629
41630 if (window_list->window_num == 1)
41631 {
41632 sum = window_list->prev->window_size + window_list->window_size;
41633
41634 if (sum == 32
41635 || (min_insn_size (insn) + sum) >= 48)
41636 /* Window 1 is full. Go for next window. */
41637 return true;
41638 }
41639
41640 num_restrict = count_num_restricted (insn, window_list);
41641
41642 if (num_restrict > num_allowable_groups[group])
41643 return false;
41644
41645 /* See if it fits in the first window. */
41646 if (window_list->window_num == 0)
41647 {
41648 /* The first window should have only single and double path
41649 uops. */
41650 if (path == path_double
41651 && (window_list->num_uops + 2) > MAX_INSN)
41652 return false;
41653 else if (path != path_single)
41654 return false;
41655 }
41656 return true;
41657 }
41658
41659 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41660 dispatch window WINDOW_LIST. */
41661
41662 static void
41663 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41664 {
41665 int byte_len = min_insn_size (insn);
41666 int num_insn = window_list->num_insn;
41667 int imm_size;
41668 sched_insn_info *window = window_list->window;
41669 enum dispatch_group group = get_insn_group (insn);
41670 enum insn_path path = get_insn_path (insn);
41671 int num_imm_operand;
41672 int num_imm32_operand;
41673 int num_imm64_operand;
41674
41675 if (!window_list->violation && group != disp_cmp
41676 && !fits_dispatch_window (insn))
41677 window_list->violation = true;
41678
41679 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41680 &num_imm64_operand);
41681
41682 /* Initialize window with new instruction. */
41683 window[num_insn].insn = insn;
41684 window[num_insn].byte_len = byte_len;
41685 window[num_insn].group = group;
41686 window[num_insn].path = path;
41687 window[num_insn].imm_bytes = imm_size;
41688
41689 window_list->window_size += byte_len;
41690 window_list->num_insn = num_insn + 1;
41691 window_list->num_uops = window_list->num_uops + num_uops;
41692 window_list->imm_size += imm_size;
41693 window_list->num_imm += num_imm_operand;
41694 window_list->num_imm_32 += num_imm32_operand;
41695 window_list->num_imm_64 += num_imm64_operand;
41696
41697 if (group == disp_store)
41698 window_list->num_stores += 1;
41699 else if (group == disp_load
41700 || group == disp_prefetch)
41701 window_list->num_loads += 1;
41702 else if (group == disp_load_store)
41703 {
41704 window_list->num_stores += 1;
41705 window_list->num_loads += 1;
41706 }
41707 }
41708
41709 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41710 If the total bytes of instructions or the number of instructions in
41711 the window exceeds the allowed maximum, it allocates a new window.
41712
41713 static void
41714 add_to_dispatch_window (rtx insn)
41715 {
41716 int byte_len;
41717 dispatch_windows *window_list;
41718 dispatch_windows *next_list;
41719 dispatch_windows *window0_list;
41720 enum insn_path path;
41721 enum dispatch_group insn_group;
41722 bool insn_fits;
41723 int num_insn;
41724 int num_uops;
41725 int window_num;
41726 int insn_num_uops;
41727 int sum;
41728
41729 if (INSN_CODE (insn) < 0)
41730 return;
41731
41732 byte_len = min_insn_size (insn);
41733 window_list = dispatch_window_list;
41734 next_list = window_list->next;
41735 path = get_insn_path (insn);
41736 insn_group = get_insn_group (insn);
41737
41738 /* Get the last dispatch window. */
41739 if (next_list)
41740 window_list = dispatch_window_list->next;
41741
41742 if (path == path_single)
41743 insn_num_uops = 1;
41744 else if (path == path_double)
41745 insn_num_uops = 2;
41746 else
41747 insn_num_uops = (int) path;
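 /* For path_multi the enumerator value (3) is used directly as the uop
    count for instructions with more than two uops.  */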
41748
41749 /* If current window is full, get a new window.
41750 Window number zero is full if MAX_INSN uops are scheduled in it.
41751 Window number one is full if window zero's bytes plus window
41752 one's bytes equal 32, or if adding the bytes of the new instruction
41753 to the total makes it greater than 48, or if it already has MAX_INSN
41754 instructions in it. */
41755 num_insn = window_list->num_insn;
41756 num_uops = window_list->num_uops;
41757 window_num = window_list->window_num;
41758 insn_fits = fits_dispatch_window (insn);
41759
41760 if (num_insn >= MAX_INSN
41761 || num_uops + insn_num_uops > MAX_INSN
41762 || !(insn_fits))
41763 {
41764 window_num = ~window_num & 1;
41765 window_list = allocate_next_window (window_num);
41766 }
41767
41768 if (window_num == 0)
41769 {
41770 add_insn_window (insn, window_list, insn_num_uops);
41771 if (window_list->num_insn >= MAX_INSN
41772 && insn_group == disp_branch)
41773 {
41774 process_end_window ();
41775 return;
41776 }
41777 }
41778 else if (window_num == 1)
41779 {
41780 window0_list = window_list->prev;
41781 sum = window0_list->window_size + window_list->window_size;
41782 if (sum == 32
41783 || (byte_len + sum) >= 48)
41784 {
41785 process_end_window ();
41786 window_list = dispatch_window_list;
41787 }
41788
41789 add_insn_window (insn, window_list, insn_num_uops);
41790 }
41791 else
41792 gcc_unreachable ();
41793
41794 if (is_end_basic_block (insn_group))
41795 {
41796 /* End of basic block is reached; do end-basic-block processing. */
41797 process_end_window ();
41798 return;
41799 }
41800 }
41801
41802 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41803
41804 DEBUG_FUNCTION static void
41805 debug_dispatch_window_file (FILE *file, int window_num)
41806 {
41807 dispatch_windows *list;
41808 int i;
41809
41810 if (window_num == 0)
41811 list = dispatch_window_list;
41812 else
41813 list = dispatch_window_list1;
41814
41815 fprintf (file, "Window #%d:\n", list->window_num);
41816 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41817 list->num_insn, list->num_uops, list->window_size);
41818 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41819 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41820
41821 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41822 list->num_stores);
41823 fprintf (file, " insn info:\n");
41824
41825 for (i = 0; i < MAX_INSN; i++)
41826 {
41827 if (!list->window[i].insn)
41828 break;
41829 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41830 i, group_name[list->window[i].group],
41831 i, (void *)list->window[i].insn,
41832 i, list->window[i].path,
41833 i, list->window[i].byte_len,
41834 i, list->window[i].imm_bytes);
41835 }
41836 }
41837
41838 /* Print to stdout a dispatch window. */
41839
41840 DEBUG_FUNCTION void
41841 debug_dispatch_window (int window_num)
41842 {
41843 debug_dispatch_window_file (stdout, window_num);
41844 }
41845
41846 /* Print INSN dispatch information to FILE. */
41847
41848 DEBUG_FUNCTION static void
41849 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41850 {
41851 int byte_len;
41852 enum insn_path path;
41853 enum dispatch_group group;
41854 int imm_size;
41855 int num_imm_operand;
41856 int num_imm32_operand;
41857 int num_imm64_operand;
41858
41859 if (INSN_CODE (insn) < 0)
41860 return;
41861
41862 byte_len = min_insn_size (insn);
41863 path = get_insn_path (insn);
41864 group = get_insn_group (insn);
41865 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41866 &num_imm64_operand);
41867
41868 fprintf (file, " insn info:\n");
41869 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41870 group_name[group], path, byte_len);
41871 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41872 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41873 }
41874
41875 /* Print to STDOUT the status of the ready list with respect to
41876 dispatch windows. */
41877
41878 DEBUG_FUNCTION void
41879 debug_ready_dispatch (void)
41880 {
41881 int i;
41882 int no_ready = number_in_ready ();
41883
41884 fprintf (stdout, "Number of ready: %d\n", no_ready);
41885
41886 for (i = 0; i < no_ready; i++)
41887 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41888 }
41889
41890 /* This routine is the driver of the dispatch scheduler. */
41891
41892 static void
41893 do_dispatch (rtx insn, int mode)
41894 {
41895 if (mode == DISPATCH_INIT)
41896 init_dispatch_sched ();
41897 else if (mode == ADD_TO_DISPATCH_WINDOW)
41898 add_to_dispatch_window (insn);
41899 }
41900
41901 /* Return TRUE if Dispatch Scheduling is supported. */
41902
41903 static bool
41904 has_dispatch (rtx insn, int action)
41905 {
41906 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41907 && flag_dispatch_scheduler)
41908 switch (action)
41909 {
41910 default:
41911 return false;
41912
41913 case IS_DISPATCH_ON:
41914 return true;
41915 break;
41916
41917 case IS_CMP:
41918 return is_cmp (insn);
41919
41920 case DISPATCH_VIOLATION:
41921 return dispatch_violation ();
41922
41923 case FITS_DISPATCH_WINDOW:
41924 return fits_dispatch_window (insn);
41925 }
41926
41927 return false;
41928 }
41929
41930 /* Implementation of reassociation_width target hook used by
41931 reassoc phase to identify parallelism level in reassociated
41932 tree. Statements tree_code is passed in OPC. Arguments type
41933 is passed in MODE.
41934
41935 Currently parallel reassociation is enabled for Atom
41936 processors only and we set reassociation width to be 2
41937 because Atom may issue up to 2 instructions per cycle.
41938
41939 Return value should be fixed if parallel reassociation is
41940 enabled for other processors. */
41941
41942 static int
41943 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
41944 enum machine_mode mode)
41945 {
41946 int res = 1;
41947
41948 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
41949 res = 2;
41950 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41951 res = 2;
41952
41953 return res;
41954 }
41955
41956 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41957 place emms and femms instructions. */
41958
41959 static enum machine_mode
41960 ix86_preferred_simd_mode (enum machine_mode mode)
41961 {
41962 if (!TARGET_SSE)
41963 return word_mode;
41964
41965 switch (mode)
41966 {
41967 case QImode:
41968 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41969 case HImode:
41970 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41971 case SImode:
41972 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41973 case DImode:
41974 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41975
41976 case SFmode:
41977 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41978 return V8SFmode;
41979 else
41980 return V4SFmode;
41981
41982 case DFmode:
41983 if (!TARGET_VECTORIZE_DOUBLE)
41984 return word_mode;
41985 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41986 return V4DFmode;
41987 else if (TARGET_SSE2)
41988 return V2DFmode;
41989 /* FALLTHRU */
41990
41991 default:
41992 return word_mode;
41993 }
41994 }
41995
41996 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
41997 vectors. */
41998
41999 static unsigned int
42000 ix86_autovectorize_vector_sizes (void)
42001 {
42002 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42003 }
42004
42005 \f
42006
42007 /* Return class of registers which could be used for pseudo of MODE
42008 and of class RCLASS for spilling instead of memory. Return NO_REGS
42009 if it is not possible or not profitable. */
42010 static reg_class_t
42011 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42012 {
42013 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42014 && hard_reg_set_subset_p (reg_class_contents[rclass],
42015 reg_class_contents[GENERAL_REGS])
42016 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
42017 return SSE_REGS;
42018 return NO_REGS;
42019 }
42020
42021 /* Implement targetm.vectorize.init_cost. */
42022
42023 static void *
42024 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42025 {
42026 unsigned *cost = XNEWVEC (unsigned, 3);
42027 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42028 return cost;
42029 }
42030
42031 /* Implement targetm.vectorize.add_stmt_cost. */
42032
42033 static unsigned
42034 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42035 struct _stmt_vec_info *stmt_info, int misalign,
42036 enum vect_cost_model_location where)
42037 {
42038 unsigned *cost = (unsigned *) data;
42039 unsigned retval = 0;
42040
42041 if (flag_vect_cost_model)
42042 {
42043 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42044 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42045
42046 /* Statements in an inner loop relative to the loop being
42047 vectorized are weighted more heavily. The value here is
42048 arbitrary and could potentially be improved with analysis. */
42049 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42050 count *= 50; /* FIXME. */
42051
42052 retval = (unsigned) (count * stmt_cost);
42053 cost[where] += retval;
42054 }
42055
42056 return retval;
42057 }
42058
42059 /* Implement targetm.vectorize.finish_cost. */
42060
42061 static void
42062 ix86_finish_cost (void *data, unsigned *prologue_cost,
42063 unsigned *body_cost, unsigned *epilogue_cost)
42064 {
42065 unsigned *cost = (unsigned *) data;
42066 *prologue_cost = cost[vect_prologue];
42067 *body_cost = cost[vect_body];
42068 *epilogue_cost = cost[vect_epilogue];
42069 }
42070
42071 /* Implement targetm.vectorize.destroy_cost_data. */
42072
42073 static void
42074 ix86_destroy_cost_data (void *data)
42075 {
42076 free (data);
42077 }
42078
42079 /* Validate target specific memory model bits in VAL. */
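 /* For example (assuming the usual HLE predefines are available), a caller
    may combine bits like
      __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
    and this hook checks that an HLE bit is only paired with a compatible
    C++11 memory model.  */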
42080
42081 static unsigned HOST_WIDE_INT
42082 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42083 {
42084 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42085 bool strong;
42086
42087 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42088 |MEMMODEL_MASK)
42089 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42090 {
42091 warning (OPT_Winvalid_memory_model,
42092 "Unknown architecture specific memory model");
42093 return MEMMODEL_SEQ_CST;
42094 }
42095 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42096 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42097 {
42098 warning (OPT_Winvalid_memory_model,
42099 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42100 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42101 }
42102 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42103 {
42104 warning (OPT_Winvalid_memory_model,
42105 "HLE_RELEASE not used with RELEASE or stronger memory model");
42106 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42107 }
42108 return val;
42109 }
42110
42111 /* Initialize the GCC target structure. */
42112 #undef TARGET_RETURN_IN_MEMORY
42113 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42114
42115 #undef TARGET_LEGITIMIZE_ADDRESS
42116 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42117
42118 #undef TARGET_ATTRIBUTE_TABLE
42119 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42120 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42121 # undef TARGET_MERGE_DECL_ATTRIBUTES
42122 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42123 #endif
42124
42125 #undef TARGET_COMP_TYPE_ATTRIBUTES
42126 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42127
42128 #undef TARGET_INIT_BUILTINS
42129 #define TARGET_INIT_BUILTINS ix86_init_builtins
42130 #undef TARGET_BUILTIN_DECL
42131 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42132 #undef TARGET_EXPAND_BUILTIN
42133 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42134
42135 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42136 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42137 ix86_builtin_vectorized_function
42138
42139 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42140 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42141
42142 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42143 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42144
42145 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42146 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42147
42148 #undef TARGET_BUILTIN_RECIPROCAL
42149 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42150
42151 #undef TARGET_ASM_FUNCTION_EPILOGUE
42152 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42153
42154 #undef TARGET_ENCODE_SECTION_INFO
42155 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42156 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42157 #else
42158 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42159 #endif
42160
42161 #undef TARGET_ASM_OPEN_PAREN
42162 #define TARGET_ASM_OPEN_PAREN ""
42163 #undef TARGET_ASM_CLOSE_PAREN
42164 #define TARGET_ASM_CLOSE_PAREN ""
42165
42166 #undef TARGET_ASM_BYTE_OP
42167 #define TARGET_ASM_BYTE_OP ASM_BYTE
42168
42169 #undef TARGET_ASM_ALIGNED_HI_OP
42170 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42171 #undef TARGET_ASM_ALIGNED_SI_OP
42172 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42173 #ifdef ASM_QUAD
42174 #undef TARGET_ASM_ALIGNED_DI_OP
42175 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42176 #endif
42177
42178 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42179 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42180
42181 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42182 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42183
42184 #undef TARGET_ASM_UNALIGNED_HI_OP
42185 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42186 #undef TARGET_ASM_UNALIGNED_SI_OP
42187 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42188 #undef TARGET_ASM_UNALIGNED_DI_OP
42189 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42190
42191 #undef TARGET_PRINT_OPERAND
42192 #define TARGET_PRINT_OPERAND ix86_print_operand
42193 #undef TARGET_PRINT_OPERAND_ADDRESS
42194 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42195 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42196 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42197 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42198 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42199
42200 #undef TARGET_SCHED_INIT_GLOBAL
42201 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42202 #undef TARGET_SCHED_ADJUST_COST
42203 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42204 #undef TARGET_SCHED_ISSUE_RATE
42205 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42206 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42207 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42208 ia32_multipass_dfa_lookahead
42209
42210 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42211 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42212
42213 #undef TARGET_MEMMODEL_CHECK
42214 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42215
42216 #ifdef HAVE_AS_TLS
42217 #undef TARGET_HAVE_TLS
42218 #define TARGET_HAVE_TLS true
42219 #endif
42220 #undef TARGET_CANNOT_FORCE_CONST_MEM
42221 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42222 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42223 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42224
42225 #undef TARGET_DELEGITIMIZE_ADDRESS
42226 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42227
42228 #undef TARGET_MS_BITFIELD_LAYOUT_P
42229 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42230
42231 #if TARGET_MACHO
42232 #undef TARGET_BINDS_LOCAL_P
42233 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42234 #endif
42235 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42236 #undef TARGET_BINDS_LOCAL_P
42237 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42238 #endif
42239
42240 #undef TARGET_ASM_OUTPUT_MI_THUNK
42241 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42242 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42243 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42244
42245 #undef TARGET_ASM_FILE_START
42246 #define TARGET_ASM_FILE_START x86_file_start
42247
42248 #undef TARGET_OPTION_OVERRIDE
42249 #define TARGET_OPTION_OVERRIDE ix86_option_override
42250
42251 #undef TARGET_REGISTER_MOVE_COST
42252 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42253 #undef TARGET_MEMORY_MOVE_COST
42254 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42255 #undef TARGET_RTX_COSTS
42256 #define TARGET_RTX_COSTS ix86_rtx_costs
42257 #undef TARGET_ADDRESS_COST
42258 #define TARGET_ADDRESS_COST ix86_address_cost
42259
42260 #undef TARGET_FIXED_CONDITION_CODE_REGS
42261 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42262 #undef TARGET_CC_MODES_COMPATIBLE
42263 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42264
42265 #undef TARGET_MACHINE_DEPENDENT_REORG
42266 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42267
42268 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42269 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42270
42271 #undef TARGET_BUILD_BUILTIN_VA_LIST
42272 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42273
42274 #undef TARGET_FOLD_BUILTIN
42275 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42276
42277 #undef TARGET_COMPARE_VERSION_PRIORITY
42278 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42279
42280 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42281 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42282 ix86_generate_version_dispatcher_body
42283
42284 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42285 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42286 ix86_get_function_versions_dispatcher
42287
42288 #undef TARGET_ENUM_VA_LIST_P
42289 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42290
42291 #undef TARGET_FN_ABI_VA_LIST
42292 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42293
42294 #undef TARGET_CANONICAL_VA_LIST_TYPE
42295 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42296
42297 #undef TARGET_EXPAND_BUILTIN_VA_START
42298 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42299
42300 #undef TARGET_MD_ASM_CLOBBERS
42301 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42302
42303 #undef TARGET_PROMOTE_PROTOTYPES
42304 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42305 #undef TARGET_STRUCT_VALUE_RTX
42306 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42307 #undef TARGET_SETUP_INCOMING_VARARGS
42308 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42309 #undef TARGET_MUST_PASS_IN_STACK
42310 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42311 #undef TARGET_FUNCTION_ARG_ADVANCE
42312 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42313 #undef TARGET_FUNCTION_ARG
42314 #define TARGET_FUNCTION_ARG ix86_function_arg
42315 #undef TARGET_FUNCTION_ARG_BOUNDARY
42316 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42317 #undef TARGET_PASS_BY_REFERENCE
42318 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42319 #undef TARGET_INTERNAL_ARG_POINTER
42320 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42321 #undef TARGET_UPDATE_STACK_BOUNDARY
42322 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42323 #undef TARGET_GET_DRAP_RTX
42324 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42325 #undef TARGET_STRICT_ARGUMENT_NAMING
42326 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42327 #undef TARGET_STATIC_CHAIN
42328 #define TARGET_STATIC_CHAIN ix86_static_chain
42329 #undef TARGET_TRAMPOLINE_INIT
42330 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42331 #undef TARGET_RETURN_POPS_ARGS
42332 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42333
42334 #undef TARGET_LEGITIMATE_COMBINED_INSN
42335 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42336
42337 #undef TARGET_ASAN_SHADOW_OFFSET
42338 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42339
42340 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42341 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42342
42343 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42344 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42345
42346 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42347 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42348
42349 #undef TARGET_C_MODE_FOR_SUFFIX
42350 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42351
42352 #ifdef HAVE_AS_TLS
42353 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42354 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42355 #endif
42356
42357 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42358 #undef TARGET_INSERT_ATTRIBUTES
42359 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42360 #endif
42361
42362 #undef TARGET_MANGLE_TYPE
42363 #define TARGET_MANGLE_TYPE ix86_mangle_type
42364
42365 #if !TARGET_MACHO
42366 #undef TARGET_STACK_PROTECT_FAIL
42367 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42368 #endif
42369
42370 #undef TARGET_FUNCTION_VALUE
42371 #define TARGET_FUNCTION_VALUE ix86_function_value
42372
42373 #undef TARGET_FUNCTION_VALUE_REGNO_P
42374 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42375
42376 #undef TARGET_PROMOTE_FUNCTION_MODE
42377 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42378
42379 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42380 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42381
42382 #undef TARGET_INSTANTIATE_DECLS
42383 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42384
42385 #undef TARGET_SECONDARY_RELOAD
42386 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42387
42388 #undef TARGET_CLASS_MAX_NREGS
42389 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42390
42391 #undef TARGET_PREFERRED_RELOAD_CLASS
42392 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42393 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42394 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42395 #undef TARGET_CLASS_LIKELY_SPILLED_P
42396 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42397
42398 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42399 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42400 ix86_builtin_vectorization_cost
42401 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42402 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42403 ix86_vectorize_vec_perm_const_ok
42404 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42405 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42406 ix86_preferred_simd_mode
42407 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42408 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42409 ix86_autovectorize_vector_sizes
42410 #undef TARGET_VECTORIZE_INIT_COST
42411 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42412 #undef TARGET_VECTORIZE_ADD_STMT_COST
42413 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42414 #undef TARGET_VECTORIZE_FINISH_COST
42415 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42416 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42417 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42418
42419 #undef TARGET_SET_CURRENT_FUNCTION
42420 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42421
42422 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42423 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42424
42425 #undef TARGET_OPTION_SAVE
42426 #define TARGET_OPTION_SAVE ix86_function_specific_save
42427
42428 #undef TARGET_OPTION_RESTORE
42429 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42430
42431 #undef TARGET_OPTION_PRINT
42432 #define TARGET_OPTION_PRINT ix86_function_specific_print
42433
42434 #undef TARGET_OPTION_FUNCTION_VERSIONS
42435 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42436
42437 #undef TARGET_OPTION_SUPPORTS_FUNCTION_VERSIONS
42438 #define TARGET_OPTION_SUPPORTS_FUNCTION_VERSIONS \
42439 ix86_supports_function_versions
42440
42441 #undef TARGET_CAN_INLINE_P
42442 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42443
42444 #undef TARGET_EXPAND_TO_RTL_HOOK
42445 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42446
42447 #undef TARGET_LEGITIMATE_ADDRESS_P
42448 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42449
42450 #undef TARGET_LRA_P
42451 #define TARGET_LRA_P hook_bool_void_true
42452
42453 #undef TARGET_REGISTER_PRIORITY
42454 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42455
42456 #undef TARGET_LEGITIMATE_CONSTANT_P
42457 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42458
42459 #undef TARGET_FRAME_POINTER_REQUIRED
42460 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42461
42462 #undef TARGET_CAN_ELIMINATE
42463 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42464
42465 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42466 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42467
42468 #undef TARGET_ASM_CODE_END
42469 #define TARGET_ASM_CODE_END ix86_code_end
42470
42471 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42472 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42473
42474 #if TARGET_MACHO
42475 #undef TARGET_INIT_LIBFUNCS
42476 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42477 #endif
42478
42479 #undef TARGET_SPILL_CLASS
42480 #define TARGET_SPILL_CLASS ix86_spill_class
42481
42482 struct gcc_target targetm = TARGET_INITIALIZER;
42483 \f
42484 #include "gt-i386.h"