]> gcc.gnu.org Git - gcc.git/blob - gcc/config/i386/i386-expand.c
i386: Fix unsigned int -> double conversion on i386 w/ -mfpmath=sse [PR100119]
[gcc.git] / gcc / config / i386 / i386-expand.c
1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuse to split volatile memory addresses,
136 but we still have to handle it. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162 }
163
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
166
167 void
168 ix86_expand_clear (rtx dest)
169 {
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187 }
188
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 /* Avoid complex sets of likely spilled hard registers before reload. */
200 if (!ix86_hardreg_mov_ok (op0, op1))
201 {
202 tmp = gen_reg_rtx (mode);
203 operands[0] = tmp;
204 ix86_expand_move (mode, operands);
205 operands[0] = op0;
206 operands[1] = tmp;
207 op1 = tmp;
208 }
209
210 switch (GET_CODE (op1))
211 {
212 case CONST:
213 tmp = XEXP (op1, 0);
214
215 if (GET_CODE (tmp) != PLUS
216 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
217 break;
218
219 op1 = XEXP (tmp, 0);
220 addend = XEXP (tmp, 1);
221 /* FALLTHRU */
222
223 case SYMBOL_REF:
224 model = SYMBOL_REF_TLS_MODEL (op1);
225
226 if (model)
227 op1 = legitimize_tls_address (op1, model, true);
228 else if (ix86_force_load_from_GOT_p (op1))
229 {
230 /* Load the external function address via GOT slot to avoid PLT. */
231 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
232 (TARGET_64BIT
233 ? UNSPEC_GOTPCREL
234 : UNSPEC_GOT));
235 op1 = gen_rtx_CONST (Pmode, op1);
236 op1 = gen_const_mem (Pmode, op1);
237 set_mem_alias_set (op1, ix86_GOT_alias_set ());
238 }
239 else
240 {
241 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
242 if (tmp)
243 {
244 op1 = tmp;
245 if (!addend)
246 break;
247 }
248 else
249 {
250 op1 = operands[1];
251 break;
252 }
253 }
254
255 if (addend)
256 {
257 op1 = force_operand (op1, NULL_RTX);
258 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
259 op0, 1, OPTAB_DIRECT);
260 }
261 else
262 op1 = force_operand (op1, op0);
263
264 if (op1 == op0)
265 return;
266
267 op1 = convert_to_mode (mode, op1, 1);
268
269 default:
270 break;
271 }
272
273 if ((flag_pic || MACHOPIC_INDIRECT)
274 && symbolic_operand (op1, mode))
275 {
276 if (TARGET_MACHO && !TARGET_64BIT)
277 {
278 #if TARGET_MACHO
279 /* dynamic-no-pic */
280 if (MACHOPIC_INDIRECT)
281 {
282 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
283 ? op0 : gen_reg_rtx (Pmode);
284 op1 = machopic_indirect_data_reference (op1, temp);
285 if (MACHOPIC_PURE)
286 op1 = machopic_legitimize_pic_address (op1, mode,
287 temp == op1 ? 0 : temp);
288 }
289 if (op0 != op1 && GET_CODE (op0) != MEM)
290 {
291 rtx insn = gen_rtx_SET (op0, op1);
292 emit_insn (insn);
293 return;
294 }
295 if (GET_CODE (op0) == MEM)
296 op1 = force_reg (Pmode, op1);
297 else
298 {
299 rtx temp = op0;
300 if (GET_CODE (temp) != REG)
301 temp = gen_reg_rtx (Pmode);
302 temp = legitimize_pic_address (op1, temp);
303 if (temp == op0)
304 return;
305 op1 = temp;
306 }
307 /* dynamic-no-pic */
308 #endif
309 }
310 else
311 {
312 if (MEM_P (op0))
313 op1 = force_reg (mode, op1);
314 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
315 {
316 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
317 op1 = legitimize_pic_address (op1, reg);
318 if (op0 == op1)
319 return;
320 op1 = convert_to_mode (mode, op1, 1);
321 }
322 }
323 }
324 else
325 {
326 if (MEM_P (op0)
327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
328 || !push_operand (op0, mode))
329 && MEM_P (op1))
330 op1 = force_reg (mode, op1);
331
332 if (push_operand (op0, mode)
333 && ! general_no_elim_operand (op1, mode))
334 op1 = copy_to_mode_reg (mode, op1);
335
336 /* Force large constants in 64bit compilation into register
337 to get them CSEed. */
338 if (can_create_pseudo_p ()
339 && (mode == DImode) && TARGET_64BIT
340 && immediate_operand (op1, mode)
341 && !x86_64_zext_immediate_operand (op1, VOIDmode)
342 && !register_operand (op0, mode)
343 && optimize)
344 op1 = copy_to_mode_reg (mode, op1);
345
346 if (can_create_pseudo_p ()
347 && CONST_DOUBLE_P (op1))
348 {
349 /* If we are loading a floating point constant to a register,
350 force the value to memory now, since we'll get better code
351 out the back end. */
352
353 op1 = validize_mem (force_const_mem (mode, op1));
354 if (!register_operand (op0, mode))
355 {
356 rtx temp = gen_reg_rtx (mode);
357 emit_insn (gen_rtx_SET (temp, op1));
358 emit_move_insn (op0, temp);
359 return;
360 }
361 }
362 }
363
364 emit_insn (gen_rtx_SET (op0, op1));
365 }
366
367 void
368 ix86_expand_vector_move (machine_mode mode, rtx operands[])
369 {
370 rtx op0 = operands[0], op1 = operands[1];
371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
372 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
373 unsigned int align = (TARGET_IAMCU
374 ? GET_MODE_BITSIZE (mode)
375 : GET_MODE_ALIGNMENT (mode));
376
377 if (push_operand (op0, VOIDmode))
378 op0 = emit_move_resolve_push (mode, op0);
379
380 /* Force constants other than zero into memory. We do not know how
381 the instructions used to build constants modify the upper 64 bits
382 of the register, once we have that information we may be able
383 to handle some of them more efficiently. */
384 if (can_create_pseudo_p ()
385 && (CONSTANT_P (op1)
386 || (SUBREG_P (op1)
387 && CONSTANT_P (SUBREG_REG (op1))))
388 && ((register_operand (op0, mode)
389 && !standard_sse_constant_p (op1, mode))
390 /* ix86_expand_vector_move_misalign() does not like constants. */
391 || (SSE_REG_MODE_P (mode)
392 && MEM_P (op0)
393 && MEM_ALIGN (op0) < align)))
394 {
395 if (SUBREG_P (op1))
396 {
397 machine_mode imode = GET_MODE (SUBREG_REG (op1));
398 rtx r = force_const_mem (imode, SUBREG_REG (op1));
399 if (r)
400 r = validize_mem (r);
401 else
402 r = force_reg (imode, SUBREG_REG (op1));
403 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
404 }
405 else
406 op1 = validize_mem (force_const_mem (mode, op1));
407 }
408
409 /* We need to check memory alignment for SSE mode since attribute
410 can make operands unaligned. */
411 if (can_create_pseudo_p ()
412 && SSE_REG_MODE_P (mode)
413 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
414 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
415 {
416 rtx tmp[2];
417
418 /* ix86_expand_vector_move_misalign() does not like both
419 arguments in memory. */
420 if (!register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 op1 = force_reg (mode, op1);
423
424 tmp[0] = op0; tmp[1] = op1;
425 ix86_expand_vector_move_misalign (mode, tmp);
426 return;
427 }
428
429 /* Make operand1 a register if it isn't already. */
430 if (can_create_pseudo_p ()
431 && !register_operand (op0, mode)
432 && !register_operand (op1, mode))
433 {
434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
435 return;
436 }
437
438 emit_insn (gen_rtx_SET (op0, op1));
439 }
440
441 /* Split 32-byte AVX unaligned load and store if needed. */
442
443 static void
444 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
445 {
446 rtx m;
447 rtx (*extract) (rtx, rtx, rtx);
448 machine_mode mode;
449
450 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
451 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
452 {
453 emit_insn (gen_rtx_SET (op0, op1));
454 return;
455 }
456
457 rtx orig_op0 = NULL_RTX;
458 mode = GET_MODE (op0);
459 switch (GET_MODE_CLASS (mode))
460 {
461 case MODE_VECTOR_INT:
462 case MODE_INT:
463 if (mode != V32QImode)
464 {
465 if (!MEM_P (op0))
466 {
467 orig_op0 = op0;
468 op0 = gen_reg_rtx (V32QImode);
469 }
470 else
471 op0 = gen_lowpart (V32QImode, op0);
472 op1 = gen_lowpart (V32QImode, op1);
473 mode = V32QImode;
474 }
475 break;
476 case MODE_VECTOR_FLOAT:
477 break;
478 default:
479 gcc_unreachable ();
480 }
481
482 switch (mode)
483 {
484 default:
485 gcc_unreachable ();
486 case E_V32QImode:
487 extract = gen_avx_vextractf128v32qi;
488 mode = V16QImode;
489 break;
490 case E_V8SFmode:
491 extract = gen_avx_vextractf128v8sf;
492 mode = V4SFmode;
493 break;
494 case E_V4DFmode:
495 extract = gen_avx_vextractf128v4df;
496 mode = V2DFmode;
497 break;
498 }
499
500 if (MEM_P (op1))
501 {
502 rtx r = gen_reg_rtx (mode);
503 m = adjust_address (op1, mode, 0);
504 emit_move_insn (r, m);
505 m = adjust_address (op1, mode, 16);
506 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
507 emit_move_insn (op0, r);
508 }
509 else if (MEM_P (op0))
510 {
511 m = adjust_address (op0, mode, 0);
512 emit_insn (extract (m, op1, const0_rtx));
513 m = adjust_address (op0, mode, 16);
514 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
515 }
516 else
517 gcc_unreachable ();
518
519 if (orig_op0)
520 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
521 }
522
523 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
524 straight to ix86_expand_vector_move. */
525 /* Code generation for scalar reg-reg moves of single and double precision data:
526 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
527 movaps reg, reg
528 else
529 movss reg, reg
530 if (x86_sse_partial_reg_dependency == true)
531 movapd reg, reg
532 else
533 movsd reg, reg
534
535 Code generation for scalar loads of double precision data:
536 if (x86_sse_split_regs == true)
537 movlpd mem, reg (gas syntax)
538 else
539 movsd mem, reg
540
541 Code generation for unaligned packed loads of single precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
543 if (x86_sse_unaligned_move_optimal)
544 movups mem, reg
545
546 if (x86_sse_partial_reg_dependency == true)
547 {
548 xorps reg, reg
549 movlps mem, reg
550 movhps mem+8, reg
551 }
552 else
553 {
554 movlps mem, reg
555 movhps mem+8, reg
556 }
557
558 Code generation for unaligned packed loads of double precision data
559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
560 if (x86_sse_unaligned_move_optimal)
561 movupd mem, reg
562
563 if (x86_sse_split_regs == true)
564 {
565 movlpd mem, reg
566 movhpd mem+8, reg
567 }
568 else
569 {
570 movsd mem, reg
571 movhpd mem+8, reg
572 }
573 */
574
575 void
576 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
577 {
578 rtx op0, op1, m;
579
580 op0 = operands[0];
581 op1 = operands[1];
582
583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
584 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 if (TARGET_AVX)
591 {
592 if (GET_MODE_SIZE (mode) == 32)
593 ix86_avx256_split_vector_move_misalign (op0, op1);
594 else
595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
596 emit_insn (gen_rtx_SET (op0, op1));
597 return;
598 }
599
600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
602 {
603 emit_insn (gen_rtx_SET (op0, op1));
604 return;
605 }
606
607 /* ??? If we have typed data, then it would appear that using
608 movdqu is the only way to get unaligned data loaded with
609 integer type. */
610 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
611 {
612 emit_insn (gen_rtx_SET (op0, op1));
613 return;
614 }
615
616 if (MEM_P (op1))
617 {
618 if (TARGET_SSE2 && mode == V2DFmode)
619 {
620 rtx zero;
621
622 /* When SSE registers are split into halves, we can avoid
623 writing to the top half twice. */
624 if (TARGET_SSE_SPLIT_REGS)
625 {
626 emit_clobber (op0);
627 zero = op0;
628 }
629 else
630 {
631 /* ??? Not sure about the best option for the Intel chips.
632 The following would seem to satisfy; the register is
633 entirely cleared, breaking the dependency chain. We
634 then store to the upper half, with a dependency depth
635 of one. A rumor has it that Intel recommends two movsd
636 followed by an unpacklpd, but this is unconfirmed. And
637 given that the dependency depth of the unpacklpd would
638 still be one, I'm not sure why this would be better. */
639 zero = CONST0_RTX (V2DFmode);
640 }
641
642 m = adjust_address (op1, DFmode, 0);
643 emit_insn (gen_sse2_loadlpd (op0, zero, m));
644 m = adjust_address (op1, DFmode, 8);
645 emit_insn (gen_sse2_loadhpd (op0, op0, m));
646 }
647 else
648 {
649 rtx t;
650
651 if (mode != V4SFmode)
652 t = gen_reg_rtx (V4SFmode);
653 else
654 t = op0;
655
656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
657 emit_move_insn (t, CONST0_RTX (V4SFmode));
658 else
659 emit_clobber (t);
660
661 m = adjust_address (op1, V2SFmode, 0);
662 emit_insn (gen_sse_loadlps (t, t, m));
663 m = adjust_address (op1, V2SFmode, 8);
664 emit_insn (gen_sse_loadhps (t, t, m));
665 if (mode != V4SFmode)
666 emit_move_insn (op0, gen_lowpart (mode, t));
667 }
668 }
669 else if (MEM_P (op0))
670 {
671 if (TARGET_SSE2 && mode == V2DFmode)
672 {
673 m = adjust_address (op0, DFmode, 0);
674 emit_insn (gen_sse2_storelpd (m, op1));
675 m = adjust_address (op0, DFmode, 8);
676 emit_insn (gen_sse2_storehpd (m, op1));
677 }
678 else
679 {
680 if (mode != V4SFmode)
681 op1 = gen_lowpart (V4SFmode, op1);
682
683 m = adjust_address (op0, V2SFmode, 0);
684 emit_insn (gen_sse_storelps (m, op1));
685 m = adjust_address (op0, V2SFmode, 8);
686 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
687 }
688 }
689 else
690 gcc_unreachable ();
691 }
692
693 /* Move bits 64:95 to bits 32:63. */
694
695 void
696 ix86_move_vector_high_sse_to_mmx (rtx op)
697 {
698 rtx mask = gen_rtx_PARALLEL (VOIDmode,
699 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
700 GEN_INT (0), GEN_INT (0)));
701 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
702 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
703 rtx insn = gen_rtx_SET (dest, op);
704 emit_insn (insn);
705 }
706
707 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
708
709 void
710 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
711 {
712 rtx op0 = operands[0];
713 rtx op1 = operands[1];
714 rtx op2 = operands[2];
715
716 machine_mode dmode = GET_MODE (op0);
717 machine_mode smode = GET_MODE (op1);
718 machine_mode inner_dmode = GET_MODE_INNER (dmode);
719 machine_mode inner_smode = GET_MODE_INNER (smode);
720
721 /* Get the corresponding SSE mode for destination. */
722 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
723 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
724 nunits).require ();
725 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
726 nunits / 2).require ();
727
728 /* Get the corresponding SSE mode for source. */
729 nunits = 16 / GET_MODE_SIZE (inner_smode);
730 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
731 nunits).require ();
732
733 /* Generate SSE pack with signed/unsigned saturation. */
734 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
735 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
736 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
737
738 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
739 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
740 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
741 op1, op2));
742 emit_insn (insn);
743
744 ix86_move_vector_high_sse_to_mmx (op0);
745 }
746
747 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
748
749 void
750 ix86_split_mmx_punpck (rtx operands[], bool high_p)
751 {
752 rtx op0 = operands[0];
753 rtx op1 = operands[1];
754 rtx op2 = operands[2];
755 machine_mode mode = GET_MODE (op0);
756 rtx mask;
757 /* The corresponding SSE mode. */
758 machine_mode sse_mode, double_sse_mode;
759
760 switch (mode)
761 {
762 case E_V8QImode:
763 sse_mode = V16QImode;
764 double_sse_mode = V32QImode;
765 mask = gen_rtx_PARALLEL (VOIDmode,
766 gen_rtvec (16,
767 GEN_INT (0), GEN_INT (16),
768 GEN_INT (1), GEN_INT (17),
769 GEN_INT (2), GEN_INT (18),
770 GEN_INT (3), GEN_INT (19),
771 GEN_INT (4), GEN_INT (20),
772 GEN_INT (5), GEN_INT (21),
773 GEN_INT (6), GEN_INT (22),
774 GEN_INT (7), GEN_INT (23)));
775 break;
776
777 case E_V4HImode:
778 sse_mode = V8HImode;
779 double_sse_mode = V16HImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (8,
782 GEN_INT (0), GEN_INT (8),
783 GEN_INT (1), GEN_INT (9),
784 GEN_INT (2), GEN_INT (10),
785 GEN_INT (3), GEN_INT (11)));
786 break;
787
788 case E_V2SImode:
789 sse_mode = V4SImode;
790 double_sse_mode = V8SImode;
791 mask = gen_rtx_PARALLEL (VOIDmode,
792 gen_rtvec (4,
793 GEN_INT (0), GEN_INT (4),
794 GEN_INT (1), GEN_INT (5)));
795 break;
796
797 default:
798 gcc_unreachable ();
799 }
800
801 /* Generate SSE punpcklXX. */
802 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
803 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
804 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
805
806 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
807 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
808 rtx insn = gen_rtx_SET (dest, op2);
809 emit_insn (insn);
810
811 if (high_p)
812 {
813 /* Move bits 64:127 to bits 0:63. */
814 mask = gen_rtx_PARALLEL (VOIDmode,
815 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
816 GEN_INT (0), GEN_INT (0)));
817 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
818 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
819 insn = gen_rtx_SET (dest, op1);
820 emit_insn (insn);
821 }
822 }
823
824 /* Helper function of ix86_fixup_binary_operands to canonicalize
825 operand order. Returns true if the operands should be swapped. */
826
827 static bool
828 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
829 rtx operands[])
830 {
831 rtx dst = operands[0];
832 rtx src1 = operands[1];
833 rtx src2 = operands[2];
834
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
838 return false;
839
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst, src1))
842 return false;
843 if (rtx_equal_p (dst, src2))
844 return true;
845
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2, mode))
848 return false;
849 if (immediate_operand (src1, mode))
850 return true;
851
852 /* Lowest priority is that memory references should come second. */
853 if (MEM_P (src2))
854 return false;
855 if (MEM_P (src1))
856 return true;
857
858 return false;
859 }
860
861
862 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
863 destination to use for the operation. If different from the true
864 destination in operands[0], a copy operation will be required. */
865
866 rtx
867 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
868 rtx operands[])
869 {
870 rtx dst = operands[0];
871 rtx src1 = operands[1];
872 rtx src2 = operands[2];
873
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code, mode, operands))
876 {
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
879
880 std::swap (src1, src2);
881 }
882
883 /* Both source operands cannot be in memory. */
884 if (MEM_P (src1) && MEM_P (src2))
885 {
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1, src2))
888 {
889 src2 = force_reg (mode, src2);
890 src1 = src2;
891 }
892 else if (rtx_equal_p (dst, src1))
893 src2 = force_reg (mode, src2);
894 else
895 src1 = force_reg (mode, src1);
896 }
897
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
901 dst = gen_reg_rtx (mode);
902
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1))
905 src1 = force_reg (mode, src1);
906
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
909 src1 = force_reg (mode, src1);
910
911 /* Improve address combine. */
912 if (code == PLUS
913 && GET_MODE_CLASS (mode) == MODE_INT
914 && MEM_P (src2))
915 src2 = force_reg (mode, src2);
916
917 operands[1] = src1;
918 operands[2] = src2;
919 return dst;
920 }
921
922 /* Similarly, but assume that the destination has already been
923 set up properly. */
924
925 void
926 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
927 machine_mode mode, rtx operands[])
928 {
929 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
930 gcc_assert (dst == operands[0]);
931 }
932
933 /* Attempt to expand a binary operator. Make the expansion closer to the
934 actual machine, then just general_operand, which will allow 3 separate
935 memory references (one output, two input) in a single insn. */
936
937 void
938 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
939 rtx operands[])
940 {
941 rtx src1, src2, dst, op, clob;
942
943 dst = ix86_fixup_binary_operands (code, mode, operands);
944 src1 = operands[1];
945 src2 = operands[2];
946
947 /* Emit the instruction. */
948
949 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
950
951 if (reload_completed
952 && code == PLUS
953 && !rtx_equal_p (dst, src1))
954 {
955 /* This is going to be an LEA; avoid splitting it later. */
956 emit_insn (op);
957 }
958 else
959 {
960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
962 }
963
964 /* Fix up the destination if needed. */
965 if (dst != operands[0])
966 emit_move_insn (operands[0], dst);
967 }
968
969 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
970 the given OPERANDS. */
971
972 void
973 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
974 rtx operands[])
975 {
976 rtx op1 = NULL_RTX, op2 = NULL_RTX;
977 if (SUBREG_P (operands[1]))
978 {
979 op1 = operands[1];
980 op2 = operands[2];
981 }
982 else if (SUBREG_P (operands[2]))
983 {
984 op1 = operands[2];
985 op2 = operands[1];
986 }
987 /* Optimize (__m128i) d | (__m128i) e and similar code
988 when d and e are float vectors into float vector logical
989 insn. In C/C++ without using intrinsics there is no other way
990 to express vector logical operation on float vectors than
991 to cast them temporarily to integer vectors. */
992 if (op1
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
997 && SUBREG_BYTE (op1) == 0
998 && (GET_CODE (op2) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1000 && SUBREG_BYTE (op2) == 0))
1001 && can_create_pseudo_p ())
1002 {
1003 rtx dst;
1004 switch (GET_MODE (SUBREG_REG (op1)))
1005 {
1006 case E_V4SFmode:
1007 case E_V8SFmode:
1008 case E_V16SFmode:
1009 case E_V2DFmode:
1010 case E_V4DFmode:
1011 case E_V8DFmode:
1012 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1013 if (GET_CODE (op2) == CONST_VECTOR)
1014 {
1015 op2 = gen_lowpart (GET_MODE (dst), op2);
1016 op2 = force_reg (GET_MODE (dst), op2);
1017 }
1018 else
1019 {
1020 op1 = operands[1];
1021 op2 = SUBREG_REG (operands[2]);
1022 if (!vector_operand (op2, GET_MODE (dst)))
1023 op2 = force_reg (GET_MODE (dst), op2);
1024 }
1025 op1 = SUBREG_REG (op1);
1026 if (!vector_operand (op1, GET_MODE (dst)))
1027 op1 = force_reg (GET_MODE (dst), op1);
1028 emit_insn (gen_rtx_SET (dst,
1029 gen_rtx_fmt_ee (code, GET_MODE (dst),
1030 op1, op2)));
1031 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1032 return;
1033 default:
1034 break;
1035 }
1036 }
1037 if (!vector_operand (operands[1], mode))
1038 operands[1] = force_reg (mode, operands[1]);
1039 if (!vector_operand (operands[2], mode))
1040 operands[2] = force_reg (mode, operands[2]);
1041 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1042 emit_insn (gen_rtx_SET (operands[0],
1043 gen_rtx_fmt_ee (code, mode, operands[1],
1044 operands[2])));
1045 }
1046
1047 /* Return TRUE or FALSE depending on whether the binary operator meets the
1048 appropriate constraints. */
1049
1050 bool
1051 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1052 rtx operands[3])
1053 {
1054 rtx dst = operands[0];
1055 rtx src1 = operands[1];
1056 rtx src2 = operands[2];
1057
1058 /* Both source operands cannot be in memory. */
1059 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1060 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1061 return false;
1062
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code, mode, operands))
1065 std::swap (src1, src2);
1066
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1069 return false;
1070
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1))
1073 return false;
1074
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1078 return (code == AND
1079 && (mode == HImode
1080 || mode == SImode
1081 || (TARGET_64BIT && mode == DImode))
1082 && satisfies_constraint_L (src2));
1083
1084 return true;
1085 }
1086
1087 /* Attempt to expand a unary operator. Make the expansion closer to the
1088 actual machine, then just general_operand, which will allow 2 separate
1089 memory references (one output, one input) in a single insn. */
1090
1091 void
1092 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1093 rtx operands[])
1094 {
1095 bool matching_memory = false;
1096 rtx src, dst, op, clob;
1097
1098 dst = operands[0];
1099 src = operands[1];
1100
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1103 if (MEM_P (dst))
1104 {
1105 if (rtx_equal_p (dst, src))
1106 matching_memory = true;
1107 else
1108 dst = gen_reg_rtx (mode);
1109 }
1110
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src) && !matching_memory)
1113 src = force_reg (mode, src);
1114
1115 /* Emit the instruction. */
1116
1117 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1118
1119 if (code == NOT)
1120 emit_insn (op);
1121 else
1122 {
1123 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1125 }
1126
1127 /* Fix up the destination if needed. */
1128 if (dst != operands[0])
1129 emit_move_insn (operands[0], dst);
1130 }
1131
1132 /* Predict just emitted jump instruction to be taken with probability PROB. */
1133
1134 static void
1135 predict_jump (int prob)
1136 {
1137 rtx_insn *insn = get_last_insn ();
1138 gcc_assert (JUMP_P (insn));
1139 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1140 }
1141
1142 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1143 divisor are within the range [0-255]. */
1144
1145 void
1146 ix86_split_idivmod (machine_mode mode, rtx operands[],
1147 bool unsigned_p)
1148 {
1149 rtx_code_label *end_label, *qimode_label;
1150 rtx div, mod;
1151 rtx_insn *insn;
1152 rtx scratch, tmp0, tmp1, tmp2;
1153 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1154
1155 switch (mode)
1156 {
1157 case E_SImode:
1158 if (GET_MODE (operands[0]) == SImode)
1159 {
1160 if (GET_MODE (operands[1]) == SImode)
1161 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1162 else
1163 gen_divmod4_1
1164 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1165 }
1166 else
1167 gen_divmod4_1
1168 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1169 break;
1170
1171 case E_DImode:
1172 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1173 break;
1174
1175 default:
1176 gcc_unreachable ();
1177 }
1178
1179 end_label = gen_label_rtx ();
1180 qimode_label = gen_label_rtx ();
1181
1182 scratch = gen_reg_rtx (mode);
1183
1184 /* Use 8bit unsigned divimod if dividend and divisor are within
1185 the range [0-255]. */
1186 emit_move_insn (scratch, operands[2]);
1187 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1188 scratch, 1, OPTAB_DIRECT);
1189 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1190 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1191 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1192 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1193 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1194 pc_rtx);
1195 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1196 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1197 JUMP_LABEL (insn) = qimode_label;
1198
1199 /* Generate original signed/unsigned divimod. */
1200 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1201 operands[2], operands[3]));
1202
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label));
1205 emit_barrier ();
1206
1207 /* Generate 8bit unsigned divide. */
1208 emit_label (qimode_label);
1209 /* Don't use operands[0] for result of 8bit divide since not all
1210 registers support QImode ZERO_EXTRACT. */
1211 tmp0 = lowpart_subreg (HImode, scratch, mode);
1212 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1213 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1214 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1215
1216 if (unsigned_p)
1217 {
1218 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1219 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1220 }
1221 else
1222 {
1223 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1224 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1225 }
1226 if (mode == SImode)
1227 {
1228 if (GET_MODE (operands[0]) != SImode)
1229 div = gen_rtx_ZERO_EXTEND (DImode, div);
1230 if (GET_MODE (operands[1]) != SImode)
1231 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1232 }
1233
1234 /* Extract remainder from AH. */
1235 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1236 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1237 GEN_INT (8), GEN_INT (8));
1238 insn = emit_move_insn (operands[1], tmp1);
1239 set_unique_reg_note (insn, REG_EQUAL, mod);
1240
1241 /* Zero extend quotient from AL. */
1242 tmp1 = gen_lowpart (QImode, tmp0);
1243 insn = emit_insn (gen_extend_insn
1244 (operands[0], tmp1,
1245 GET_MODE (operands[0]), QImode, 1));
1246 set_unique_reg_note (insn, REG_EQUAL, div);
1247
1248 emit_label (end_label);
1249 }
1250
1251 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1252 matches destination. RTX includes clobber of FLAGS_REG. */
1253
1254 void
1255 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 rtx dst, rtx src)
1257 {
1258 rtx op, clob;
1259
1260 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264 }
1265
1266 /* Return true if regno1 def is nearest to the insn. */
1267
1268 static bool
1269 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270 {
1271 rtx_insn *prev = insn;
1272 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273
1274 if (insn == start)
1275 return false;
1276 while (prev && prev != start)
1277 {
1278 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 {
1280 prev = PREV_INSN (prev);
1281 continue;
1282 }
1283 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 return true;
1285 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 return false;
1287 prev = PREV_INSN (prev);
1288 }
1289
1290 /* None of the regs is defined in the bb. */
1291 return false;
1292 }
1293
1294 /* Split lea instructions into a sequence of instructions
1295 which are executed on ALU to avoid AGU stalls.
1296 It is assumed that it is allowed to clobber flags register
1297 at lea position. */
1298
1299 void
1300 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301 {
1302 unsigned int regno0, regno1, regno2;
1303 struct ix86_address parts;
1304 rtx target, tmp;
1305 int ok, adds;
1306
1307 ok = ix86_decompose_address (operands[1], &parts);
1308 gcc_assert (ok);
1309
1310 target = gen_lowpart (mode, operands[0]);
1311
1312 regno0 = true_regnum (target);
1313 regno1 = INVALID_REGNUM;
1314 regno2 = INVALID_REGNUM;
1315
1316 if (parts.base)
1317 {
1318 parts.base = gen_lowpart (mode, parts.base);
1319 regno1 = true_regnum (parts.base);
1320 }
1321
1322 if (parts.index)
1323 {
1324 parts.index = gen_lowpart (mode, parts.index);
1325 regno2 = true_regnum (parts.index);
1326 }
1327
1328 if (parts.disp)
1329 parts.disp = gen_lowpart (mode, parts.disp);
1330
1331 if (parts.scale > 1)
1332 {
1333 /* Case r1 = r1 + ... */
1334 if (regno1 == regno0)
1335 {
1336 /* If we have a case r1 = r1 + C * r2 then we
1337 should use multiplication which is very
1338 expensive. Assume cost model is wrong if we
1339 have such case here. */
1340 gcc_assert (regno2 != regno0);
1341
1342 for (adds = parts.scale; adds > 0; adds--)
1343 ix86_emit_binop (PLUS, mode, target, parts.index);
1344 }
1345 else
1346 {
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0 != regno2)
1349 emit_insn (gen_rtx_SET (target, parts.index));
1350
1351 /* Use shift for scaling, but emit it as MULT instead
1352 to avoid it being immediately peephole2 optimized back
1353 into lea. */
1354 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1355
1356 if (parts.base)
1357 ix86_emit_binop (PLUS, mode, target, parts.base);
1358
1359 if (parts.disp && parts.disp != const0_rtx)
1360 ix86_emit_binop (PLUS, mode, target, parts.disp);
1361 }
1362 }
1363 else if (!parts.base && !parts.index)
1364 {
1365 gcc_assert(parts.disp);
1366 emit_insn (gen_rtx_SET (target, parts.disp));
1367 }
1368 else
1369 {
1370 if (!parts.base)
1371 {
1372 if (regno0 != regno2)
1373 emit_insn (gen_rtx_SET (target, parts.index));
1374 }
1375 else if (!parts.index)
1376 {
1377 if (regno0 != regno1)
1378 emit_insn (gen_rtx_SET (target, parts.base));
1379 }
1380 else
1381 {
1382 if (regno0 == regno1)
1383 tmp = parts.index;
1384 else if (regno0 == regno2)
1385 tmp = parts.base;
1386 else
1387 {
1388 rtx tmp1;
1389
1390 /* Find better operand for SET instruction, depending
1391 on which definition is farther from the insn. */
1392 if (find_nearest_reg_def (insn, regno1, regno2))
1393 tmp = parts.index, tmp1 = parts.base;
1394 else
1395 tmp = parts.base, tmp1 = parts.index;
1396
1397 emit_insn (gen_rtx_SET (target, tmp));
1398
1399 if (parts.disp && parts.disp != const0_rtx)
1400 ix86_emit_binop (PLUS, mode, target, parts.disp);
1401
1402 ix86_emit_binop (PLUS, mode, target, tmp1);
1403 return;
1404 }
1405
1406 ix86_emit_binop (PLUS, mode, target, tmp);
1407 }
1408
1409 if (parts.disp && parts.disp != const0_rtx)
1410 ix86_emit_binop (PLUS, mode, target, parts.disp);
1411 }
1412 }
1413
1414 /* Post-reload splitter for converting an SF or DFmode value in an
1415 SSE register into an unsigned SImode. */
1416
1417 void
1418 ix86_split_convert_uns_si_sse (rtx operands[])
1419 {
1420 machine_mode vecmode;
1421 rtx value, large, zero_or_two31, input, two31, x;
1422
1423 large = operands[1];
1424 zero_or_two31 = operands[2];
1425 input = operands[3];
1426 two31 = operands[4];
1427 vecmode = GET_MODE (large);
1428 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1429
1430 /* Load up the value into the low element. We must ensure that the other
1431 elements are valid floats -- zero is the easiest such value. */
1432 if (MEM_P (input))
1433 {
1434 if (vecmode == V4SFmode)
1435 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1436 else
1437 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1438 }
1439 else
1440 {
1441 input = gen_rtx_REG (vecmode, REGNO (input));
1442 emit_move_insn (value, CONST0_RTX (vecmode));
1443 if (vecmode == V4SFmode)
1444 emit_insn (gen_sse_movss (value, value, input));
1445 else
1446 emit_insn (gen_sse2_movsd (value, value, input));
1447 }
1448
1449 emit_move_insn (large, two31);
1450 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1451
1452 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1453 emit_insn (gen_rtx_SET (large, x));
1454
1455 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1456 emit_insn (gen_rtx_SET (zero_or_two31, x));
1457
1458 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1459 emit_insn (gen_rtx_SET (value, x));
1460
1461 large = gen_rtx_REG (V4SImode, REGNO (large));
1462 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1463
1464 x = gen_rtx_REG (V4SImode, REGNO (value));
1465 if (vecmode == V4SFmode)
1466 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1467 else
1468 emit_insn (gen_sse2_cvttpd2dq (x, value));
1469 value = x;
1470
1471 emit_insn (gen_xorv4si3 (value, value, large));
1472 }
1473
1474 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1475 machine_mode mode, rtx target,
1476 rtx var, int one_var);
1477
1478 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1479 Expects the 64-bit DImode to be supplied in a pair of integral
1480 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1481 -mfpmath=sse, !optimize_size only. */
1482
1483 void
1484 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1485 {
1486 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1487 rtx int_xmm, fp_xmm;
1488 rtx biases, exponents;
1489 rtx x;
1490
1491 int_xmm = gen_reg_rtx (V4SImode);
1492 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1493 emit_insn (gen_movdi_to_sse (int_xmm, input));
1494 else if (TARGET_SSE_SPLIT_REGS)
1495 {
1496 emit_clobber (int_xmm);
1497 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1498 }
1499 else
1500 {
1501 x = gen_reg_rtx (V2DImode);
1502 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1503 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1504 }
1505
1506 x = gen_rtx_CONST_VECTOR (V4SImode,
1507 gen_rtvec (4, GEN_INT (0x43300000UL),
1508 GEN_INT (0x45300000UL),
1509 const0_rtx, const0_rtx));
1510 exponents = validize_mem (force_const_mem (V4SImode, x));
1511
1512 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1513 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1514
1515 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1516 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1517 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1518 (0x1.0p84 + double(fp_value_hi_xmm)).
1519 Note these exponents differ by 32. */
1520
1521 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1522
1523 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1524 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1525 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1526 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1527 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1528 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1529 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1530 biases = validize_mem (force_const_mem (V2DFmode, biases));
1531 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1532
1533 /* Add the upper and lower DFmode values together. */
1534 if (TARGET_SSE3)
1535 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1536 else
1537 {
1538 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1539 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1540 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1541 }
1542
1543 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1544 }
1545
1546 /* Not used, but eases macroization of patterns. */
1547 void
1548 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1549 {
1550 gcc_unreachable ();
1551 }
1552
1553 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1554
1555 /* Convert an unsigned SImode value into a DFmode. Only currently used
1556 for SSE, but applicable anywhere. */
1557
1558 void
1559 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1560 {
1561 REAL_VALUE_TYPE TWO31r;
1562 rtx x, fp;
1563
1564 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1565 NULL, 1, OPTAB_DIRECT);
1566
1567 fp = gen_reg_rtx (DFmode);
1568 emit_insn (gen_floatsidf2 (fp, x));
1569
1570 real_ldexp (&TWO31r, &dconst1, 31);
1571 x = const_double_from_real_value (TWO31r, DFmode);
1572
1573 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1574
1575 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1576 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1577 x = ix86_expand_sse_fabs (x, NULL);
1578
1579 if (x != target)
1580 emit_move_insn (target, x);
1581 }
1582
1583 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1584 32-bit mode; otherwise we have a direct convert instruction. */
1585
1586 void
1587 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1588 {
1589 REAL_VALUE_TYPE TWO32r;
1590 rtx fp_lo, fp_hi, x;
1591
1592 fp_lo = gen_reg_rtx (DFmode);
1593 fp_hi = gen_reg_rtx (DFmode);
1594
1595 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1596
1597 real_ldexp (&TWO32r, &dconst1, 32);
1598 x = const_double_from_real_value (TWO32r, DFmode);
1599 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1600
1601 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1602
1603 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1604 0, OPTAB_DIRECT);
1605 if (x != target)
1606 emit_move_insn (target, x);
1607 }
1608
1609 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1610 For x86_32, -mfpmath=sse, !optimize_size only. */
1611 void
1612 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1613 {
1614 REAL_VALUE_TYPE ONE16r;
1615 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1616
1617 real_ldexp (&ONE16r, &dconst1, 16);
1618 x = const_double_from_real_value (ONE16r, SFmode);
1619 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1620 NULL, 0, OPTAB_DIRECT);
1621 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1622 NULL, 0, OPTAB_DIRECT);
1623 fp_hi = gen_reg_rtx (SFmode);
1624 fp_lo = gen_reg_rtx (SFmode);
1625 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1626 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1627 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1628 0, OPTAB_DIRECT);
1629 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1630 0, OPTAB_DIRECT);
1631 if (!rtx_equal_p (target, fp_hi))
1632 emit_move_insn (target, fp_hi);
1633 }
1634
1635 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1636 a vector of unsigned ints VAL to vector of floats TARGET. */
1637
1638 void
1639 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1640 {
1641 rtx tmp[8];
1642 REAL_VALUE_TYPE TWO16r;
1643 machine_mode intmode = GET_MODE (val);
1644 machine_mode fltmode = GET_MODE (target);
1645 rtx (*cvt) (rtx, rtx);
1646
1647 if (intmode == V4SImode)
1648 cvt = gen_floatv4siv4sf2;
1649 else
1650 cvt = gen_floatv8siv8sf2;
1651 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1652 tmp[0] = force_reg (intmode, tmp[0]);
1653 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1654 OPTAB_DIRECT);
1655 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1656 NULL_RTX, 1, OPTAB_DIRECT);
1657 tmp[3] = gen_reg_rtx (fltmode);
1658 emit_insn (cvt (tmp[3], tmp[1]));
1659 tmp[4] = gen_reg_rtx (fltmode);
1660 emit_insn (cvt (tmp[4], tmp[2]));
1661 real_ldexp (&TWO16r, &dconst1, 16);
1662 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1663 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1664 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1665 OPTAB_DIRECT);
1666 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1667 OPTAB_DIRECT);
1668 if (tmp[7] != target)
1669 emit_move_insn (target, tmp[7]);
1670 }
1671
1672 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1673 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1674 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1675 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1676
1677 rtx
1678 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1679 {
1680 REAL_VALUE_TYPE TWO31r;
1681 rtx two31r, tmp[4];
1682 machine_mode mode = GET_MODE (val);
1683 machine_mode scalarmode = GET_MODE_INNER (mode);
1684 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1685 rtx (*cmp) (rtx, rtx, rtx, rtx);
1686 int i;
1687
1688 for (i = 0; i < 3; i++)
1689 tmp[i] = gen_reg_rtx (mode);
1690 real_ldexp (&TWO31r, &dconst1, 31);
1691 two31r = const_double_from_real_value (TWO31r, scalarmode);
1692 two31r = ix86_build_const_vector (mode, 1, two31r);
1693 two31r = force_reg (mode, two31r);
1694 switch (mode)
1695 {
1696 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1697 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1698 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1699 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1700 default: gcc_unreachable ();
1701 }
1702 tmp[3] = gen_rtx_LE (mode, two31r, val);
1703 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1704 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1705 0, OPTAB_DIRECT);
1706 if (intmode == V4SImode || TARGET_AVX2)
1707 *xorp = expand_simple_binop (intmode, ASHIFT,
1708 gen_lowpart (intmode, tmp[0]),
1709 GEN_INT (31), NULL_RTX, 0,
1710 OPTAB_DIRECT);
1711 else
1712 {
1713 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1714 two31 = ix86_build_const_vector (intmode, 1, two31);
1715 *xorp = expand_simple_binop (intmode, AND,
1716 gen_lowpart (intmode, tmp[0]),
1717 two31, NULL_RTX, 0,
1718 OPTAB_DIRECT);
1719 }
1720 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1721 0, OPTAB_DIRECT);
1722 }
1723
1724 /* Generate code for floating point ABS or NEG. */
1725
1726 void
1727 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1728 rtx operands[])
1729 {
1730 rtx set, dst, src;
1731 bool use_sse = false;
1732 bool vector_mode = VECTOR_MODE_P (mode);
1733 machine_mode vmode = mode;
1734 rtvec par;
1735
1736 if (vector_mode || mode == TFmode)
1737 use_sse = true;
1738 else if (TARGET_SSE_MATH)
1739 {
1740 use_sse = SSE_FLOAT_MODE_P (mode);
1741 if (mode == SFmode)
1742 vmode = V4SFmode;
1743 else if (mode == DFmode)
1744 vmode = V2DFmode;
1745 }
1746
1747 dst = operands[0];
1748 src = operands[1];
1749
1750 set = gen_rtx_fmt_e (code, mode, src);
1751 set = gen_rtx_SET (dst, set);
1752
1753 if (use_sse)
1754 {
1755 rtx mask, use, clob;
1756
1757 /* NEG and ABS performed with SSE use bitwise mask operations.
1758 Create the appropriate mask now. */
1759 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1760 use = gen_rtx_USE (VOIDmode, mask);
1761 if (vector_mode || mode == TFmode)
1762 par = gen_rtvec (2, set, use);
1763 else
1764 {
1765 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1766 par = gen_rtvec (3, set, use, clob);
1767 }
1768 }
1769 else
1770 {
1771 rtx clob;
1772
1773 /* Changing of sign for FP values is doable using integer unit too. */
1774 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1775 par = gen_rtvec (2, set, clob);
1776 }
1777
1778 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1779 }
1780
1781 /* Deconstruct a floating point ABS or NEG operation
1782 with integer registers into integer operations. */
1783
1784 void
1785 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1786 rtx operands[])
1787 {
1788 enum rtx_code absneg_op;
1789 rtx dst, set;
1790
1791 gcc_assert (operands_match_p (operands[0], operands[1]));
1792
1793 switch (mode)
1794 {
1795 case E_SFmode:
1796 dst = gen_lowpart (SImode, operands[0]);
1797
1798 if (code == ABS)
1799 {
1800 set = gen_int_mode (0x7fffffff, SImode);
1801 absneg_op = AND;
1802 }
1803 else
1804 {
1805 set = gen_int_mode (0x80000000, SImode);
1806 absneg_op = XOR;
1807 }
1808 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1809 break;
1810
1811 case E_DFmode:
1812 if (TARGET_64BIT)
1813 {
1814 dst = gen_lowpart (DImode, operands[0]);
1815 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1816
1817 if (code == ABS)
1818 set = const0_rtx;
1819 else
1820 set = gen_rtx_NOT (DImode, dst);
1821 }
1822 else
1823 {
1824 dst = gen_highpart (SImode, operands[0]);
1825
1826 if (code == ABS)
1827 {
1828 set = gen_int_mode (0x7fffffff, SImode);
1829 absneg_op = AND;
1830 }
1831 else
1832 {
1833 set = gen_int_mode (0x80000000, SImode);
1834 absneg_op = XOR;
1835 }
1836 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1837 }
1838 break;
1839
1840 case E_XFmode:
1841 dst = gen_rtx_REG (SImode,
1842 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1843 if (code == ABS)
1844 {
1845 set = GEN_INT (0x7fff);
1846 absneg_op = AND;
1847 }
1848 else
1849 {
1850 set = GEN_INT (0x8000);
1851 absneg_op = XOR;
1852 }
1853 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1854 break;
1855
1856 default:
1857 gcc_unreachable ();
1858 }
1859
1860 set = gen_rtx_SET (dst, set);
1861
1862 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1863 rtvec par = gen_rtvec (2, set, clob);
1864
1865 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1866 }
1867
1868 /* Expand a copysign operation. Special case operand 0 being a constant. */
1869
1870 void
1871 ix86_expand_copysign (rtx operands[])
1872 {
1873 machine_mode mode, vmode;
1874 rtx dest, op0, op1, mask;
1875
1876 dest = operands[0];
1877 op0 = operands[1];
1878 op1 = operands[2];
1879
1880 mode = GET_MODE (dest);
1881
1882 if (mode == SFmode)
1883 vmode = V4SFmode;
1884 else if (mode == DFmode)
1885 vmode = V2DFmode;
1886 else if (mode == TFmode)
1887 vmode = mode;
1888 else
1889 gcc_unreachable ();
1890
1891 mask = ix86_build_signbit_mask (vmode, 0, 0);
1892
1893 if (CONST_DOUBLE_P (op0))
1894 {
1895 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1896 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1897
1898 if (mode == SFmode || mode == DFmode)
1899 {
1900 if (op0 == CONST0_RTX (mode))
1901 op0 = CONST0_RTX (vmode);
1902 else
1903 {
1904 rtx v = ix86_build_const_vector (vmode, false, op0);
1905
1906 op0 = force_reg (vmode, v);
1907 }
1908 }
1909 else if (op0 != CONST0_RTX (mode))
1910 op0 = force_reg (mode, op0);
1911
1912 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1913 }
1914 else
1915 {
1916 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1917
1918 emit_insn (gen_copysign3_var
1919 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1920 }
1921 }
1922
1923 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1924 be a constant, and so has already been expanded into a vector constant. */
1925
1926 void
1927 ix86_split_copysign_const (rtx operands[])
1928 {
1929 machine_mode mode, vmode;
1930 rtx dest, op0, mask, x;
1931
1932 dest = operands[0];
1933 op0 = operands[1];
1934 mask = operands[3];
1935
1936 mode = GET_MODE (dest);
1937 vmode = GET_MODE (mask);
1938
1939 dest = lowpart_subreg (vmode, dest, mode);
1940 x = gen_rtx_AND (vmode, dest, mask);
1941 emit_insn (gen_rtx_SET (dest, x));
1942
1943 if (op0 != CONST0_RTX (vmode))
1944 {
1945 x = gen_rtx_IOR (vmode, dest, op0);
1946 emit_insn (gen_rtx_SET (dest, x));
1947 }
1948 }
1949
1950 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1951 so we have to do two masks. */
1952
1953 void
1954 ix86_split_copysign_var (rtx operands[])
1955 {
1956 machine_mode mode, vmode;
1957 rtx dest, scratch, op0, op1, mask, nmask, x;
1958
1959 dest = operands[0];
1960 scratch = operands[1];
1961 op0 = operands[2];
1962 op1 = operands[3];
1963 nmask = operands[4];
1964 mask = operands[5];
1965
1966 mode = GET_MODE (dest);
1967 vmode = GET_MODE (mask);
1968
1969 if (rtx_equal_p (op0, op1))
1970 {
1971 /* Shouldn't happen often (it's useless, obviously), but when it does
1972 we'd generate incorrect code if we continue below. */
1973 emit_move_insn (dest, op0);
1974 return;
1975 }
1976
1977 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1978 {
1979 gcc_assert (REGNO (op1) == REGNO (scratch));
1980
1981 x = gen_rtx_AND (vmode, scratch, mask);
1982 emit_insn (gen_rtx_SET (scratch, x));
1983
1984 dest = mask;
1985 op0 = lowpart_subreg (vmode, op0, mode);
1986 x = gen_rtx_NOT (vmode, dest);
1987 x = gen_rtx_AND (vmode, x, op0);
1988 emit_insn (gen_rtx_SET (dest, x));
1989 }
1990 else
1991 {
1992 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1993 {
1994 x = gen_rtx_AND (vmode, scratch, mask);
1995 }
1996 else /* alternative 2,4 */
1997 {
1998 gcc_assert (REGNO (mask) == REGNO (scratch));
1999 op1 = lowpart_subreg (vmode, op1, mode);
2000 x = gen_rtx_AND (vmode, scratch, op1);
2001 }
2002 emit_insn (gen_rtx_SET (scratch, x));
2003
2004 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
2005 {
2006 dest = lowpart_subreg (vmode, op0, mode);
2007 x = gen_rtx_AND (vmode, dest, nmask);
2008 }
2009 else /* alternative 3,4 */
2010 {
2011 gcc_assert (REGNO (nmask) == REGNO (dest));
2012 dest = nmask;
2013 op0 = lowpart_subreg (vmode, op0, mode);
2014 x = gen_rtx_AND (vmode, dest, op0);
2015 }
2016 emit_insn (gen_rtx_SET (dest, x));
2017 }
2018
2019 x = gen_rtx_IOR (vmode, dest, scratch);
2020 emit_insn (gen_rtx_SET (dest, x));
2021 }
2022
2023 /* Expand an xorsign operation. */
2024
2025 void
2026 ix86_expand_xorsign (rtx operands[])
2027 {
2028 machine_mode mode, vmode;
2029 rtx dest, op0, op1, mask;
2030
2031 dest = operands[0];
2032 op0 = operands[1];
2033 op1 = operands[2];
2034
2035 mode = GET_MODE (dest);
2036
2037 if (mode == SFmode)
2038 vmode = V4SFmode;
2039 else if (mode == DFmode)
2040 vmode = V2DFmode;
2041 else
2042 gcc_unreachable ();
2043
2044 mask = ix86_build_signbit_mask (vmode, 0, 0);
2045
2046 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2047 }
2048
2049 /* Deconstruct an xorsign operation into bit masks. */
2050
2051 void
2052 ix86_split_xorsign (rtx operands[])
2053 {
2054 machine_mode mode, vmode;
2055 rtx dest, op0, mask, x;
2056
2057 dest = operands[0];
2058 op0 = operands[1];
2059 mask = operands[3];
2060
2061 mode = GET_MODE (dest);
2062 vmode = GET_MODE (mask);
2063
2064 dest = lowpart_subreg (vmode, dest, mode);
2065 x = gen_rtx_AND (vmode, dest, mask);
2066 emit_insn (gen_rtx_SET (dest, x));
2067
2068 op0 = lowpart_subreg (vmode, op0, mode);
2069 x = gen_rtx_XOR (vmode, dest, op0);
2070 emit_insn (gen_rtx_SET (dest, x));
2071 }
2072
2073 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2074
2075 void
2076 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2077 {
2078 machine_mode mode = GET_MODE (op0);
2079 rtx tmp;
2080
2081 /* Handle special case - vector comparsion with boolean result, transform
2082 it using ptest instruction. */
2083 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2084 {
2085 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2086 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2087
2088 gcc_assert (code == EQ || code == NE);
2089 /* Generate XOR since we can't check that one operand is zero vector. */
2090 tmp = gen_reg_rtx (mode);
2091 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2092 tmp = gen_lowpart (p_mode, tmp);
2093 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2094 gen_rtx_UNSPEC (CCmode,
2095 gen_rtvec (2, tmp, tmp),
2096 UNSPEC_PTEST)));
2097 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2098 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2099 gen_rtx_LABEL_REF (VOIDmode, label),
2100 pc_rtx);
2101 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2102 return;
2103 }
2104
2105 switch (mode)
2106 {
2107 case E_SFmode:
2108 case E_DFmode:
2109 case E_XFmode:
2110 case E_QImode:
2111 case E_HImode:
2112 case E_SImode:
2113 simple:
2114 tmp = ix86_expand_compare (code, op0, op1);
2115 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2116 gen_rtx_LABEL_REF (VOIDmode, label),
2117 pc_rtx);
2118 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2119 return;
2120
2121 case E_DImode:
2122 if (TARGET_64BIT)
2123 goto simple;
2124 /* For 32-bit target DI comparison may be performed on
2125 SSE registers. To allow this we should avoid split
2126 to SI mode which is achieved by doing xor in DI mode
2127 and then comparing with zero (which is recognized by
2128 STV pass). We don't compare using xor when optimizing
2129 for size. */
2130 if (!optimize_insn_for_size_p ()
2131 && TARGET_STV
2132 && (code == EQ || code == NE))
2133 {
2134 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2135 op1 = const0_rtx;
2136 }
2137 /* FALLTHRU */
2138 case E_TImode:
2139 /* Expand DImode branch into multiple compare+branch. */
2140 {
2141 rtx lo[2], hi[2];
2142 rtx_code_label *label2;
2143 enum rtx_code code1, code2, code3;
2144 machine_mode submode;
2145
2146 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2147 {
2148 std::swap (op0, op1);
2149 code = swap_condition (code);
2150 }
2151
2152 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2153 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2154
2155 submode = mode == DImode ? SImode : DImode;
2156
2157 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2158 avoid two branches. This costs one extra insn, so disable when
2159 optimizing for size. */
2160
2161 if ((code == EQ || code == NE)
2162 && (!optimize_insn_for_size_p ()
2163 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2164 {
2165 rtx xor0, xor1;
2166
2167 xor1 = hi[0];
2168 if (hi[1] != const0_rtx)
2169 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2170 NULL_RTX, 0, OPTAB_WIDEN);
2171
2172 xor0 = lo[0];
2173 if (lo[1] != const0_rtx)
2174 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2175 NULL_RTX, 0, OPTAB_WIDEN);
2176
2177 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2178 NULL_RTX, 0, OPTAB_WIDEN);
2179
2180 ix86_expand_branch (code, tmp, const0_rtx, label);
2181 return;
2182 }
2183
2184 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2185 op1 is a constant and the low word is zero, then we can just
2186 examine the high word. Similarly for low word -1 and
2187 less-or-equal-than or greater-than. */
2188
2189 if (CONST_INT_P (hi[1]))
2190 switch (code)
2191 {
2192 case LT: case LTU: case GE: case GEU:
2193 if (lo[1] == const0_rtx)
2194 {
2195 ix86_expand_branch (code, hi[0], hi[1], label);
2196 return;
2197 }
2198 break;
2199 case LE: case LEU: case GT: case GTU:
2200 if (lo[1] == constm1_rtx)
2201 {
2202 ix86_expand_branch (code, hi[0], hi[1], label);
2203 return;
2204 }
2205 break;
2206 default:
2207 break;
2208 }
2209
2210 /* Emulate comparisons that do not depend on Zero flag with
2211 double-word subtraction. Note that only Overflow, Sign
2212 and Carry flags are valid, so swap arguments and condition
2213 of comparisons that would otherwise test Zero flag. */
2214
2215 switch (code)
2216 {
2217 case LE: case LEU: case GT: case GTU:
2218 std::swap (lo[0], lo[1]);
2219 std::swap (hi[0], hi[1]);
2220 code = swap_condition (code);
2221 /* FALLTHRU */
2222
2223 case LT: case LTU: case GE: case GEU:
2224 {
2225 bool uns = (code == LTU || code == GEU);
2226 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2227 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2228
2229 if (!nonimmediate_operand (lo[0], submode))
2230 lo[0] = force_reg (submode, lo[0]);
2231 if (!x86_64_general_operand (lo[1], submode))
2232 lo[1] = force_reg (submode, lo[1]);
2233
2234 if (!register_operand (hi[0], submode))
2235 hi[0] = force_reg (submode, hi[0]);
2236 if ((uns && !nonimmediate_operand (hi[1], submode))
2237 || (!uns && !x86_64_general_operand (hi[1], submode)))
2238 hi[1] = force_reg (submode, hi[1]);
2239
2240 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2241
2242 tmp = gen_rtx_SCRATCH (submode);
2243 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2244
2245 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2246 ix86_expand_branch (code, tmp, const0_rtx, label);
2247 return;
2248 }
2249
2250 default:
2251 break;
2252 }
2253
2254 /* Otherwise, we need two or three jumps. */
2255
2256 label2 = gen_label_rtx ();
2257
2258 code1 = code;
2259 code2 = swap_condition (code);
2260 code3 = unsigned_condition (code);
2261
2262 switch (code)
2263 {
2264 case LT: case GT: case LTU: case GTU:
2265 break;
2266
2267 case LE: code1 = LT; code2 = GT; break;
2268 case GE: code1 = GT; code2 = LT; break;
2269 case LEU: code1 = LTU; code2 = GTU; break;
2270 case GEU: code1 = GTU; code2 = LTU; break;
2271
2272 case EQ: code1 = UNKNOWN; code2 = NE; break;
2273 case NE: code2 = UNKNOWN; break;
2274
2275 default:
2276 gcc_unreachable ();
2277 }
2278
2279 /*
2280 * a < b =>
2281 * if (hi(a) < hi(b)) goto true;
2282 * if (hi(a) > hi(b)) goto false;
2283 * if (lo(a) < lo(b)) goto true;
2284 * false:
2285 */
2286
2287 if (code1 != UNKNOWN)
2288 ix86_expand_branch (code1, hi[0], hi[1], label);
2289 if (code2 != UNKNOWN)
2290 ix86_expand_branch (code2, hi[0], hi[1], label2);
2291
2292 ix86_expand_branch (code3, lo[0], lo[1], label);
2293
2294 if (code2 != UNKNOWN)
2295 emit_label (label2);
2296 return;
2297 }
2298
2299 default:
2300 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2301 goto simple;
2302 }
2303 }
2304
2305 /* Figure out whether to use unordered fp comparisons. */
2306
2307 static bool
2308 ix86_unordered_fp_compare (enum rtx_code code)
2309 {
2310 if (!TARGET_IEEE_FP)
2311 return false;
2312
2313 switch (code)
2314 {
2315 case LT:
2316 case LE:
2317 case GT:
2318 case GE:
2319 case LTGT:
2320 return false;
2321
2322 case EQ:
2323 case NE:
2324
2325 case UNORDERED:
2326 case ORDERED:
2327 case UNLT:
2328 case UNLE:
2329 case UNGT:
2330 case UNGE:
2331 case UNEQ:
2332 return true;
2333
2334 default:
2335 gcc_unreachable ();
2336 }
2337 }
2338
2339 /* Return a comparison we can do and that it is equivalent to
2340 swap_condition (code) apart possibly from orderedness.
2341 But, never change orderedness if TARGET_IEEE_FP, returning
2342 UNKNOWN in that case if necessary. */
2343
2344 static enum rtx_code
2345 ix86_fp_swap_condition (enum rtx_code code)
2346 {
2347 switch (code)
2348 {
2349 case GT: /* GTU - CF=0 & ZF=0 */
2350 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2351 case GE: /* GEU - CF=0 */
2352 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2353 case UNLT: /* LTU - CF=1 */
2354 return TARGET_IEEE_FP ? UNKNOWN : GT;
2355 case UNLE: /* LEU - CF=1 | ZF=1 */
2356 return TARGET_IEEE_FP ? UNKNOWN : GE;
2357 default:
2358 return swap_condition (code);
2359 }
2360 }
2361
2362 /* Return cost of comparison CODE using the best strategy for performance.
2363 All following functions do use number of instructions as a cost metrics.
2364 In future this should be tweaked to compute bytes for optimize_size and
2365 take into account performance of various instructions on various CPUs. */
2366
2367 static int
2368 ix86_fp_comparison_cost (enum rtx_code code)
2369 {
2370 int arith_cost;
2371
2372 /* The cost of code using bit-twiddling on %ah. */
2373 switch (code)
2374 {
2375 case UNLE:
2376 case UNLT:
2377 case LTGT:
2378 case GT:
2379 case GE:
2380 case UNORDERED:
2381 case ORDERED:
2382 case UNEQ:
2383 arith_cost = 4;
2384 break;
2385 case LT:
2386 case NE:
2387 case EQ:
2388 case UNGE:
2389 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2390 break;
2391 case LE:
2392 case UNGT:
2393 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2394 break;
2395 default:
2396 gcc_unreachable ();
2397 }
2398
2399 switch (ix86_fp_comparison_strategy (code))
2400 {
2401 case IX86_FPCMP_COMI:
2402 return arith_cost > 4 ? 3 : 2;
2403 case IX86_FPCMP_SAHF:
2404 return arith_cost > 4 ? 4 : 3;
2405 default:
2406 return arith_cost;
2407 }
2408 }
2409
2410 /* Swap, force into registers, or otherwise massage the two operands
2411 to a fp comparison. The operands are updated in place; the new
2412 comparison code is returned. */
2413
2414 static enum rtx_code
2415 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2416 {
2417 bool unordered_compare = ix86_unordered_fp_compare (code);
2418 rtx op0 = *pop0, op1 = *pop1;
2419 machine_mode op_mode = GET_MODE (op0);
2420 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2421
2422 /* All of the unordered compare instructions only work on registers.
2423 The same is true of the fcomi compare instructions. The XFmode
2424 compare instructions require registers except when comparing
2425 against zero or when converting operand 1 from fixed point to
2426 floating point. */
2427
2428 if (!is_sse
2429 && (unordered_compare
2430 || (op_mode == XFmode
2431 && ! (standard_80387_constant_p (op0) == 1
2432 || standard_80387_constant_p (op1) == 1)
2433 && GET_CODE (op1) != FLOAT)
2434 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2435 {
2436 op0 = force_reg (op_mode, op0);
2437 op1 = force_reg (op_mode, op1);
2438 }
2439 else
2440 {
2441 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2442 things around if they appear profitable, otherwise force op0
2443 into a register. */
2444
2445 if (standard_80387_constant_p (op0) == 0
2446 || (MEM_P (op0)
2447 && ! (standard_80387_constant_p (op1) == 0
2448 || MEM_P (op1))))
2449 {
2450 enum rtx_code new_code = ix86_fp_swap_condition (code);
2451 if (new_code != UNKNOWN)
2452 {
2453 std::swap (op0, op1);
2454 code = new_code;
2455 }
2456 }
2457
2458 if (!REG_P (op0))
2459 op0 = force_reg (op_mode, op0);
2460
2461 if (CONSTANT_P (op1))
2462 {
2463 int tmp = standard_80387_constant_p (op1);
2464 if (tmp == 0)
2465 op1 = validize_mem (force_const_mem (op_mode, op1));
2466 else if (tmp == 1)
2467 {
2468 if (TARGET_CMOVE)
2469 op1 = force_reg (op_mode, op1);
2470 }
2471 else
2472 op1 = force_reg (op_mode, op1);
2473 }
2474 }
2475
2476 /* Try to rearrange the comparison to make it cheaper. */
2477 if (ix86_fp_comparison_cost (code)
2478 > ix86_fp_comparison_cost (swap_condition (code))
2479 && (REG_P (op1) || can_create_pseudo_p ()))
2480 {
2481 std::swap (op0, op1);
2482 code = swap_condition (code);
2483 if (!REG_P (op0))
2484 op0 = force_reg (op_mode, op0);
2485 }
2486
2487 *pop0 = op0;
2488 *pop1 = op1;
2489 return code;
2490 }
2491
2492 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2493
2494 static rtx
2495 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2496 {
2497 bool unordered_compare = ix86_unordered_fp_compare (code);
2498 machine_mode cmp_mode;
2499 rtx tmp, scratch;
2500
2501 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2502
2503 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2504 if (unordered_compare)
2505 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2506
2507 /* Do fcomi/sahf based test when profitable. */
2508 switch (ix86_fp_comparison_strategy (code))
2509 {
2510 case IX86_FPCMP_COMI:
2511 cmp_mode = CCFPmode;
2512 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2513 break;
2514
2515 case IX86_FPCMP_SAHF:
2516 cmp_mode = CCFPmode;
2517 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2518 scratch = gen_reg_rtx (HImode);
2519 emit_insn (gen_rtx_SET (scratch, tmp));
2520 emit_insn (gen_x86_sahf_1 (scratch));
2521 break;
2522
2523 case IX86_FPCMP_ARITH:
2524 cmp_mode = CCNOmode;
2525 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2526 scratch = gen_reg_rtx (HImode);
2527 emit_insn (gen_rtx_SET (scratch, tmp));
2528
2529 /* In the unordered case, we have to check C2 for NaN's, which
2530 doesn't happen to work out to anything nice combination-wise.
2531 So do some bit twiddling on the value we've got in AH to come
2532 up with an appropriate set of condition codes. */
2533
2534 switch (code)
2535 {
2536 case GT:
2537 case UNGT:
2538 if (code == GT || !TARGET_IEEE_FP)
2539 {
2540 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2541 code = EQ;
2542 }
2543 else
2544 {
2545 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2546 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2547 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2548 cmp_mode = CCmode;
2549 code = GEU;
2550 }
2551 break;
2552 case LT:
2553 case UNLT:
2554 if (code == LT && TARGET_IEEE_FP)
2555 {
2556 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2557 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2558 cmp_mode = CCmode;
2559 code = EQ;
2560 }
2561 else
2562 {
2563 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2564 code = NE;
2565 }
2566 break;
2567 case GE:
2568 case UNGE:
2569 if (code == GE || !TARGET_IEEE_FP)
2570 {
2571 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2572 code = EQ;
2573 }
2574 else
2575 {
2576 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2577 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2578 code = NE;
2579 }
2580 break;
2581 case LE:
2582 case UNLE:
2583 if (code == LE && TARGET_IEEE_FP)
2584 {
2585 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2586 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2587 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2588 cmp_mode = CCmode;
2589 code = LTU;
2590 }
2591 else
2592 {
2593 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2594 code = NE;
2595 }
2596 break;
2597 case EQ:
2598 case UNEQ:
2599 if (code == EQ && TARGET_IEEE_FP)
2600 {
2601 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2602 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2603 cmp_mode = CCmode;
2604 code = EQ;
2605 }
2606 else
2607 {
2608 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2609 code = NE;
2610 }
2611 break;
2612 case NE:
2613 case LTGT:
2614 if (code == NE && TARGET_IEEE_FP)
2615 {
2616 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2617 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2618 GEN_INT (0x40)));
2619 code = NE;
2620 }
2621 else
2622 {
2623 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2624 code = EQ;
2625 }
2626 break;
2627
2628 case UNORDERED:
2629 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2630 code = NE;
2631 break;
2632 case ORDERED:
2633 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2634 code = EQ;
2635 break;
2636
2637 default:
2638 gcc_unreachable ();
2639 }
2640 break;
2641
2642 default:
2643 gcc_unreachable();
2644 }
2645
2646 /* Return the test that should be put into the flags user, i.e.
2647 the bcc, scc, or cmov instruction. */
2648 return gen_rtx_fmt_ee (code, VOIDmode,
2649 gen_rtx_REG (cmp_mode, FLAGS_REG),
2650 const0_rtx);
2651 }
2652
2653 /* Generate insn patterns to do an integer compare of OPERANDS. */
2654
2655 static rtx
2656 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2657 {
2658 machine_mode cmpmode;
2659 rtx tmp, flags;
2660
2661 cmpmode = SELECT_CC_MODE (code, op0, op1);
2662 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2663
2664 /* This is very simple, but making the interface the same as in the
2665 FP case makes the rest of the code easier. */
2666 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2667 emit_insn (gen_rtx_SET (flags, tmp));
2668
2669 /* Return the test that should be put into the flags user, i.e.
2670 the bcc, scc, or cmov instruction. */
2671 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2672 }
2673
2674 static rtx
2675 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2676 {
2677 rtx ret;
2678
2679 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2680 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2681
2682 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2683 {
2684 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2685 ret = ix86_expand_fp_compare (code, op0, op1);
2686 }
2687 else
2688 ret = ix86_expand_int_compare (code, op0, op1);
2689
2690 return ret;
2691 }
2692
2693 void
2694 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2695 {
2696 rtx ret;
2697
2698 gcc_assert (GET_MODE (dest) == QImode);
2699
2700 ret = ix86_expand_compare (code, op0, op1);
2701 PUT_MODE (ret, QImode);
2702 emit_insn (gen_rtx_SET (dest, ret));
2703 }
2704
2705 /* Expand comparison setting or clearing carry flag. Return true when
2706 successful and set pop for the operation. */
2707 static bool
2708 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2709 {
2710 machine_mode mode
2711 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2712
2713 /* Do not handle double-mode compares that go through special path. */
2714 if (mode == (TARGET_64BIT ? TImode : DImode))
2715 return false;
2716
2717 if (SCALAR_FLOAT_MODE_P (mode))
2718 {
2719 rtx compare_op;
2720 rtx_insn *compare_seq;
2721
2722 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2723
2724 /* Shortcut: following common codes never translate
2725 into carry flag compares. */
2726 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2727 || code == ORDERED || code == UNORDERED)
2728 return false;
2729
2730 /* These comparisons require zero flag; swap operands so they won't. */
2731 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2732 && !TARGET_IEEE_FP)
2733 {
2734 std::swap (op0, op1);
2735 code = swap_condition (code);
2736 }
2737
2738 /* Try to expand the comparison and verify that we end up with
2739 carry flag based comparison. This fails to be true only when
2740 we decide to expand comparison using arithmetic that is not
2741 too common scenario. */
2742 start_sequence ();
2743 compare_op = ix86_expand_fp_compare (code, op0, op1);
2744 compare_seq = get_insns ();
2745 end_sequence ();
2746
2747 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2748 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2749 else
2750 code = GET_CODE (compare_op);
2751
2752 if (code != LTU && code != GEU)
2753 return false;
2754
2755 emit_insn (compare_seq);
2756 *pop = compare_op;
2757 return true;
2758 }
2759
2760 if (!INTEGRAL_MODE_P (mode))
2761 return false;
2762
2763 switch (code)
2764 {
2765 case LTU:
2766 case GEU:
2767 break;
2768
2769 /* Convert a==0 into (unsigned)a<1. */
2770 case EQ:
2771 case NE:
2772 if (op1 != const0_rtx)
2773 return false;
2774 op1 = const1_rtx;
2775 code = (code == EQ ? LTU : GEU);
2776 break;
2777
2778 /* Convert a>b into b<a or a>=b-1. */
2779 case GTU:
2780 case LEU:
2781 if (CONST_INT_P (op1))
2782 {
2783 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2784 /* Bail out on overflow. We still can swap operands but that
2785 would force loading of the constant into register. */
2786 if (op1 == const0_rtx
2787 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2788 return false;
2789 code = (code == GTU ? GEU : LTU);
2790 }
2791 else
2792 {
2793 std::swap (op0, op1);
2794 code = (code == GTU ? LTU : GEU);
2795 }
2796 break;
2797
2798 /* Convert a>=0 into (unsigned)a<0x80000000. */
2799 case LT:
2800 case GE:
2801 if (mode == DImode || op1 != const0_rtx)
2802 return false;
2803 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2804 code = (code == LT ? GEU : LTU);
2805 break;
2806 case LE:
2807 case GT:
2808 if (mode == DImode || op1 != constm1_rtx)
2809 return false;
2810 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2811 code = (code == LE ? GEU : LTU);
2812 break;
2813
2814 default:
2815 return false;
2816 }
2817 /* Swapping operands may cause constant to appear as first operand. */
2818 if (!nonimmediate_operand (op0, VOIDmode))
2819 {
2820 if (!can_create_pseudo_p ())
2821 return false;
2822 op0 = force_reg (mode, op0);
2823 }
2824 *pop = ix86_expand_compare (code, op0, op1);
2825 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2826 return true;
2827 }
2828
2829 /* Expand conditional increment or decrement using adb/sbb instructions.
2830 The default case using setcc followed by the conditional move can be
2831 done by generic code. */
2832 bool
2833 ix86_expand_int_addcc (rtx operands[])
2834 {
2835 enum rtx_code code = GET_CODE (operands[1]);
2836 rtx flags;
2837 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2838 rtx compare_op;
2839 rtx val = const0_rtx;
2840 bool fpcmp = false;
2841 machine_mode mode;
2842 rtx op0 = XEXP (operands[1], 0);
2843 rtx op1 = XEXP (operands[1], 1);
2844
2845 if (operands[3] != const1_rtx
2846 && operands[3] != constm1_rtx)
2847 return false;
2848 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2849 return false;
2850 code = GET_CODE (compare_op);
2851
2852 flags = XEXP (compare_op, 0);
2853
2854 if (GET_MODE (flags) == CCFPmode)
2855 {
2856 fpcmp = true;
2857 code = ix86_fp_compare_code_to_integer (code);
2858 }
2859
2860 if (code != LTU)
2861 {
2862 val = constm1_rtx;
2863 if (fpcmp)
2864 PUT_CODE (compare_op,
2865 reverse_condition_maybe_unordered
2866 (GET_CODE (compare_op)));
2867 else
2868 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2869 }
2870
2871 mode = GET_MODE (operands[0]);
2872
2873 /* Construct either adc or sbb insn. */
2874 if ((code == LTU) == (operands[3] == constm1_rtx))
2875 insn = gen_sub3_carry;
2876 else
2877 insn = gen_add3_carry;
2878
2879 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2880
2881 return true;
2882 }
2883
2884 bool
2885 ix86_expand_int_movcc (rtx operands[])
2886 {
2887 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2888 rtx_insn *compare_seq;
2889 rtx compare_op;
2890 machine_mode mode = GET_MODE (operands[0]);
2891 bool sign_bit_compare_p = false;
2892 rtx op0 = XEXP (operands[1], 0);
2893 rtx op1 = XEXP (operands[1], 1);
2894
2895 if (GET_MODE (op0) == TImode
2896 || (GET_MODE (op0) == DImode
2897 && !TARGET_64BIT))
2898 return false;
2899
2900 start_sequence ();
2901 compare_op = ix86_expand_compare (code, op0, op1);
2902 compare_seq = get_insns ();
2903 end_sequence ();
2904
2905 compare_code = GET_CODE (compare_op);
2906
2907 if ((op1 == const0_rtx && (code == GE || code == LT))
2908 || (op1 == constm1_rtx && (code == GT || code == LE)))
2909 sign_bit_compare_p = true;
2910
2911 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2912 HImode insns, we'd be swallowed in word prefix ops. */
2913
2914 if ((mode != HImode || TARGET_FAST_PREFIX)
2915 && (mode != (TARGET_64BIT ? TImode : DImode))
2916 && CONST_INT_P (operands[2])
2917 && CONST_INT_P (operands[3]))
2918 {
2919 rtx out = operands[0];
2920 HOST_WIDE_INT ct = INTVAL (operands[2]);
2921 HOST_WIDE_INT cf = INTVAL (operands[3]);
2922 HOST_WIDE_INT diff;
2923
2924 diff = ct - cf;
2925 /* Sign bit compares are better done using shifts than we do by using
2926 sbb. */
2927 if (sign_bit_compare_p
2928 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2929 {
2930 /* Detect overlap between destination and compare sources. */
2931 rtx tmp = out;
2932
2933 if (!sign_bit_compare_p)
2934 {
2935 rtx flags;
2936 bool fpcmp = false;
2937
2938 compare_code = GET_CODE (compare_op);
2939
2940 flags = XEXP (compare_op, 0);
2941
2942 if (GET_MODE (flags) == CCFPmode)
2943 {
2944 fpcmp = true;
2945 compare_code
2946 = ix86_fp_compare_code_to_integer (compare_code);
2947 }
2948
2949 /* To simplify rest of code, restrict to the GEU case. */
2950 if (compare_code == LTU)
2951 {
2952 std::swap (ct, cf);
2953 compare_code = reverse_condition (compare_code);
2954 code = reverse_condition (code);
2955 }
2956 else
2957 {
2958 if (fpcmp)
2959 PUT_CODE (compare_op,
2960 reverse_condition_maybe_unordered
2961 (GET_CODE (compare_op)));
2962 else
2963 PUT_CODE (compare_op,
2964 reverse_condition (GET_CODE (compare_op)));
2965 }
2966 diff = ct - cf;
2967
2968 if (reg_overlap_mentioned_p (out, op0)
2969 || reg_overlap_mentioned_p (out, op1))
2970 tmp = gen_reg_rtx (mode);
2971
2972 if (mode == DImode)
2973 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2974 else
2975 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2976 flags, compare_op));
2977 }
2978 else
2979 {
2980 if (code == GT || code == GE)
2981 code = reverse_condition (code);
2982 else
2983 {
2984 std::swap (ct, cf);
2985 diff = ct - cf;
2986 }
2987 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2988 }
2989
2990 if (diff == 1)
2991 {
2992 /*
2993 * cmpl op0,op1
2994 * sbbl dest,dest
2995 * [addl dest, ct]
2996 *
2997 * Size 5 - 8.
2998 */
2999 if (ct)
3000 tmp = expand_simple_binop (mode, PLUS,
3001 tmp, GEN_INT (ct),
3002 copy_rtx (tmp), 1, OPTAB_DIRECT);
3003 }
3004 else if (cf == -1)
3005 {
3006 /*
3007 * cmpl op0,op1
3008 * sbbl dest,dest
3009 * orl $ct, dest
3010 *
3011 * Size 8.
3012 */
3013 tmp = expand_simple_binop (mode, IOR,
3014 tmp, GEN_INT (ct),
3015 copy_rtx (tmp), 1, OPTAB_DIRECT);
3016 }
3017 else if (diff == -1 && ct)
3018 {
3019 /*
3020 * cmpl op0,op1
3021 * sbbl dest,dest
3022 * notl dest
3023 * [addl dest, cf]
3024 *
3025 * Size 8 - 11.
3026 */
3027 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3028 if (cf)
3029 tmp = expand_simple_binop (mode, PLUS,
3030 copy_rtx (tmp), GEN_INT (cf),
3031 copy_rtx (tmp), 1, OPTAB_DIRECT);
3032 }
3033 else
3034 {
3035 /*
3036 * cmpl op0,op1
3037 * sbbl dest,dest
3038 * [notl dest]
3039 * andl cf - ct, dest
3040 * [addl dest, ct]
3041 *
3042 * Size 8 - 11.
3043 */
3044
3045 if (cf == 0)
3046 {
3047 cf = ct;
3048 ct = 0;
3049 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3050 }
3051
3052 tmp = expand_simple_binop (mode, AND,
3053 copy_rtx (tmp),
3054 gen_int_mode (cf - ct, mode),
3055 copy_rtx (tmp), 1, OPTAB_DIRECT);
3056 if (ct)
3057 tmp = expand_simple_binop (mode, PLUS,
3058 copy_rtx (tmp), GEN_INT (ct),
3059 copy_rtx (tmp), 1, OPTAB_DIRECT);
3060 }
3061
3062 if (!rtx_equal_p (tmp, out))
3063 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3064
3065 return true;
3066 }
3067
3068 if (diff < 0)
3069 {
3070 machine_mode cmp_mode = GET_MODE (op0);
3071 enum rtx_code new_code;
3072
3073 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3074 {
3075 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3076
3077 /* We may be reversing a non-trapping
3078 comparison to a trapping comparison. */
3079 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3080 && code != EQ && code != NE
3081 && code != ORDERED && code != UNORDERED)
3082 new_code = UNKNOWN;
3083 else
3084 new_code = reverse_condition_maybe_unordered (code);
3085 }
3086 else
3087 new_code = ix86_reverse_condition (code, cmp_mode);
3088 if (new_code != UNKNOWN)
3089 {
3090 std::swap (ct, cf);
3091 diff = -diff;
3092 code = new_code;
3093 }
3094 }
3095
3096 compare_code = UNKNOWN;
3097 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3098 && CONST_INT_P (op1))
3099 {
3100 if (op1 == const0_rtx
3101 && (code == LT || code == GE))
3102 compare_code = code;
3103 else if (op1 == constm1_rtx)
3104 {
3105 if (code == LE)
3106 compare_code = LT;
3107 else if (code == GT)
3108 compare_code = GE;
3109 }
3110 }
3111
3112 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3113 if (compare_code != UNKNOWN
3114 && GET_MODE (op0) == GET_MODE (out)
3115 && (cf == -1 || ct == -1))
3116 {
3117 /* If lea code below could be used, only optimize
3118 if it results in a 2 insn sequence. */
3119
3120 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3121 || diff == 3 || diff == 5 || diff == 9)
3122 || (compare_code == LT && ct == -1)
3123 || (compare_code == GE && cf == -1))
3124 {
3125 /*
3126 * notl op1 (if necessary)
3127 * sarl $31, op1
3128 * orl cf, op1
3129 */
3130 if (ct != -1)
3131 {
3132 cf = ct;
3133 ct = -1;
3134 code = reverse_condition (code);
3135 }
3136
3137 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3138
3139 out = expand_simple_binop (mode, IOR,
3140 out, GEN_INT (cf),
3141 out, 1, OPTAB_DIRECT);
3142 if (out != operands[0])
3143 emit_move_insn (operands[0], out);
3144
3145 return true;
3146 }
3147 }
3148
3149
3150 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3151 || diff == 3 || diff == 5 || diff == 9)
3152 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3153 && (mode != DImode
3154 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3155 {
3156 /*
3157 * xorl dest,dest
3158 * cmpl op1,op2
3159 * setcc dest
3160 * lea cf(dest*(ct-cf)),dest
3161 *
3162 * Size 14.
3163 *
3164 * This also catches the degenerate setcc-only case.
3165 */
3166
3167 rtx tmp;
3168 int nops;
3169
3170 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3171
3172 nops = 0;
3173 /* On x86_64 the lea instruction operates on Pmode, so we need
3174 to get arithmetics done in proper mode to match. */
3175 if (diff == 1)
3176 tmp = copy_rtx (out);
3177 else
3178 {
3179 rtx out1;
3180 out1 = copy_rtx (out);
3181 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3182 nops++;
3183 if (diff & 1)
3184 {
3185 tmp = gen_rtx_PLUS (mode, tmp, out1);
3186 nops++;
3187 }
3188 }
3189 if (cf != 0)
3190 {
3191 tmp = plus_constant (mode, tmp, cf);
3192 nops++;
3193 }
3194 if (!rtx_equal_p (tmp, out))
3195 {
3196 if (nops == 1)
3197 out = force_operand (tmp, copy_rtx (out));
3198 else
3199 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3200 }
3201 if (!rtx_equal_p (out, operands[0]))
3202 emit_move_insn (operands[0], copy_rtx (out));
3203
3204 return true;
3205 }
3206
3207 /*
3208 * General case: Jumpful:
3209 * xorl dest,dest cmpl op1, op2
3210 * cmpl op1, op2 movl ct, dest
3211 * setcc dest jcc 1f
3212 * decl dest movl cf, dest
3213 * andl (cf-ct),dest 1:
3214 * addl ct,dest
3215 *
3216 * Size 20. Size 14.
3217 *
3218 * This is reasonably steep, but branch mispredict costs are
3219 * high on modern cpus, so consider failing only if optimizing
3220 * for space.
3221 */
3222
3223 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3224 && BRANCH_COST (optimize_insn_for_speed_p (),
3225 false) >= 2)
3226 {
3227 if (cf == 0)
3228 {
3229 machine_mode cmp_mode = GET_MODE (op0);
3230 enum rtx_code new_code;
3231
3232 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3233 {
3234 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3235
3236 /* We may be reversing a non-trapping
3237 comparison to a trapping comparison. */
3238 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3239 && code != EQ && code != NE
3240 && code != ORDERED && code != UNORDERED)
3241 new_code = UNKNOWN;
3242 else
3243 new_code = reverse_condition_maybe_unordered (code);
3244
3245 }
3246 else
3247 {
3248 new_code = ix86_reverse_condition (code, cmp_mode);
3249 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3250 compare_code = reverse_condition (compare_code);
3251 }
3252
3253 if (new_code != UNKNOWN)
3254 {
3255 cf = ct;
3256 ct = 0;
3257 code = new_code;
3258 }
3259 }
3260
3261 if (compare_code != UNKNOWN)
3262 {
3263 /* notl op1 (if needed)
3264 sarl $31, op1
3265 andl (cf-ct), op1
3266 addl ct, op1
3267
3268 For x < 0 (resp. x <= -1) there will be no notl,
3269 so if possible swap the constants to get rid of the
3270 complement.
3271 True/false will be -1/0 while code below (store flag
3272 followed by decrement) is 0/-1, so the constants need
3273 to be exchanged once more. */
3274
3275 if (compare_code == GE || !cf)
3276 {
3277 code = reverse_condition (code);
3278 compare_code = LT;
3279 }
3280 else
3281 std::swap (ct, cf);
3282
3283 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3284 }
3285 else
3286 {
3287 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3288
3289 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3290 constm1_rtx,
3291 copy_rtx (out), 1, OPTAB_DIRECT);
3292 }
3293
3294 out = expand_simple_binop (mode, AND, copy_rtx (out),
3295 gen_int_mode (cf - ct, mode),
3296 copy_rtx (out), 1, OPTAB_DIRECT);
3297 if (ct)
3298 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3299 copy_rtx (out), 1, OPTAB_DIRECT);
3300 if (!rtx_equal_p (out, operands[0]))
3301 emit_move_insn (operands[0], copy_rtx (out));
3302
3303 return true;
3304 }
3305 }
3306
3307 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3308 {
3309 /* Try a few things more with specific constants and a variable. */
3310
3311 optab op;
3312 rtx var, orig_out, out, tmp;
3313
3314 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3315 return false;
3316
3317 /* If one of the two operands is an interesting constant, load a
3318 constant with the above and mask it in with a logical operation. */
3319
3320 if (CONST_INT_P (operands[2]))
3321 {
3322 var = operands[3];
3323 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3324 operands[3] = constm1_rtx, op = and_optab;
3325 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3326 operands[3] = const0_rtx, op = ior_optab;
3327 else
3328 return false;
3329 }
3330 else if (CONST_INT_P (operands[3]))
3331 {
3332 var = operands[2];
3333 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3334 {
3335 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3336 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3337 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3338 operands[1] = simplify_gen_relational (LT, VOIDmode,
3339 GET_MODE (op0),
3340 op0, const0_rtx);
3341
3342 operands[2] = constm1_rtx;
3343 op = and_optab;
3344 }
3345 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3346 operands[2] = const0_rtx, op = ior_optab;
3347 else
3348 return false;
3349 }
3350 else
3351 return false;
3352
3353 orig_out = operands[0];
3354 tmp = gen_reg_rtx (mode);
3355 operands[0] = tmp;
3356
3357 /* Recurse to get the constant loaded. */
3358 if (!ix86_expand_int_movcc (operands))
3359 return false;
3360
3361 /* Mask in the interesting variable. */
3362 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3363 OPTAB_WIDEN);
3364 if (!rtx_equal_p (out, orig_out))
3365 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3366
3367 return true;
3368 }
3369
3370 /*
3371 * For comparison with above,
3372 *
3373 * movl cf,dest
3374 * movl ct,tmp
3375 * cmpl op1,op2
3376 * cmovcc tmp,dest
3377 *
3378 * Size 15.
3379 */
3380
3381 if (! nonimmediate_operand (operands[2], mode))
3382 operands[2] = force_reg (mode, operands[2]);
3383 if (! nonimmediate_operand (operands[3], mode))
3384 operands[3] = force_reg (mode, operands[3]);
3385
3386 if (! register_operand (operands[2], VOIDmode)
3387 && (mode == QImode
3388 || ! register_operand (operands[3], VOIDmode)))
3389 operands[2] = force_reg (mode, operands[2]);
3390
3391 if (mode == QImode
3392 && ! register_operand (operands[3], VOIDmode))
3393 operands[3] = force_reg (mode, operands[3]);
3394
3395 emit_insn (compare_seq);
3396 emit_insn (gen_rtx_SET (operands[0],
3397 gen_rtx_IF_THEN_ELSE (mode,
3398 compare_op, operands[2],
3399 operands[3])));
3400 return true;
3401 }
3402
3403 /* Detect conditional moves that exactly match min/max operational
3404 semantics. Note that this is IEEE safe, as long as we don't
3405 interchange the operands.
3406
3407 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3408 and TRUE if the operation is successful and instructions are emitted. */
3409
3410 static bool
3411 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3412 rtx cmp_op1, rtx if_true, rtx if_false)
3413 {
3414 machine_mode mode;
3415 bool is_min;
3416 rtx tmp;
3417
3418 if (code == LT)
3419 ;
3420 else if (code == UNGE)
3421 std::swap (if_true, if_false);
3422 else
3423 return false;
3424
3425 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3426 is_min = true;
3427 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3428 is_min = false;
3429 else
3430 return false;
3431
3432 mode = GET_MODE (dest);
3433
3434 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3435 but MODE may be a vector mode and thus not appropriate. */
3436 if (!flag_finite_math_only || flag_signed_zeros)
3437 {
3438 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3439 rtvec v;
3440
3441 if_true = force_reg (mode, if_true);
3442 v = gen_rtvec (2, if_true, if_false);
3443 tmp = gen_rtx_UNSPEC (mode, v, u);
3444 }
3445 else
3446 {
3447 code = is_min ? SMIN : SMAX;
3448 if (MEM_P (if_true) && MEM_P (if_false))
3449 if_true = force_reg (mode, if_true);
3450 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3451 }
3452
3453 emit_insn (gen_rtx_SET (dest, tmp));
3454 return true;
3455 }
3456
3457 /* Return true if MODE is valid for vector compare to mask register,
3458 Same result for conditionl vector move with mask register. */
3459 static bool
3460 ix86_valid_mask_cmp_mode (machine_mode mode)
3461 {
3462 /* XOP has its own vector conditional movement. */
3463 if (TARGET_XOP && !TARGET_AVX512F)
3464 return false;
3465
3466 /* AVX512F is needed for mask operation. */
3467 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3468 return false;
3469
3470 /* AVX512BW is needed for vector QI/HImode,
3471 AVX512VL is needed for 128/256-bit vector. */
3472 machine_mode inner_mode = GET_MODE_INNER (mode);
3473 int vector_size = GET_MODE_SIZE (mode);
3474 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3475 return false;
3476
3477 return vector_size == 64 || TARGET_AVX512VL;
3478 }
3479
3480 /* Return true if integer mask comparison should be used. */
3481 static bool
3482 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3483 rtx op_true, rtx op_false)
3484 {
3485 if (GET_MODE_SIZE (mode) == 64)
3486 return true;
3487
3488 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3489 gcc_assert (!op_true == !op_false);
3490
3491 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3492 vector dest is required. */
3493 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3494 return false;
3495
3496 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3497 if (op_false == CONST0_RTX (mode)
3498 || op_true == CONST0_RTX (mode)
3499 || (INTEGRAL_MODE_P (mode)
3500 && (op_true == CONSTM1_RTX (mode)
3501 || op_false == CONSTM1_RTX (mode))))
3502 return false;
3503
3504 return true;
3505 }
3506
3507 /* Expand an SSE comparison. Return the register with the result. */
3508
3509 static rtx
3510 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3511 rtx op_true, rtx op_false)
3512 {
3513 machine_mode mode = GET_MODE (dest);
3514 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3515
3516 /* In general case result of comparison can differ from operands' type. */
3517 machine_mode cmp_mode;
3518
3519 /* In AVX512F the result of comparison is an integer mask. */
3520 bool maskcmp = false;
3521 rtx x;
3522
3523 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3524 {
3525 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3526 maskcmp = true;
3527 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3528 }
3529 else
3530 cmp_mode = cmp_ops_mode;
3531
3532 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3533
3534 int (*op1_predicate)(rtx, machine_mode)
3535 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3536
3537 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3538 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3539
3540 if (optimize
3541 || (maskcmp && cmp_mode != mode)
3542 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3543 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3544 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3545
3546 if (maskcmp)
3547 {
3548 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3549 gcc_assert (ok);
3550 return dest;
3551 }
3552
3553 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3554
3555 if (cmp_mode != mode)
3556 {
3557 x = force_reg (cmp_ops_mode, x);
3558 convert_move (dest, x, false);
3559 }
3560 else
3561 emit_insn (gen_rtx_SET (dest, x));
3562
3563 return dest;
3564 }
3565
3566 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3567 operations. This is used for both scalar and vector conditional moves. */
3568
3569 void
3570 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3571 {
3572 machine_mode mode = GET_MODE (dest);
3573 machine_mode cmpmode = GET_MODE (cmp);
3574
3575 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3576 if (rtx_equal_p (op_true, op_false))
3577 {
3578 emit_move_insn (dest, op_true);
3579 return;
3580 }
3581
3582 rtx t2, t3, x;
3583
3584 /* If we have an integer mask and FP value then we need
3585 to cast mask to FP mode. */
3586 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3587 {
3588 cmp = force_reg (cmpmode, cmp);
3589 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3590 }
3591
3592 /* In AVX512F the result of comparison is an integer mask. */
3593 if (mode != cmpmode
3594 && GET_MODE_CLASS (cmpmode) == MODE_INT)
3595 {
3596 gcc_assert (ix86_valid_mask_cmp_mode (mode));
3597 /* Using vector move with mask register. */
3598 cmp = force_reg (cmpmode, cmp);
3599 /* Optimize for mask zero. */
3600 op_true = (op_true != CONST0_RTX (mode)
3601 ? force_reg (mode, op_true) : op_true);
3602 op_false = (op_false != CONST0_RTX (mode)
3603 ? force_reg (mode, op_false) : op_false);
3604 if (op_true == CONST0_RTX (mode))
3605 {
3606 rtx n = gen_reg_rtx (cmpmode);
3607 if (cmpmode == E_DImode && !TARGET_64BIT)
3608 emit_insn (gen_knotdi (n, cmp));
3609 else
3610 emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
3611 cmp = n;
3612 /* Reverse op_true op_false. */
3613 std::swap (op_true, op_false);
3614 }
3615
3616 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3617 emit_insn (gen_rtx_SET (dest, vec_merge));
3618 return;
3619 }
3620 else if (vector_all_ones_operand (op_true, mode)
3621 && op_false == CONST0_RTX (mode))
3622 {
3623 emit_insn (gen_rtx_SET (dest, cmp));
3624 return;
3625 }
3626 else if (op_false == CONST0_RTX (mode))
3627 {
3628 op_true = force_reg (mode, op_true);
3629 x = gen_rtx_AND (mode, cmp, op_true);
3630 emit_insn (gen_rtx_SET (dest, x));
3631 return;
3632 }
3633 else if (op_true == CONST0_RTX (mode))
3634 {
3635 op_false = force_reg (mode, op_false);
3636 x = gen_rtx_NOT (mode, cmp);
3637 x = gen_rtx_AND (mode, x, op_false);
3638 emit_insn (gen_rtx_SET (dest, x));
3639 return;
3640 }
3641 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3642 {
3643 op_false = force_reg (mode, op_false);
3644 x = gen_rtx_IOR (mode, cmp, op_false);
3645 emit_insn (gen_rtx_SET (dest, x));
3646 return;
3647 }
3648 else if (TARGET_XOP)
3649 {
3650 op_true = force_reg (mode, op_true);
3651
3652 if (!nonimmediate_operand (op_false, mode))
3653 op_false = force_reg (mode, op_false);
3654
3655 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3656 op_true,
3657 op_false)));
3658 return;
3659 }
3660
3661 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3662 rtx d = dest;
3663
3664 if (!vector_operand (op_true, mode))
3665 op_true = force_reg (mode, op_true);
3666
3667 op_false = force_reg (mode, op_false);
3668
3669 switch (mode)
3670 {
3671 case E_V4SFmode:
3672 if (TARGET_SSE4_1)
3673 gen = gen_sse4_1_blendvps;
3674 break;
3675 case E_V2DFmode:
3676 if (TARGET_SSE4_1)
3677 gen = gen_sse4_1_blendvpd;
3678 break;
3679 case E_SFmode:
3680 if (TARGET_SSE4_1)
3681 {
3682 gen = gen_sse4_1_blendvss;
3683 op_true = force_reg (mode, op_true);
3684 }
3685 break;
3686 case E_DFmode:
3687 if (TARGET_SSE4_1)
3688 {
3689 gen = gen_sse4_1_blendvsd;
3690 op_true = force_reg (mode, op_true);
3691 }
3692 break;
3693 case E_V16QImode:
3694 case E_V8HImode:
3695 case E_V4SImode:
3696 case E_V2DImode:
3697 if (TARGET_SSE4_1)
3698 {
3699 gen = gen_sse4_1_pblendvb;
3700 if (mode != V16QImode)
3701 d = gen_reg_rtx (V16QImode);
3702 op_false = gen_lowpart (V16QImode, op_false);
3703 op_true = gen_lowpart (V16QImode, op_true);
3704 cmp = gen_lowpart (V16QImode, cmp);
3705 }
3706 break;
3707 case E_V8SFmode:
3708 if (TARGET_AVX)
3709 gen = gen_avx_blendvps256;
3710 break;
3711 case E_V4DFmode:
3712 if (TARGET_AVX)
3713 gen = gen_avx_blendvpd256;
3714 break;
3715 case E_V32QImode:
3716 case E_V16HImode:
3717 case E_V8SImode:
3718 case E_V4DImode:
3719 if (TARGET_AVX2)
3720 {
3721 gen = gen_avx2_pblendvb;
3722 if (mode != V32QImode)
3723 d = gen_reg_rtx (V32QImode);
3724 op_false = gen_lowpart (V32QImode, op_false);
3725 op_true = gen_lowpart (V32QImode, op_true);
3726 cmp = gen_lowpart (V32QImode, cmp);
3727 }
3728 break;
3729
3730 case E_V64QImode:
3731 gen = gen_avx512bw_blendmv64qi;
3732 break;
3733 case E_V32HImode:
3734 gen = gen_avx512bw_blendmv32hi;
3735 break;
3736 case E_V16SImode:
3737 gen = gen_avx512f_blendmv16si;
3738 break;
3739 case E_V8DImode:
3740 gen = gen_avx512f_blendmv8di;
3741 break;
3742 case E_V8DFmode:
3743 gen = gen_avx512f_blendmv8df;
3744 break;
3745 case E_V16SFmode:
3746 gen = gen_avx512f_blendmv16sf;
3747 break;
3748
3749 default:
3750 break;
3751 }
3752
3753 if (gen != NULL)
3754 {
3755 emit_insn (gen (d, op_false, op_true, cmp));
3756 if (d != dest)
3757 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3758 }
3759 else
3760 {
3761 op_true = force_reg (mode, op_true);
3762
3763 t2 = gen_reg_rtx (mode);
3764 if (optimize)
3765 t3 = gen_reg_rtx (mode);
3766 else
3767 t3 = dest;
3768
3769 x = gen_rtx_AND (mode, op_true, cmp);
3770 emit_insn (gen_rtx_SET (t2, x));
3771
3772 x = gen_rtx_NOT (mode, cmp);
3773 x = gen_rtx_AND (mode, x, op_false);
3774 emit_insn (gen_rtx_SET (t3, x));
3775
3776 x = gen_rtx_IOR (mode, t3, t2);
3777 emit_insn (gen_rtx_SET (dest, x));
3778 }
3779 }
3780
3781 /* Swap, force into registers, or otherwise massage the two operands
3782 to an sse comparison with a mask result. Thus we differ a bit from
3783 ix86_prepare_fp_compare_args which expects to produce a flags result.
3784
3785 The DEST operand exists to help determine whether to commute commutative
3786 operators. The POP0/POP1 operands are updated in place. The new
3787 comparison code is returned, or UNKNOWN if not implementable. */
3788
3789 static enum rtx_code
3790 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3791 rtx *pop0, rtx *pop1)
3792 {
3793 switch (code)
3794 {
3795 case LTGT:
3796 case UNEQ:
3797 /* AVX supports all the needed comparisons. */
3798 if (TARGET_AVX)
3799 break;
3800 /* We have no LTGT as an operator. We could implement it with
3801 NE & ORDERED, but this requires an extra temporary. It's
3802 not clear that it's worth it. */
3803 return UNKNOWN;
3804
3805 case LT:
3806 case LE:
3807 case UNGT:
3808 case UNGE:
3809 /* These are supported directly. */
3810 break;
3811
3812 case EQ:
3813 case NE:
3814 case UNORDERED:
3815 case ORDERED:
3816 /* AVX has 3 operand comparisons, no need to swap anything. */
3817 if (TARGET_AVX)
3818 break;
3819 /* For commutative operators, try to canonicalize the destination
3820 operand to be first in the comparison - this helps reload to
3821 avoid extra moves. */
3822 if (!dest || !rtx_equal_p (dest, *pop1))
3823 break;
3824 /* FALLTHRU */
3825
3826 case GE:
3827 case GT:
3828 case UNLE:
3829 case UNLT:
3830 /* These are not supported directly before AVX, and furthermore
3831 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3832 comparison operands to transform into something that is
3833 supported. */
3834 std::swap (*pop0, *pop1);
3835 code = swap_condition (code);
3836 break;
3837
3838 default:
3839 gcc_unreachable ();
3840 }
3841
3842 return code;
3843 }
3844
3845 /* Expand a floating-point conditional move. Return true if successful. */
3846
3847 bool
3848 ix86_expand_fp_movcc (rtx operands[])
3849 {
3850 machine_mode mode = GET_MODE (operands[0]);
3851 enum rtx_code code = GET_CODE (operands[1]);
3852 rtx tmp, compare_op;
3853 rtx op0 = XEXP (operands[1], 0);
3854 rtx op1 = XEXP (operands[1], 1);
3855
3856 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3857 {
3858 machine_mode cmode;
3859
3860 /* Since we've no cmove for sse registers, don't force bad register
3861 allocation just to gain access to it. Deny movcc when the
3862 comparison mode doesn't match the move mode. */
3863 cmode = GET_MODE (op0);
3864 if (cmode == VOIDmode)
3865 cmode = GET_MODE (op1);
3866 if (cmode != mode)
3867 return false;
3868
3869 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3870 if (code == UNKNOWN)
3871 return false;
3872
3873 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3874 operands[2], operands[3]))
3875 return true;
3876
3877 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3878 operands[2], operands[3]);
3879 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3880 return true;
3881 }
3882
3883 if (GET_MODE (op0) == TImode
3884 || (GET_MODE (op0) == DImode
3885 && !TARGET_64BIT))
3886 return false;
3887
3888 /* The floating point conditional move instructions don't directly
3889 support conditions resulting from a signed integer comparison. */
3890
3891 compare_op = ix86_expand_compare (code, op0, op1);
3892 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3893 {
3894 tmp = gen_reg_rtx (QImode);
3895 ix86_expand_setcc (tmp, code, op0, op1);
3896
3897 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3898 }
3899
3900 emit_insn (gen_rtx_SET (operands[0],
3901 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3902 operands[2], operands[3])));
3903
3904 return true;
3905 }
3906
3907 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3908
3909 static int
3910 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3911 {
3912 switch (code)
3913 {
3914 case EQ:
3915 return 0;
3916 case LT:
3917 case LTU:
3918 return 1;
3919 case LE:
3920 case LEU:
3921 return 2;
3922 case NE:
3923 return 4;
3924 case GE:
3925 case GEU:
3926 return 5;
3927 case GT:
3928 case GTU:
3929 return 6;
3930 default:
3931 gcc_unreachable ();
3932 }
3933 }
3934
3935 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3936
3937 static int
3938 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3939 {
3940 switch (code)
3941 {
3942 case EQ:
3943 return 0x00;
3944 case NE:
3945 return 0x04;
3946 case GT:
3947 return 0x0e;
3948 case LE:
3949 return 0x02;
3950 case GE:
3951 return 0x0d;
3952 case LT:
3953 return 0x01;
3954 case UNLE:
3955 return 0x0a;
3956 case UNLT:
3957 return 0x09;
3958 case UNGE:
3959 return 0x05;
3960 case UNGT:
3961 return 0x06;
3962 case UNEQ:
3963 return 0x18;
3964 case LTGT:
3965 return 0x0c;
3966 case ORDERED:
3967 return 0x07;
3968 case UNORDERED:
3969 return 0x03;
3970 default:
3971 gcc_unreachable ();
3972 }
3973 }
3974
3975 /* Return immediate value to be used in UNSPEC_PCMP
3976 for comparison CODE in MODE. */
3977
3978 static int
3979 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3980 {
3981 if (FLOAT_MODE_P (mode))
3982 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3983 return ix86_int_cmp_code_to_pcmp_immediate (code);
3984 }
3985
3986 /* Expand AVX-512 vector comparison. */
3987
3988 bool
3989 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3990 {
3991 machine_mode mask_mode = GET_MODE (dest);
3992 machine_mode cmp_mode = GET_MODE (cmp_op0);
3993 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3994 int unspec_code;
3995 rtx unspec;
3996
3997 switch (code)
3998 {
3999 case LEU:
4000 case GTU:
4001 case GEU:
4002 case LTU:
4003 unspec_code = UNSPEC_UNSIGNED_PCMP;
4004 break;
4005
4006 default:
4007 unspec_code = UNSPEC_PCMP;
4008 }
4009
4010 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4011 unspec_code);
4012 emit_insn (gen_rtx_SET (dest, unspec));
4013
4014 return true;
4015 }
4016
4017 /* Expand fp vector comparison. */
4018
4019 bool
4020 ix86_expand_fp_vec_cmp (rtx operands[])
4021 {
4022 enum rtx_code code = GET_CODE (operands[1]);
4023 rtx cmp;
4024
4025 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4026 &operands[2], &operands[3]);
4027 if (code == UNKNOWN)
4028 {
4029 rtx temp;
4030 switch (GET_CODE (operands[1]))
4031 {
4032 case LTGT:
4033 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4034 operands[3], NULL, NULL);
4035 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4036 operands[3], NULL, NULL);
4037 code = AND;
4038 break;
4039 case UNEQ:
4040 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4041 operands[3], NULL, NULL);
4042 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4043 operands[3], NULL, NULL);
4044 code = IOR;
4045 break;
4046 default:
4047 gcc_unreachable ();
4048 }
4049 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4050 OPTAB_DIRECT);
4051 }
4052 else
4053 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4054 NULL, NULL);
4055
4056 if (operands[0] != cmp)
4057 emit_move_insn (operands[0], cmp);
4058
4059 return true;
4060 }
4061
4062 static rtx
4063 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4064 rtx op_true, rtx op_false, bool *negate)
4065 {
4066 machine_mode data_mode = GET_MODE (dest);
4067 machine_mode mode = GET_MODE (cop0);
4068 rtx x;
4069
4070 *negate = false;
4071
4072 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4073 if (TARGET_XOP
4074 && (mode == V16QImode || mode == V8HImode
4075 || mode == V4SImode || mode == V2DImode))
4076 ;
4077 /* AVX512F supports all of the comparsions
4078 on all 128/256/512-bit vector int types. */
4079 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4080 ;
4081 else
4082 {
4083 /* Canonicalize the comparison to EQ, GT, GTU. */
4084 switch (code)
4085 {
4086 case EQ:
4087 case GT:
4088 case GTU:
4089 break;
4090
4091 case NE:
4092 case LE:
4093 case LEU:
4094 code = reverse_condition (code);
4095 *negate = true;
4096 break;
4097
4098 case GE:
4099 case GEU:
4100 code = reverse_condition (code);
4101 *negate = true;
4102 /* FALLTHRU */
4103
4104 case LT:
4105 case LTU:
4106 std::swap (cop0, cop1);
4107 code = swap_condition (code);
4108 break;
4109
4110 default:
4111 gcc_unreachable ();
4112 }
4113
4114 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4115 if (mode == V2DImode)
4116 {
4117 switch (code)
4118 {
4119 case EQ:
4120 /* SSE4.1 supports EQ. */
4121 if (!TARGET_SSE4_1)
4122 return NULL;
4123 break;
4124
4125 case GT:
4126 case GTU:
4127 /* SSE4.2 supports GT/GTU. */
4128 if (!TARGET_SSE4_2)
4129 return NULL;
4130 break;
4131
4132 default:
4133 gcc_unreachable ();
4134 }
4135 }
4136
4137 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4138 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4139 if (*negate)
4140 std::swap (optrue, opfalse);
4141
4142 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4143 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4144 min (x, y) == x). While we add one instruction (the minimum),
4145 we remove the need for two instructions in the negation, as the
4146 result is done this way.
4147 When using masks, do it for SI/DImode element types, as it is shorter
4148 than the two subtractions. */
4149 if ((code != EQ
4150 && GET_MODE_SIZE (mode) != 64
4151 && vector_all_ones_operand (opfalse, data_mode)
4152 && optrue == CONST0_RTX (data_mode))
4153 || (code == GTU
4154 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4155 /* Don't do it if not using integer masks and we'd end up with
4156 the right values in the registers though. */
4157 && (GET_MODE_SIZE (mode) == 64
4158 || !vector_all_ones_operand (optrue, data_mode)
4159 || opfalse != CONST0_RTX (data_mode))))
4160 {
4161 rtx (*gen) (rtx, rtx, rtx) = NULL;
4162
4163 switch (mode)
4164 {
4165 case E_V16SImode:
4166 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4167 break;
4168 case E_V8DImode:
4169 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4170 cop0 = force_reg (mode, cop0);
4171 cop1 = force_reg (mode, cop1);
4172 break;
4173 case E_V32QImode:
4174 if (TARGET_AVX2)
4175 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4176 break;
4177 case E_V16HImode:
4178 if (TARGET_AVX2)
4179 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4180 break;
4181 case E_V8SImode:
4182 if (TARGET_AVX2)
4183 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4184 break;
4185 case E_V4DImode:
4186 if (TARGET_AVX512VL)
4187 {
4188 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4189 cop0 = force_reg (mode, cop0);
4190 cop1 = force_reg (mode, cop1);
4191 }
4192 break;
4193 case E_V16QImode:
4194 if (code == GTU && TARGET_SSE2)
4195 gen = gen_uminv16qi3;
4196 else if (code == GT && TARGET_SSE4_1)
4197 gen = gen_sminv16qi3;
4198 break;
4199 case E_V8HImode:
4200 if (code == GTU && TARGET_SSE4_1)
4201 gen = gen_uminv8hi3;
4202 else if (code == GT && TARGET_SSE2)
4203 gen = gen_sminv8hi3;
4204 break;
4205 case E_V4SImode:
4206 if (TARGET_SSE4_1)
4207 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4208 break;
4209 case E_V2DImode:
4210 if (TARGET_AVX512VL)
4211 {
4212 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4213 cop0 = force_reg (mode, cop0);
4214 cop1 = force_reg (mode, cop1);
4215 }
4216 break;
4217 default:
4218 break;
4219 }
4220
4221 if (gen)
4222 {
4223 rtx tem = gen_reg_rtx (mode);
4224 if (!vector_operand (cop0, mode))
4225 cop0 = force_reg (mode, cop0);
4226 if (!vector_operand (cop1, mode))
4227 cop1 = force_reg (mode, cop1);
4228 *negate = !*negate;
4229 emit_insn (gen (tem, cop0, cop1));
4230 cop1 = tem;
4231 code = EQ;
4232 }
4233 }
4234
4235 /* Unsigned parallel compare is not supported by the hardware.
4236 Play some tricks to turn this into a signed comparison
4237 against 0. */
4238 if (code == GTU)
4239 {
4240 cop0 = force_reg (mode, cop0);
4241
4242 switch (mode)
4243 {
4244 case E_V16SImode:
4245 case E_V8DImode:
4246 case E_V8SImode:
4247 case E_V4DImode:
4248 case E_V4SImode:
4249 case E_V2DImode:
4250 {
4251 rtx t1, t2, mask;
4252
4253 /* Subtract (-(INT MAX) - 1) from both operands to make
4254 them signed. */
4255 mask = ix86_build_signbit_mask (mode, true, false);
4256 t1 = gen_reg_rtx (mode);
4257 emit_insn (gen_sub3_insn (t1, cop0, mask));
4258
4259 t2 = gen_reg_rtx (mode);
4260 emit_insn (gen_sub3_insn (t2, cop1, mask));
4261
4262 cop0 = t1;
4263 cop1 = t2;
4264 code = GT;
4265 }
4266 break;
4267
4268 case E_V64QImode:
4269 case E_V32HImode:
4270 case E_V32QImode:
4271 case E_V16HImode:
4272 case E_V16QImode:
4273 case E_V8HImode:
4274 /* Perform a parallel unsigned saturating subtraction. */
4275 x = gen_reg_rtx (mode);
4276 emit_insn (gen_rtx_SET
4277 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4278 cop0 = x;
4279 cop1 = CONST0_RTX (mode);
4280 code = EQ;
4281 *negate = !*negate;
4282 break;
4283
4284 default:
4285 gcc_unreachable ();
4286 }
4287 }
4288 }
4289
4290 if (*negate)
4291 std::swap (op_true, op_false);
4292
4293 /* Allow the comparison to be done in one mode, but the movcc to
4294 happen in another mode. */
4295 if (data_mode == mode)
4296 {
4297 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4298 op_true, op_false);
4299 }
4300 else
4301 {
4302 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4303 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4304 op_true, op_false);
4305 if (GET_MODE (x) == mode)
4306 x = gen_lowpart (data_mode, x);
4307 }
4308
4309 return x;
4310 }
4311
4312 /* Expand integer vector comparison. */
4313
4314 bool
4315 ix86_expand_int_vec_cmp (rtx operands[])
4316 {
4317 rtx_code code = GET_CODE (operands[1]);
4318 bool negate = false;
4319 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4320 operands[3], NULL, NULL, &negate);
4321
4322 if (!cmp)
4323 return false;
4324
4325 if (negate)
4326 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4327 CONST0_RTX (GET_MODE (cmp)),
4328 NULL, NULL, &negate);
4329
4330 gcc_assert (!negate);
4331
4332 if (operands[0] != cmp)
4333 emit_move_insn (operands[0], cmp);
4334
4335 return true;
4336 }
4337
4338 /* Expand a floating-point vector conditional move; a vcond operation
4339 rather than a movcc operation. */
4340
4341 bool
4342 ix86_expand_fp_vcond (rtx operands[])
4343 {
4344 enum rtx_code code = GET_CODE (operands[3]);
4345 rtx cmp;
4346
4347 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4348 &operands[4], &operands[5]);
4349 if (code == UNKNOWN)
4350 {
4351 rtx temp;
4352 switch (GET_CODE (operands[3]))
4353 {
4354 case LTGT:
4355 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4356 operands[5], operands[0], operands[0]);
4357 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4358 operands[5], operands[1], operands[2]);
4359 code = AND;
4360 break;
4361 case UNEQ:
4362 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4363 operands[5], operands[0], operands[0]);
4364 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4365 operands[5], operands[1], operands[2]);
4366 code = IOR;
4367 break;
4368 default:
4369 gcc_unreachable ();
4370 }
4371 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4372 OPTAB_DIRECT);
4373 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4374 return true;
4375 }
4376
4377 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4378 operands[5], operands[1], operands[2]))
4379 return true;
4380
4381 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4382 operands[1], operands[2]);
4383 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4384 return true;
4385 }
4386
4387 /* Expand a signed/unsigned integral vector conditional move. */
4388
4389 bool
4390 ix86_expand_int_vcond (rtx operands[])
4391 {
4392 machine_mode data_mode = GET_MODE (operands[0]);
4393 machine_mode mode = GET_MODE (operands[4]);
4394 enum rtx_code code = GET_CODE (operands[3]);
4395 bool negate = false;
4396 rtx x, cop0, cop1;
4397
4398 cop0 = operands[4];
4399 cop1 = operands[5];
4400
4401 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4402 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4403 if ((code == LT || code == GE)
4404 && data_mode == mode
4405 && cop1 == CONST0_RTX (mode)
4406 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4407 && GET_MODE_UNIT_SIZE (data_mode) > 1
4408 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4409 && (GET_MODE_SIZE (data_mode) == 16
4410 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4411 {
4412 rtx negop = operands[2 - (code == LT)];
4413 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4414 if (negop == CONST1_RTX (data_mode))
4415 {
4416 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4417 operands[0], 1, OPTAB_DIRECT);
4418 if (res != operands[0])
4419 emit_move_insn (operands[0], res);
4420 return true;
4421 }
4422 else if (GET_MODE_INNER (data_mode) != DImode
4423 && vector_all_ones_operand (negop, data_mode))
4424 {
4425 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4426 operands[0], 0, OPTAB_DIRECT);
4427 if (res != operands[0])
4428 emit_move_insn (operands[0], res);
4429 return true;
4430 }
4431 }
4432
4433 if (!nonimmediate_operand (cop1, mode))
4434 cop1 = force_reg (mode, cop1);
4435 if (!general_operand (operands[1], data_mode))
4436 operands[1] = force_reg (data_mode, operands[1]);
4437 if (!general_operand (operands[2], data_mode))
4438 operands[2] = force_reg (data_mode, operands[2]);
4439
4440 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4441 operands[1], operands[2], &negate);
4442
4443 if (!x)
4444 return false;
4445
4446 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4447 operands[2-negate]);
4448 return true;
4449 }
4450
4451 static bool
4452 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4453 struct expand_vec_perm_d *d)
4454 {
4455 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4456 expander, so args are either in d, or in op0, op1 etc. */
4457 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4458 machine_mode maskmode = mode;
4459 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4460
4461 switch (mode)
4462 {
4463 case E_V8HImode:
4464 if (TARGET_AVX512VL && TARGET_AVX512BW)
4465 gen = gen_avx512vl_vpermt2varv8hi3;
4466 break;
4467 case E_V16HImode:
4468 if (TARGET_AVX512VL && TARGET_AVX512BW)
4469 gen = gen_avx512vl_vpermt2varv16hi3;
4470 break;
4471 case E_V64QImode:
4472 if (TARGET_AVX512VBMI)
4473 gen = gen_avx512bw_vpermt2varv64qi3;
4474 break;
4475 case E_V32HImode:
4476 if (TARGET_AVX512BW)
4477 gen = gen_avx512bw_vpermt2varv32hi3;
4478 break;
4479 case E_V4SImode:
4480 if (TARGET_AVX512VL)
4481 gen = gen_avx512vl_vpermt2varv4si3;
4482 break;
4483 case E_V8SImode:
4484 if (TARGET_AVX512VL)
4485 gen = gen_avx512vl_vpermt2varv8si3;
4486 break;
4487 case E_V16SImode:
4488 if (TARGET_AVX512F)
4489 gen = gen_avx512f_vpermt2varv16si3;
4490 break;
4491 case E_V4SFmode:
4492 if (TARGET_AVX512VL)
4493 {
4494 gen = gen_avx512vl_vpermt2varv4sf3;
4495 maskmode = V4SImode;
4496 }
4497 break;
4498 case E_V8SFmode:
4499 if (TARGET_AVX512VL)
4500 {
4501 gen = gen_avx512vl_vpermt2varv8sf3;
4502 maskmode = V8SImode;
4503 }
4504 break;
4505 case E_V16SFmode:
4506 if (TARGET_AVX512F)
4507 {
4508 gen = gen_avx512f_vpermt2varv16sf3;
4509 maskmode = V16SImode;
4510 }
4511 break;
4512 case E_V2DImode:
4513 if (TARGET_AVX512VL)
4514 gen = gen_avx512vl_vpermt2varv2di3;
4515 break;
4516 case E_V4DImode:
4517 if (TARGET_AVX512VL)
4518 gen = gen_avx512vl_vpermt2varv4di3;
4519 break;
4520 case E_V8DImode:
4521 if (TARGET_AVX512F)
4522 gen = gen_avx512f_vpermt2varv8di3;
4523 break;
4524 case E_V2DFmode:
4525 if (TARGET_AVX512VL)
4526 {
4527 gen = gen_avx512vl_vpermt2varv2df3;
4528 maskmode = V2DImode;
4529 }
4530 break;
4531 case E_V4DFmode:
4532 if (TARGET_AVX512VL)
4533 {
4534 gen = gen_avx512vl_vpermt2varv4df3;
4535 maskmode = V4DImode;
4536 }
4537 break;
4538 case E_V8DFmode:
4539 if (TARGET_AVX512F)
4540 {
4541 gen = gen_avx512f_vpermt2varv8df3;
4542 maskmode = V8DImode;
4543 }
4544 break;
4545 default:
4546 break;
4547 }
4548
4549 if (gen == NULL)
4550 return false;
4551
4552 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4553 expander, so args are either in d, or in op0, op1 etc. */
4554 if (d)
4555 {
4556 rtx vec[64];
4557 target = d->target;
4558 op0 = d->op0;
4559 op1 = d->op1;
4560 for (int i = 0; i < d->nelt; ++i)
4561 vec[i] = GEN_INT (d->perm[i]);
4562 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4563 }
4564
4565 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4566 return true;
4567 }
4568
4569 /* Expand a variable vector permutation. */
4570
4571 void
4572 ix86_expand_vec_perm (rtx operands[])
4573 {
4574 rtx target = operands[0];
4575 rtx op0 = operands[1];
4576 rtx op1 = operands[2];
4577 rtx mask = operands[3];
4578 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4579 machine_mode mode = GET_MODE (op0);
4580 machine_mode maskmode = GET_MODE (mask);
4581 int w, e, i;
4582 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4583
4584 /* Number of elements in the vector. */
4585 w = GET_MODE_NUNITS (mode);
4586 e = GET_MODE_UNIT_SIZE (mode);
4587 gcc_assert (w <= 64);
4588
4589 if (TARGET_AVX512F && one_operand_shuffle)
4590 {
4591 rtx (*gen) (rtx, rtx, rtx) = NULL;
4592 switch (mode)
4593 {
4594 case E_V16SImode:
4595 gen =gen_avx512f_permvarv16si;
4596 break;
4597 case E_V16SFmode:
4598 gen = gen_avx512f_permvarv16sf;
4599 break;
4600 case E_V8DImode:
4601 gen = gen_avx512f_permvarv8di;
4602 break;
4603 case E_V8DFmode:
4604 gen = gen_avx512f_permvarv8df;
4605 break;
4606 default:
4607 break;
4608 }
4609 if (gen != NULL)
4610 {
4611 emit_insn (gen (target, op0, mask));
4612 return;
4613 }
4614 }
4615
4616 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4617 return;
4618
4619 if (TARGET_AVX2)
4620 {
4621 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4622 {
4623 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4624 an constant shuffle operand. With a tiny bit of effort we can
4625 use VPERMD instead. A re-interpretation stall for V4DFmode is
4626 unfortunate but there's no avoiding it.
4627 Similarly for V16HImode we don't have instructions for variable
4628 shuffling, while for V32QImode we can use after preparing suitable
4629 masks vpshufb; vpshufb; vpermq; vpor. */
4630
4631 if (mode == V16HImode)
4632 {
4633 maskmode = mode = V32QImode;
4634 w = 32;
4635 e = 1;
4636 }
4637 else
4638 {
4639 maskmode = mode = V8SImode;
4640 w = 8;
4641 e = 4;
4642 }
4643 t1 = gen_reg_rtx (maskmode);
4644
4645 /* Replicate the low bits of the V4DImode mask into V8SImode:
4646 mask = { A B C D }
4647 t1 = { A A B B C C D D }. */
4648 for (i = 0; i < w / 2; ++i)
4649 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4650 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4651 vt = force_reg (maskmode, vt);
4652 mask = gen_lowpart (maskmode, mask);
4653 if (maskmode == V8SImode)
4654 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4655 else
4656 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4657
4658 /* Multiply the shuffle indicies by two. */
4659 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4660 OPTAB_DIRECT);
4661
4662 /* Add one to the odd shuffle indicies:
4663 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4664 for (i = 0; i < w / 2; ++i)
4665 {
4666 vec[i * 2] = const0_rtx;
4667 vec[i * 2 + 1] = const1_rtx;
4668 }
4669 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4670 vt = validize_mem (force_const_mem (maskmode, vt));
4671 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4672 OPTAB_DIRECT);
4673
4674 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4675 operands[3] = mask = t1;
4676 target = gen_reg_rtx (mode);
4677 op0 = gen_lowpart (mode, op0);
4678 op1 = gen_lowpart (mode, op1);
4679 }
4680
4681 switch (mode)
4682 {
4683 case E_V8SImode:
4684 /* The VPERMD and VPERMPS instructions already properly ignore
4685 the high bits of the shuffle elements. No need for us to
4686 perform an AND ourselves. */
4687 if (one_operand_shuffle)
4688 {
4689 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4690 if (target != operands[0])
4691 emit_move_insn (operands[0],
4692 gen_lowpart (GET_MODE (operands[0]), target));
4693 }
4694 else
4695 {
4696 t1 = gen_reg_rtx (V8SImode);
4697 t2 = gen_reg_rtx (V8SImode);
4698 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4699 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4700 goto merge_two;
4701 }
4702 return;
4703
4704 case E_V8SFmode:
4705 mask = gen_lowpart (V8SImode, mask);
4706 if (one_operand_shuffle)
4707 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4708 else
4709 {
4710 t1 = gen_reg_rtx (V8SFmode);
4711 t2 = gen_reg_rtx (V8SFmode);
4712 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4713 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4714 goto merge_two;
4715 }
4716 return;
4717
4718 case E_V4SImode:
4719 /* By combining the two 128-bit input vectors into one 256-bit
4720 input vector, we can use VPERMD and VPERMPS for the full
4721 two-operand shuffle. */
4722 t1 = gen_reg_rtx (V8SImode);
4723 t2 = gen_reg_rtx (V8SImode);
4724 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4725 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4726 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4727 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4728 return;
4729
4730 case E_V4SFmode:
4731 t1 = gen_reg_rtx (V8SFmode);
4732 t2 = gen_reg_rtx (V8SImode);
4733 mask = gen_lowpart (V4SImode, mask);
4734 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4735 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4736 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4737 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4738 return;
4739
4740 case E_V32QImode:
4741 t1 = gen_reg_rtx (V32QImode);
4742 t2 = gen_reg_rtx (V32QImode);
4743 t3 = gen_reg_rtx (V32QImode);
4744 vt2 = GEN_INT (-128);
4745 vt = gen_const_vec_duplicate (V32QImode, vt2);
4746 vt = force_reg (V32QImode, vt);
4747 for (i = 0; i < 32; i++)
4748 vec[i] = i < 16 ? vt2 : const0_rtx;
4749 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4750 vt2 = force_reg (V32QImode, vt2);
4751 /* From mask create two adjusted masks, which contain the same
4752 bits as mask in the low 7 bits of each vector element.
4753 The first mask will have the most significant bit clear
4754 if it requests element from the same 128-bit lane
4755 and MSB set if it requests element from the other 128-bit lane.
4756 The second mask will have the opposite values of the MSB,
4757 and additionally will have its 128-bit lanes swapped.
4758 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4759 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4760 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4761 stands for other 12 bytes. */
4762 /* The bit whether element is from the same lane or the other
4763 lane is bit 4, so shift it up by 3 to the MSB position. */
4764 t5 = gen_reg_rtx (V4DImode);
4765 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4766 GEN_INT (3)));
4767 /* Clear MSB bits from the mask just in case it had them set. */
4768 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4769 /* After this t1 will have MSB set for elements from other lane. */
4770 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4771 /* Clear bits other than MSB. */
4772 emit_insn (gen_andv32qi3 (t1, t1, vt));
4773 /* Or in the lower bits from mask into t3. */
4774 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4775 /* And invert MSB bits in t1, so MSB is set for elements from the same
4776 lane. */
4777 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4778 /* Swap 128-bit lanes in t3. */
4779 t6 = gen_reg_rtx (V4DImode);
4780 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4781 const2_rtx, GEN_INT (3),
4782 const0_rtx, const1_rtx));
4783 /* And or in the lower bits from mask into t1. */
4784 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4785 if (one_operand_shuffle)
4786 {
4787 /* Each of these shuffles will put 0s in places where
4788 element from the other 128-bit lane is needed, otherwise
4789 will shuffle in the requested value. */
4790 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4791 gen_lowpart (V32QImode, t6)));
4792 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4793 /* For t3 the 128-bit lanes are swapped again. */
4794 t7 = gen_reg_rtx (V4DImode);
4795 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4796 const2_rtx, GEN_INT (3),
4797 const0_rtx, const1_rtx));
4798 /* And oring both together leads to the result. */
4799 emit_insn (gen_iorv32qi3 (target, t1,
4800 gen_lowpart (V32QImode, t7)));
4801 if (target != operands[0])
4802 emit_move_insn (operands[0],
4803 gen_lowpart (GET_MODE (operands[0]), target));
4804 return;
4805 }
4806
4807 t4 = gen_reg_rtx (V32QImode);
4808 /* Similarly to the above one_operand_shuffle code,
4809 just for repeated twice for each operand. merge_two:
4810 code will merge the two results together. */
4811 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4812 gen_lowpart (V32QImode, t6)));
4813 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4814 gen_lowpart (V32QImode, t6)));
4815 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4816 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4817 t7 = gen_reg_rtx (V4DImode);
4818 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4819 const2_rtx, GEN_INT (3),
4820 const0_rtx, const1_rtx));
4821 t8 = gen_reg_rtx (V4DImode);
4822 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4823 const2_rtx, GEN_INT (3),
4824 const0_rtx, const1_rtx));
4825 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4826 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4827 t1 = t4;
4828 t2 = t3;
4829 goto merge_two;
4830
4831 default:
4832 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4833 break;
4834 }
4835 }
4836
4837 if (TARGET_XOP)
4838 {
4839 /* The XOP VPPERM insn supports three inputs. By ignoring the
4840 one_operand_shuffle special case, we avoid creating another
4841 set of constant vectors in memory. */
4842 one_operand_shuffle = false;
4843
4844 /* mask = mask & {2*w-1, ...} */
4845 vt = GEN_INT (2*w - 1);
4846 }
4847 else
4848 {
4849 /* mask = mask & {w-1, ...} */
4850 vt = GEN_INT (w - 1);
4851 }
4852
4853 vt = gen_const_vec_duplicate (maskmode, vt);
4854 mask = expand_simple_binop (maskmode, AND, mask, vt,
4855 NULL_RTX, 0, OPTAB_DIRECT);
4856
4857 /* For non-QImode operations, convert the word permutation control
4858 into a byte permutation control. */
4859 if (mode != V16QImode)
4860 {
4861 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4862 GEN_INT (exact_log2 (e)),
4863 NULL_RTX, 0, OPTAB_DIRECT);
4864
4865 /* Convert mask to vector of chars. */
4866 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4867
4868 /* Replicate each of the input bytes into byte positions:
4869 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4870 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4871 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4872 for (i = 0; i < 16; ++i)
4873 vec[i] = GEN_INT (i/e * e);
4874 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4875 vt = validize_mem (force_const_mem (V16QImode, vt));
4876 if (TARGET_XOP)
4877 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4878 else
4879 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4880
4881 /* Convert it into the byte positions by doing
4882 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4883 for (i = 0; i < 16; ++i)
4884 vec[i] = GEN_INT (i % e);
4885 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4886 vt = validize_mem (force_const_mem (V16QImode, vt));
4887 emit_insn (gen_addv16qi3 (mask, mask, vt));
4888 }
4889
4890 /* The actual shuffle operations all operate on V16QImode. */
4891 op0 = gen_lowpart (V16QImode, op0);
4892 op1 = gen_lowpart (V16QImode, op1);
4893
4894 if (TARGET_XOP)
4895 {
4896 if (GET_MODE (target) != V16QImode)
4897 target = gen_reg_rtx (V16QImode);
4898 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4899 if (target != operands[0])
4900 emit_move_insn (operands[0],
4901 gen_lowpart (GET_MODE (operands[0]), target));
4902 }
4903 else if (one_operand_shuffle)
4904 {
4905 if (GET_MODE (target) != V16QImode)
4906 target = gen_reg_rtx (V16QImode);
4907 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4908 if (target != operands[0])
4909 emit_move_insn (operands[0],
4910 gen_lowpart (GET_MODE (operands[0]), target));
4911 }
4912 else
4913 {
4914 rtx xops[6];
4915 bool ok;
4916
4917 /* Shuffle the two input vectors independently. */
4918 t1 = gen_reg_rtx (V16QImode);
4919 t2 = gen_reg_rtx (V16QImode);
4920 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4921 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4922
4923 merge_two:
4924 /* Then merge them together. The key is whether any given control
4925 element contained a bit set that indicates the second word. */
4926 mask = operands[3];
4927 vt = GEN_INT (w);
4928 if (maskmode == V2DImode && !TARGET_SSE4_1)
4929 {
4930 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4931 more shuffle to convert the V2DI input mask into a V4SI
4932 input mask. At which point the masking that expand_int_vcond
4933 will work as desired. */
4934 rtx t3 = gen_reg_rtx (V4SImode);
4935 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4936 const0_rtx, const0_rtx,
4937 const2_rtx, const2_rtx));
4938 mask = t3;
4939 maskmode = V4SImode;
4940 e = w = 4;
4941 }
4942
4943 vt = gen_const_vec_duplicate (maskmode, vt);
4944 vt = force_reg (maskmode, vt);
4945 mask = expand_simple_binop (maskmode, AND, mask, vt,
4946 NULL_RTX, 0, OPTAB_DIRECT);
4947
4948 if (GET_MODE (target) != mode)
4949 target = gen_reg_rtx (mode);
4950 xops[0] = target;
4951 xops[1] = gen_lowpart (mode, t2);
4952 xops[2] = gen_lowpart (mode, t1);
4953 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4954 xops[4] = mask;
4955 xops[5] = vt;
4956 ok = ix86_expand_int_vcond (xops);
4957 gcc_assert (ok);
4958 if (target != operands[0])
4959 emit_move_insn (operands[0],
4960 gen_lowpart (GET_MODE (operands[0]), target));
4961 }
4962 }
4963
4964 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
4965 true if we should do zero extension, else sign extension. HIGH_P is
4966 true if we want the N/2 high elements, else the low elements. */
4967
4968 void
4969 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4970 {
4971 machine_mode imode = GET_MODE (src);
4972 rtx tmp;
4973
4974 if (TARGET_SSE4_1)
4975 {
4976 rtx (*unpack)(rtx, rtx);
4977 rtx (*extract)(rtx, rtx) = NULL;
4978 machine_mode halfmode = BLKmode;
4979
4980 switch (imode)
4981 {
4982 case E_V64QImode:
4983 if (unsigned_p)
4984 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4985 else
4986 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4987 halfmode = V32QImode;
4988 extract
4989 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4990 break;
4991 case E_V32QImode:
4992 if (unsigned_p)
4993 unpack = gen_avx2_zero_extendv16qiv16hi2;
4994 else
4995 unpack = gen_avx2_sign_extendv16qiv16hi2;
4996 halfmode = V16QImode;
4997 extract
4998 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4999 break;
5000 case E_V32HImode:
5001 if (unsigned_p)
5002 unpack = gen_avx512f_zero_extendv16hiv16si2;
5003 else
5004 unpack = gen_avx512f_sign_extendv16hiv16si2;
5005 halfmode = V16HImode;
5006 extract
5007 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5008 break;
5009 case E_V16HImode:
5010 if (unsigned_p)
5011 unpack = gen_avx2_zero_extendv8hiv8si2;
5012 else
5013 unpack = gen_avx2_sign_extendv8hiv8si2;
5014 halfmode = V8HImode;
5015 extract
5016 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5017 break;
5018 case E_V16SImode:
5019 if (unsigned_p)
5020 unpack = gen_avx512f_zero_extendv8siv8di2;
5021 else
5022 unpack = gen_avx512f_sign_extendv8siv8di2;
5023 halfmode = V8SImode;
5024 extract
5025 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5026 break;
5027 case E_V8SImode:
5028 if (unsigned_p)
5029 unpack = gen_avx2_zero_extendv4siv4di2;
5030 else
5031 unpack = gen_avx2_sign_extendv4siv4di2;
5032 halfmode = V4SImode;
5033 extract
5034 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5035 break;
5036 case E_V16QImode:
5037 if (unsigned_p)
5038 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5039 else
5040 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5041 break;
5042 case E_V8HImode:
5043 if (unsigned_p)
5044 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5045 else
5046 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5047 break;
5048 case E_V4SImode:
5049 if (unsigned_p)
5050 unpack = gen_sse4_1_zero_extendv2siv2di2;
5051 else
5052 unpack = gen_sse4_1_sign_extendv2siv2di2;
5053 break;
5054 default:
5055 gcc_unreachable ();
5056 }
5057
5058 if (GET_MODE_SIZE (imode) >= 32)
5059 {
5060 tmp = gen_reg_rtx (halfmode);
5061 emit_insn (extract (tmp, src));
5062 }
5063 else if (high_p)
5064 {
5065 /* Shift higher 8 bytes to lower 8 bytes. */
5066 tmp = gen_reg_rtx (V1TImode);
5067 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5068 GEN_INT (64)));
5069 tmp = gen_lowpart (imode, tmp);
5070 }
5071 else
5072 tmp = src;
5073
5074 emit_insn (unpack (dest, tmp));
5075 }
5076 else
5077 {
5078 rtx (*unpack)(rtx, rtx, rtx);
5079
5080 switch (imode)
5081 {
5082 case E_V16QImode:
5083 if (high_p)
5084 unpack = gen_vec_interleave_highv16qi;
5085 else
5086 unpack = gen_vec_interleave_lowv16qi;
5087 break;
5088 case E_V8HImode:
5089 if (high_p)
5090 unpack = gen_vec_interleave_highv8hi;
5091 else
5092 unpack = gen_vec_interleave_lowv8hi;
5093 break;
5094 case E_V4SImode:
5095 if (high_p)
5096 unpack = gen_vec_interleave_highv4si;
5097 else
5098 unpack = gen_vec_interleave_lowv4si;
5099 break;
5100 default:
5101 gcc_unreachable ();
5102 }
5103
5104 if (unsigned_p)
5105 tmp = force_reg (imode, CONST0_RTX (imode));
5106 else
5107 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5108 src, pc_rtx, pc_rtx);
5109
5110 rtx tmp2 = gen_reg_rtx (imode);
5111 emit_insn (unpack (tmp2, src, tmp));
5112 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5113 }
5114 }
5115
5116 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5117 but works for floating pointer parameters and nonoffsetable memories.
5118 For pushes, it returns just stack offsets; the values will be saved
5119 in the right order. Maximally three parts are generated. */
5120
5121 static int
5122 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5123 {
5124 int size;
5125
5126 if (!TARGET_64BIT)
5127 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5128 else
5129 size = (GET_MODE_SIZE (mode) + 4) / 8;
5130
5131 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5132 gcc_assert (size >= 2 && size <= 4);
5133
5134 /* Optimize constant pool reference to immediates. This is used by fp
5135 moves, that force all constants to memory to allow combining. */
5136 if (MEM_P (operand) && MEM_READONLY_P (operand))
5137 operand = avoid_constant_pool_reference (operand);
5138
5139 if (MEM_P (operand) && !offsettable_memref_p (operand))
5140 {
5141 /* The only non-offsetable memories we handle are pushes. */
5142 int ok = push_operand (operand, VOIDmode);
5143
5144 gcc_assert (ok);
5145
5146 operand = copy_rtx (operand);
5147 PUT_MODE (operand, word_mode);
5148 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5149 return size;
5150 }
5151
5152 if (GET_CODE (operand) == CONST_VECTOR)
5153 {
5154 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5155 /* Caution: if we looked through a constant pool memory above,
5156 the operand may actually have a different mode now. That's
5157 ok, since we want to pun this all the way back to an integer. */
5158 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5159 gcc_assert (operand != NULL);
5160 mode = imode;
5161 }
5162
5163 if (!TARGET_64BIT)
5164 {
5165 if (mode == DImode)
5166 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5167 else
5168 {
5169 int i;
5170
5171 if (REG_P (operand))
5172 {
5173 gcc_assert (reload_completed);
5174 for (i = 0; i < size; i++)
5175 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5176 }
5177 else if (offsettable_memref_p (operand))
5178 {
5179 operand = adjust_address (operand, SImode, 0);
5180 parts[0] = operand;
5181 for (i = 1; i < size; i++)
5182 parts[i] = adjust_address (operand, SImode, 4 * i);
5183 }
5184 else if (CONST_DOUBLE_P (operand))
5185 {
5186 const REAL_VALUE_TYPE *r;
5187 long l[4];
5188
5189 r = CONST_DOUBLE_REAL_VALUE (operand);
5190 switch (mode)
5191 {
5192 case E_TFmode:
5193 real_to_target (l, r, mode);
5194 parts[3] = gen_int_mode (l[3], SImode);
5195 parts[2] = gen_int_mode (l[2], SImode);
5196 break;
5197 case E_XFmode:
5198 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5199 long double may not be 80-bit. */
5200 real_to_target (l, r, mode);
5201 parts[2] = gen_int_mode (l[2], SImode);
5202 break;
5203 case E_DFmode:
5204 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5205 break;
5206 default:
5207 gcc_unreachable ();
5208 }
5209 parts[1] = gen_int_mode (l[1], SImode);
5210 parts[0] = gen_int_mode (l[0], SImode);
5211 }
5212 else
5213 gcc_unreachable ();
5214 }
5215 }
5216 else
5217 {
5218 if (mode == TImode)
5219 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5220 if (mode == XFmode || mode == TFmode)
5221 {
5222 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5223 if (REG_P (operand))
5224 {
5225 gcc_assert (reload_completed);
5226 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5227 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5228 }
5229 else if (offsettable_memref_p (operand))
5230 {
5231 operand = adjust_address (operand, DImode, 0);
5232 parts[0] = operand;
5233 parts[1] = adjust_address (operand, upper_mode, 8);
5234 }
5235 else if (CONST_DOUBLE_P (operand))
5236 {
5237 long l[4];
5238
5239 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5240
5241 /* real_to_target puts 32-bit pieces in each long. */
5242 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5243 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5244 << 32), DImode);
5245
5246 if (upper_mode == SImode)
5247 parts[1] = gen_int_mode (l[2], SImode);
5248 else
5249 parts[1]
5250 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5251 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5252 << 32), DImode);
5253 }
5254 else
5255 gcc_unreachable ();
5256 }
5257 }
5258
5259 return size;
5260 }
5261
5262 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5263 Return false when normal moves are needed; true when all required
5264 insns have been emitted. Operands 2-4 contain the input values
5265 int the correct order; operands 5-7 contain the output values. */
5266
5267 void
5268 ix86_split_long_move (rtx operands[])
5269 {
5270 rtx part[2][4];
5271 int nparts, i, j;
5272 int push = 0;
5273 int collisions = 0;
5274 machine_mode mode = GET_MODE (operands[0]);
5275 bool collisionparts[4];
5276
5277 /* The DFmode expanders may ask us to move double.
5278 For 64bit target this is single move. By hiding the fact
5279 here we simplify i386.md splitters. */
5280 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5281 {
5282 /* Optimize constant pool reference to immediates. This is used by
5283 fp moves, that force all constants to memory to allow combining. */
5284
5285 if (MEM_P (operands[1])
5286 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5287 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5288 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5289 if (push_operand (operands[0], VOIDmode))
5290 {
5291 operands[0] = copy_rtx (operands[0]);
5292 PUT_MODE (operands[0], word_mode);
5293 }
5294 else
5295 operands[0] = gen_lowpart (DImode, operands[0]);
5296 operands[1] = gen_lowpart (DImode, operands[1]);
5297 emit_move_insn (operands[0], operands[1]);
5298 return;
5299 }
5300
5301 /* The only non-offsettable memory we handle is push. */
5302 if (push_operand (operands[0], VOIDmode))
5303 push = 1;
5304 else
5305 gcc_assert (!MEM_P (operands[0])
5306 || offsettable_memref_p (operands[0]));
5307
5308 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5309 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5310
5311 /* When emitting push, take care for source operands on the stack. */
5312 if (push && MEM_P (operands[1])
5313 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5314 {
5315 rtx src_base = XEXP (part[1][nparts - 1], 0);
5316
5317 /* Compensate for the stack decrement by 4. */
5318 if (!TARGET_64BIT && nparts == 3
5319 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5320 src_base = plus_constant (Pmode, src_base, 4);
5321
5322 /* src_base refers to the stack pointer and is
5323 automatically decreased by emitted push. */
5324 for (i = 0; i < nparts; i++)
5325 part[1][i] = change_address (part[1][i],
5326 GET_MODE (part[1][i]), src_base);
5327 }
5328
5329 /* We need to do copy in the right order in case an address register
5330 of the source overlaps the destination. */
5331 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5332 {
5333 rtx tmp;
5334
5335 for (i = 0; i < nparts; i++)
5336 {
5337 collisionparts[i]
5338 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5339 if (collisionparts[i])
5340 collisions++;
5341 }
5342
5343 /* Collision in the middle part can be handled by reordering. */
5344 if (collisions == 1 && nparts == 3 && collisionparts [1])
5345 {
5346 std::swap (part[0][1], part[0][2]);
5347 std::swap (part[1][1], part[1][2]);
5348 }
5349 else if (collisions == 1
5350 && nparts == 4
5351 && (collisionparts [1] || collisionparts [2]))
5352 {
5353 if (collisionparts [1])
5354 {
5355 std::swap (part[0][1], part[0][2]);
5356 std::swap (part[1][1], part[1][2]);
5357 }
5358 else
5359 {
5360 std::swap (part[0][2], part[0][3]);
5361 std::swap (part[1][2], part[1][3]);
5362 }
5363 }
5364
5365 /* If there are more collisions, we can't handle it by reordering.
5366 Do an lea to the last part and use only one colliding move. */
5367 else if (collisions > 1)
5368 {
5369 rtx base, addr;
5370
5371 collisions = 1;
5372
5373 base = part[0][nparts - 1];
5374
5375 /* Handle the case when the last part isn't valid for lea.
5376 Happens in 64-bit mode storing the 12-byte XFmode. */
5377 if (GET_MODE (base) != Pmode)
5378 base = gen_rtx_REG (Pmode, REGNO (base));
5379
5380 addr = XEXP (part[1][0], 0);
5381 if (TARGET_TLS_DIRECT_SEG_REFS)
5382 {
5383 struct ix86_address parts;
5384 int ok = ix86_decompose_address (addr, &parts);
5385 gcc_assert (ok);
5386 /* It is not valid to use %gs: or %fs: in lea. */
5387 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5388 }
5389 emit_insn (gen_rtx_SET (base, addr));
5390 part[1][0] = replace_equiv_address (part[1][0], base);
5391 for (i = 1; i < nparts; i++)
5392 {
5393 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5394 part[1][i] = replace_equiv_address (part[1][i], tmp);
5395 }
5396 }
5397 }
5398
5399 if (push)
5400 {
5401 if (!TARGET_64BIT)
5402 {
5403 if (nparts == 3)
5404 {
5405 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5406 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5407 emit_move_insn (part[0][2], part[1][2]);
5408 }
5409 else if (nparts == 4)
5410 {
5411 emit_move_insn (part[0][3], part[1][3]);
5412 emit_move_insn (part[0][2], part[1][2]);
5413 }
5414 }
5415 else
5416 {
5417 /* In 64bit mode we don't have 32bit push available. In case this is
5418 register, it is OK - we will just use larger counterpart. We also
5419 retype memory - these comes from attempt to avoid REX prefix on
5420 moving of second half of TFmode value. */
5421 if (GET_MODE (part[1][1]) == SImode)
5422 {
5423 switch (GET_CODE (part[1][1]))
5424 {
5425 case MEM:
5426 part[1][1] = adjust_address (part[1][1], DImode, 0);
5427 break;
5428
5429 case REG:
5430 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5431 break;
5432
5433 default:
5434 gcc_unreachable ();
5435 }
5436
5437 if (GET_MODE (part[1][0]) == SImode)
5438 part[1][0] = part[1][1];
5439 }
5440 }
5441 emit_move_insn (part[0][1], part[1][1]);
5442 emit_move_insn (part[0][0], part[1][0]);
5443 return;
5444 }
5445
5446 /* Choose correct order to not overwrite the source before it is copied. */
5447 if ((REG_P (part[0][0])
5448 && REG_P (part[1][1])
5449 && (REGNO (part[0][0]) == REGNO (part[1][1])
5450 || (nparts == 3
5451 && REGNO (part[0][0]) == REGNO (part[1][2]))
5452 || (nparts == 4
5453 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5454 || (collisions > 0
5455 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5456 {
5457 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5458 {
5459 operands[2 + i] = part[0][j];
5460 operands[6 + i] = part[1][j];
5461 }
5462 }
5463 else
5464 {
5465 for (i = 0; i < nparts; i++)
5466 {
5467 operands[2 + i] = part[0][i];
5468 operands[6 + i] = part[1][i];
5469 }
5470 }
5471
5472 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5473 if (optimize_insn_for_size_p ())
5474 {
5475 for (j = 0; j < nparts - 1; j++)
5476 if (CONST_INT_P (operands[6 + j])
5477 && operands[6 + j] != const0_rtx
5478 && REG_P (operands[2 + j]))
5479 for (i = j; i < nparts - 1; i++)
5480 if (CONST_INT_P (operands[7 + i])
5481 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5482 operands[7 + i] = operands[2 + j];
5483 }
5484
5485 for (i = 0; i < nparts; i++)
5486 emit_move_insn (operands[2 + i], operands[6 + i]);
5487
5488 return;
5489 }
5490
5491 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5492 left shift by a constant, either using a single shift or
5493 a sequence of add instructions. */
5494
5495 static void
5496 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5497 {
5498 if (count == 1
5499 || (count * ix86_cost->add <= ix86_cost->shift_const
5500 && !optimize_insn_for_size_p ()))
5501 {
5502 while (count-- > 0)
5503 emit_insn (gen_add2_insn (operand, operand));
5504 }
5505 else
5506 {
5507 rtx (*insn)(rtx, rtx, rtx);
5508
5509 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5510 emit_insn (insn (operand, operand, GEN_INT (count)));
5511 }
5512 }
5513
5514 void
5515 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5516 {
5517 rtx (*gen_ashl3)(rtx, rtx, rtx);
5518 rtx (*gen_shld)(rtx, rtx, rtx);
5519 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5520 machine_mode half_mode;
5521
5522 rtx low[2], high[2];
5523 int count;
5524
5525 if (CONST_INT_P (operands[2]))
5526 {
5527 split_double_mode (mode, operands, 2, low, high);
5528 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5529
5530 if (count >= half_width)
5531 {
5532 emit_move_insn (high[0], low[1]);
5533 emit_move_insn (low[0], const0_rtx);
5534
5535 if (count > half_width)
5536 ix86_expand_ashl_const (high[0], count - half_width, mode);
5537 }
5538 else
5539 {
5540 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5541
5542 if (!rtx_equal_p (operands[0], operands[1]))
5543 emit_move_insn (operands[0], operands[1]);
5544
5545 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5546 ix86_expand_ashl_const (low[0], count, mode);
5547 }
5548 return;
5549 }
5550
5551 split_double_mode (mode, operands, 1, low, high);
5552 half_mode = mode == DImode ? SImode : DImode;
5553
5554 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5555
5556 if (operands[1] == const1_rtx)
5557 {
5558 /* Assuming we've chosen a QImode capable registers, then 1 << N
5559 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5560 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5561 {
5562 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5563
5564 ix86_expand_clear (low[0]);
5565 ix86_expand_clear (high[0]);
5566 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5567
5568 d = gen_lowpart (QImode, low[0]);
5569 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5570 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5571 emit_insn (gen_rtx_SET (d, s));
5572
5573 d = gen_lowpart (QImode, high[0]);
5574 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5575 s = gen_rtx_NE (QImode, flags, const0_rtx);
5576 emit_insn (gen_rtx_SET (d, s));
5577 }
5578
5579 /* Otherwise, we can get the same results by manually performing
5580 a bit extract operation on bit 5/6, and then performing the two
5581 shifts. The two methods of getting 0/1 into low/high are exactly
5582 the same size. Avoiding the shift in the bit extract case helps
5583 pentium4 a bit; no one else seems to care much either way. */
5584 else
5585 {
5586 rtx (*gen_lshr3)(rtx, rtx, rtx);
5587 rtx (*gen_and3)(rtx, rtx, rtx);
5588 rtx (*gen_xor3)(rtx, rtx, rtx);
5589 HOST_WIDE_INT bits;
5590 rtx x;
5591
5592 if (mode == DImode)
5593 {
5594 gen_lshr3 = gen_lshrsi3;
5595 gen_and3 = gen_andsi3;
5596 gen_xor3 = gen_xorsi3;
5597 bits = 5;
5598 }
5599 else
5600 {
5601 gen_lshr3 = gen_lshrdi3;
5602 gen_and3 = gen_anddi3;
5603 gen_xor3 = gen_xordi3;
5604 bits = 6;
5605 }
5606
5607 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5608 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5609 else
5610 x = gen_lowpart (half_mode, operands[2]);
5611 emit_insn (gen_rtx_SET (high[0], x));
5612
5613 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5614 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5615 emit_move_insn (low[0], high[0]);
5616 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5617 }
5618
5619 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5620 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5621 return;
5622 }
5623
5624 if (operands[1] == constm1_rtx)
5625 {
5626 /* For -1 << N, we can avoid the shld instruction, because we
5627 know that we're shifting 0...31/63 ones into a -1. */
5628 emit_move_insn (low[0], constm1_rtx);
5629 if (optimize_insn_for_size_p ())
5630 emit_move_insn (high[0], low[0]);
5631 else
5632 emit_move_insn (high[0], constm1_rtx);
5633 }
5634 else
5635 {
5636 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5637
5638 if (!rtx_equal_p (operands[0], operands[1]))
5639 emit_move_insn (operands[0], operands[1]);
5640
5641 split_double_mode (mode, operands, 1, low, high);
5642 emit_insn (gen_shld (high[0], low[0], operands[2]));
5643 }
5644
5645 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5646
5647 if (TARGET_CMOVE && scratch)
5648 {
5649 ix86_expand_clear (scratch);
5650 emit_insn (gen_x86_shift_adj_1
5651 (half_mode, high[0], low[0], operands[2], scratch));
5652 }
5653 else
5654 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5655 }
5656
5657 void
5658 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5659 {
5660 rtx (*gen_ashr3)(rtx, rtx, rtx)
5661 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5662 rtx (*gen_shrd)(rtx, rtx, rtx);
5663 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5664
5665 rtx low[2], high[2];
5666 int count;
5667
5668 if (CONST_INT_P (operands[2]))
5669 {
5670 split_double_mode (mode, operands, 2, low, high);
5671 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5672
5673 if (count == GET_MODE_BITSIZE (mode) - 1)
5674 {
5675 emit_move_insn (high[0], high[1]);
5676 emit_insn (gen_ashr3 (high[0], high[0],
5677 GEN_INT (half_width - 1)));
5678 emit_move_insn (low[0], high[0]);
5679
5680 }
5681 else if (count >= half_width)
5682 {
5683 emit_move_insn (low[0], high[1]);
5684 emit_move_insn (high[0], low[0]);
5685 emit_insn (gen_ashr3 (high[0], high[0],
5686 GEN_INT (half_width - 1)));
5687
5688 if (count > half_width)
5689 emit_insn (gen_ashr3 (low[0], low[0],
5690 GEN_INT (count - half_width)));
5691 }
5692 else
5693 {
5694 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5695
5696 if (!rtx_equal_p (operands[0], operands[1]))
5697 emit_move_insn (operands[0], operands[1]);
5698
5699 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5700 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5701 }
5702 }
5703 else
5704 {
5705 machine_mode half_mode;
5706
5707 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5708
5709 if (!rtx_equal_p (operands[0], operands[1]))
5710 emit_move_insn (operands[0], operands[1]);
5711
5712 split_double_mode (mode, operands, 1, low, high);
5713 half_mode = mode == DImode ? SImode : DImode;
5714
5715 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5716 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5717
5718 if (TARGET_CMOVE && scratch)
5719 {
5720 emit_move_insn (scratch, high[0]);
5721 emit_insn (gen_ashr3 (scratch, scratch,
5722 GEN_INT (half_width - 1)));
5723 emit_insn (gen_x86_shift_adj_1
5724 (half_mode, low[0], high[0], operands[2], scratch));
5725 }
5726 else
5727 emit_insn (gen_x86_shift_adj_3
5728 (half_mode, low[0], high[0], operands[2]));
5729 }
5730 }
5731
5732 void
5733 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5734 {
5735 rtx (*gen_lshr3)(rtx, rtx, rtx)
5736 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5737 rtx (*gen_shrd)(rtx, rtx, rtx);
5738 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5739
5740 rtx low[2], high[2];
5741 int count;
5742
5743 if (CONST_INT_P (operands[2]))
5744 {
5745 split_double_mode (mode, operands, 2, low, high);
5746 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5747
5748 if (count >= half_width)
5749 {
5750 emit_move_insn (low[0], high[1]);
5751 ix86_expand_clear (high[0]);
5752
5753 if (count > half_width)
5754 emit_insn (gen_lshr3 (low[0], low[0],
5755 GEN_INT (count - half_width)));
5756 }
5757 else
5758 {
5759 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5760
5761 if (!rtx_equal_p (operands[0], operands[1]))
5762 emit_move_insn (operands[0], operands[1]);
5763
5764 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5765 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5766 }
5767 }
5768 else
5769 {
5770 machine_mode half_mode;
5771
5772 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5773
5774 if (!rtx_equal_p (operands[0], operands[1]))
5775 emit_move_insn (operands[0], operands[1]);
5776
5777 split_double_mode (mode, operands, 1, low, high);
5778 half_mode = mode == DImode ? SImode : DImode;
5779
5780 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5781 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5782
5783 if (TARGET_CMOVE && scratch)
5784 {
5785 ix86_expand_clear (scratch);
5786 emit_insn (gen_x86_shift_adj_1
5787 (half_mode, low[0], high[0], operands[2], scratch));
5788 }
5789 else
5790 emit_insn (gen_x86_shift_adj_2
5791 (half_mode, low[0], high[0], operands[2]));
5792 }
5793 }
5794
5795 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5796 DImode for constant loop counts. */
5797
5798 static machine_mode
5799 counter_mode (rtx count_exp)
5800 {
5801 if (GET_MODE (count_exp) != VOIDmode)
5802 return GET_MODE (count_exp);
5803 if (!CONST_INT_P (count_exp))
5804 return Pmode;
5805 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5806 return DImode;
5807 return SImode;
5808 }
5809
5810 /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
5811 to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
5812 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
5813 memory by VALUE (supposed to be in MODE).
5814
5815 The size is rounded down to whole number of chunk size moved at once.
5816 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
5817
5818
5819 static void
5820 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5821 rtx destptr, rtx srcptr, rtx value,
5822 rtx count, machine_mode mode, int unroll,
5823 int expected_size, bool issetmem)
5824 {
5825 rtx_code_label *out_label, *top_label;
5826 rtx iter, tmp;
5827 machine_mode iter_mode = counter_mode (count);
5828 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5829 rtx piece_size = GEN_INT (piece_size_n);
5830 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5831 rtx size;
5832 int i;
5833
5834 top_label = gen_label_rtx ();
5835 out_label = gen_label_rtx ();
5836 iter = gen_reg_rtx (iter_mode);
5837
5838 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5839 NULL, 1, OPTAB_DIRECT);
5840 /* Those two should combine. */
5841 if (piece_size == const1_rtx)
5842 {
5843 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5844 true, out_label);
5845 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5846 }
5847 emit_move_insn (iter, const0_rtx);
5848
5849 emit_label (top_label);
5850
5851 tmp = convert_modes (Pmode, iter_mode, iter, true);
5852
5853 /* This assert could be relaxed - in this case we'll need to compute
5854 smallest power of two, containing in PIECE_SIZE_N and pass it to
5855 offset_address. */
5856 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5857 destmem = offset_address (destmem, tmp, piece_size_n);
5858 destmem = adjust_address (destmem, mode, 0);
5859
5860 if (!issetmem)
5861 {
5862 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5863 srcmem = adjust_address (srcmem, mode, 0);
5864
5865 /* When unrolling for chips that reorder memory reads and writes,
5866 we can save registers by using single temporary.
5867 Also using 4 temporaries is overkill in 32bit mode. */
5868 if (!TARGET_64BIT && 0)
5869 {
5870 for (i = 0; i < unroll; i++)
5871 {
5872 if (i)
5873 {
5874 destmem = adjust_address (copy_rtx (destmem), mode,
5875 GET_MODE_SIZE (mode));
5876 srcmem = adjust_address (copy_rtx (srcmem), mode,
5877 GET_MODE_SIZE (mode));
5878 }
5879 emit_move_insn (destmem, srcmem);
5880 }
5881 }
5882 else
5883 {
5884 rtx tmpreg[4];
5885 gcc_assert (unroll <= 4);
5886 for (i = 0; i < unroll; i++)
5887 {
5888 tmpreg[i] = gen_reg_rtx (mode);
5889 if (i)
5890 srcmem = adjust_address (copy_rtx (srcmem), mode,
5891 GET_MODE_SIZE (mode));
5892 emit_move_insn (tmpreg[i], srcmem);
5893 }
5894 for (i = 0; i < unroll; i++)
5895 {
5896 if (i)
5897 destmem = adjust_address (copy_rtx (destmem), mode,
5898 GET_MODE_SIZE (mode));
5899 emit_move_insn (destmem, tmpreg[i]);
5900 }
5901 }
5902 }
5903 else
5904 for (i = 0; i < unroll; i++)
5905 {
5906 if (i)
5907 destmem = adjust_address (copy_rtx (destmem), mode,
5908 GET_MODE_SIZE (mode));
5909 emit_move_insn (destmem, value);
5910 }
5911
5912 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5913 true, OPTAB_LIB_WIDEN);
5914 if (tmp != iter)
5915 emit_move_insn (iter, tmp);
5916
5917 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5918 true, top_label);
5919 if (expected_size != -1)
5920 {
5921 expected_size /= GET_MODE_SIZE (mode) * unroll;
5922 if (expected_size == 0)
5923 predict_jump (0);
5924 else if (expected_size > REG_BR_PROB_BASE)
5925 predict_jump (REG_BR_PROB_BASE - 1);
5926 else
5927 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5928 / expected_size);
5929 }
5930 else
5931 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5932 iter = ix86_zero_extend_to_Pmode (iter);
5933 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5934 true, OPTAB_LIB_WIDEN);
5935 if (tmp != destptr)
5936 emit_move_insn (destptr, tmp);
5937 if (!issetmem)
5938 {
5939 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5940 true, OPTAB_LIB_WIDEN);
5941 if (tmp != srcptr)
5942 emit_move_insn (srcptr, tmp);
5943 }
5944 emit_label (out_label);
5945 }
5946
5947 /* Divide COUNTREG by SCALE. */
5948 static rtx
5949 scale_counter (rtx countreg, int scale)
5950 {
5951 rtx sc;
5952
5953 if (scale == 1)
5954 return countreg;
5955 if (CONST_INT_P (countreg))
5956 return GEN_INT (INTVAL (countreg) / scale);
5957 gcc_assert (REG_P (countreg));
5958
5959 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5960 GEN_INT (exact_log2 (scale)),
5961 NULL, 1, OPTAB_DIRECT);
5962 return sc;
5963 }
5964
5965 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5966 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5967 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5968 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
5969 ORIG_VALUE is the original value passed to memset to fill the memory with.
5970 Other arguments have same meaning as for previous function. */
5971
5972 static void
5973 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5974 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5975 rtx count,
5976 machine_mode mode, bool issetmem)
5977 {
5978 rtx destexp;
5979 rtx srcexp;
5980 rtx countreg;
5981 HOST_WIDE_INT rounded_count;
5982
5983 /* If possible, it is shorter to use rep movs.
5984 TODO: Maybe it is better to move this logic to decide_alg. */
5985 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5986 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
5987 && (!issetmem || orig_value == const0_rtx))
5988 mode = SImode;
5989
5990 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5991 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5992
5993 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5994 GET_MODE_SIZE (mode)));
5995 if (mode != QImode)
5996 {
5997 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5998 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5999 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
6000 }
6001 else
6002 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
6003 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
6004 {
6005 rounded_count
6006 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6007 destmem = shallow_copy_rtx (destmem);
6008 set_mem_size (destmem, rounded_count);
6009 }
6010 else if (MEM_SIZE_KNOWN_P (destmem))
6011 clear_mem_size (destmem);
6012
6013 if (issetmem)
6014 {
6015 value = force_reg (mode, gen_lowpart (mode, value));
6016 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6017 }
6018 else
6019 {
6020 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6021 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6022 if (mode != QImode)
6023 {
6024 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6025 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6026 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6027 }
6028 else
6029 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6030 if (CONST_INT_P (count))
6031 {
6032 rounded_count
6033 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6034 srcmem = shallow_copy_rtx (srcmem);
6035 set_mem_size (srcmem, rounded_count);
6036 }
6037 else
6038 {
6039 if (MEM_SIZE_KNOWN_P (srcmem))
6040 clear_mem_size (srcmem);
6041 }
6042 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6043 destexp, srcexp));
6044 }
6045 }
6046
6047 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6048 DESTMEM.
6049 SRC is passed by pointer to be updated on return.
6050 Return value is updated DST. */
6051 static rtx
6052 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6053 HOST_WIDE_INT size_to_move)
6054 {
6055 rtx dst = destmem, src = *srcmem, tempreg;
6056 enum insn_code code;
6057 machine_mode move_mode;
6058 int piece_size, i;
6059
6060 /* Find the widest mode in which we could perform moves.
6061 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6062 it until move of such size is supported. */
6063 piece_size = 1 << floor_log2 (size_to_move);
6064 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6065 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6066 {
6067 gcc_assert (piece_size > 1);
6068 piece_size >>= 1;
6069 }
6070
6071 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6072 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6073 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6074 {
6075 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6076 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6077 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6078 {
6079 move_mode = word_mode;
6080 piece_size = GET_MODE_SIZE (move_mode);
6081 code = optab_handler (mov_optab, move_mode);
6082 }
6083 }
6084 gcc_assert (code != CODE_FOR_nothing);
6085
6086 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6087 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6088
6089 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6090 gcc_assert (size_to_move % piece_size == 0);
6091
6092 for (i = 0; i < size_to_move; i += piece_size)
6093 {
6094 /* We move from memory to memory, so we'll need to do it via
6095 a temporary register. */
6096 tempreg = gen_reg_rtx (move_mode);
6097 emit_insn (GEN_FCN (code) (tempreg, src));
6098 emit_insn (GEN_FCN (code) (dst, tempreg));
6099
6100 emit_move_insn (destptr,
6101 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6102 emit_move_insn (srcptr,
6103 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6104
6105 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6106 piece_size);
6107 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6108 piece_size);
6109 }
6110
6111 /* Update DST and SRC rtx. */
6112 *srcmem = src;
6113 return dst;
6114 }
6115
6116 /* Helper function for the string operations below. Dest VARIABLE whether
6117 it is aligned to VALUE bytes. If true, jump to the label. */
6118
6119 static rtx_code_label *
6120 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6121 {
6122 rtx_code_label *label = gen_label_rtx ();
6123 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6124 if (GET_MODE (variable) == DImode)
6125 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6126 else
6127 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6128 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6129 1, label);
6130 if (epilogue)
6131 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6132 else
6133 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6134 return label;
6135 }
6136
6137
6138 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6139
6140 static void
6141 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6142 rtx destptr, rtx srcptr, rtx count, int max_size)
6143 {
6144 rtx src, dest;
6145 if (CONST_INT_P (count))
6146 {
6147 HOST_WIDE_INT countval = INTVAL (count);
6148 HOST_WIDE_INT epilogue_size = countval % max_size;
6149 int i;
6150
6151 /* For now MAX_SIZE should be a power of 2. This assert could be
6152 relaxed, but it'll require a bit more complicated epilogue
6153 expanding. */
6154 gcc_assert ((max_size & (max_size - 1)) == 0);
6155 for (i = max_size; i >= 1; i >>= 1)
6156 {
6157 if (epilogue_size & i)
6158 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6159 }
6160 return;
6161 }
6162 if (max_size > 8)
6163 {
6164 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6165 count, 1, OPTAB_DIRECT);
6166 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6167 count, QImode, 1, 4, false);
6168 return;
6169 }
6170
6171 /* When there are stringops, we can cheaply increase dest and src pointers.
6172 Otherwise we save code size by maintaining offset (zero is readily
6173 available from preceding rep operation) and using x86 addressing modes.
6174 */
6175 if (TARGET_SINGLE_STRINGOP)
6176 {
6177 if (max_size > 4)
6178 {
6179 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6180 src = change_address (srcmem, SImode, srcptr);
6181 dest = change_address (destmem, SImode, destptr);
6182 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6183 emit_label (label);
6184 LABEL_NUSES (label) = 1;
6185 }
6186 if (max_size > 2)
6187 {
6188 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6189 src = change_address (srcmem, HImode, srcptr);
6190 dest = change_address (destmem, HImode, destptr);
6191 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6192 emit_label (label);
6193 LABEL_NUSES (label) = 1;
6194 }
6195 if (max_size > 1)
6196 {
6197 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6198 src = change_address (srcmem, QImode, srcptr);
6199 dest = change_address (destmem, QImode, destptr);
6200 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6201 emit_label (label);
6202 LABEL_NUSES (label) = 1;
6203 }
6204 }
6205 else
6206 {
6207 rtx offset = force_reg (Pmode, const0_rtx);
6208 rtx tmp;
6209
6210 if (max_size > 4)
6211 {
6212 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6213 src = change_address (srcmem, SImode, srcptr);
6214 dest = change_address (destmem, SImode, destptr);
6215 emit_move_insn (dest, src);
6216 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6217 true, OPTAB_LIB_WIDEN);
6218 if (tmp != offset)
6219 emit_move_insn (offset, tmp);
6220 emit_label (label);
6221 LABEL_NUSES (label) = 1;
6222 }
6223 if (max_size > 2)
6224 {
6225 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6226 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6227 src = change_address (srcmem, HImode, tmp);
6228 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6229 dest = change_address (destmem, HImode, tmp);
6230 emit_move_insn (dest, src);
6231 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6232 true, OPTAB_LIB_WIDEN);
6233 if (tmp != offset)
6234 emit_move_insn (offset, tmp);
6235 emit_label (label);
6236 LABEL_NUSES (label) = 1;
6237 }
6238 if (max_size > 1)
6239 {
6240 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6241 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6242 src = change_address (srcmem, QImode, tmp);
6243 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6244 dest = change_address (destmem, QImode, tmp);
6245 emit_move_insn (dest, src);
6246 emit_label (label);
6247 LABEL_NUSES (label) = 1;
6248 }
6249 }
6250 }
6251
6252 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6253 with value PROMOTED_VAL.
6254 SRC is passed by pointer to be updated on return.
6255 Return value is updated DST. */
6256 static rtx
6257 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6258 HOST_WIDE_INT size_to_move)
6259 {
6260 rtx dst = destmem;
6261 enum insn_code code;
6262 machine_mode move_mode;
6263 int piece_size, i;
6264
6265 /* Find the widest mode in which we could perform moves.
6266 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6267 it until move of such size is supported. */
6268 move_mode = GET_MODE (promoted_val);
6269 if (move_mode == VOIDmode)
6270 move_mode = QImode;
6271 if (size_to_move < GET_MODE_SIZE (move_mode))
6272 {
6273 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6274 move_mode = int_mode_for_size (move_bits, 0).require ();
6275 promoted_val = gen_lowpart (move_mode, promoted_val);
6276 }
6277 piece_size = GET_MODE_SIZE (move_mode);
6278 code = optab_handler (mov_optab, move_mode);
6279 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6280
6281 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6282
6283 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6284 gcc_assert (size_to_move % piece_size == 0);
6285
6286 for (i = 0; i < size_to_move; i += piece_size)
6287 {
6288 if (piece_size <= GET_MODE_SIZE (word_mode))
6289 {
6290 emit_insn (gen_strset (destptr, dst, promoted_val));
6291 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6292 piece_size);
6293 continue;
6294 }
6295
6296 emit_insn (GEN_FCN (code) (dst, promoted_val));
6297
6298 emit_move_insn (destptr,
6299 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6300
6301 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6302 piece_size);
6303 }
6304
6305 /* Update DST rtx. */
6306 return dst;
6307 }
6308 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6309 static void
6310 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6311 rtx count, int max_size)
6312 {
6313 count = expand_simple_binop (counter_mode (count), AND, count,
6314 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6315 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6316 gen_lowpart (QImode, value), count, QImode,
6317 1, max_size / 2, true);
6318 }
6319
6320 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6321 static void
6322 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6323 rtx count, int max_size)
6324 {
6325 rtx dest;
6326
6327 if (CONST_INT_P (count))
6328 {
6329 HOST_WIDE_INT countval = INTVAL (count);
6330 HOST_WIDE_INT epilogue_size = countval % max_size;
6331 int i;
6332
6333 /* For now MAX_SIZE should be a power of 2. This assert could be
6334 relaxed, but it'll require a bit more complicated epilogue
6335 expanding. */
6336 gcc_assert ((max_size & (max_size - 1)) == 0);
6337 for (i = max_size; i >= 1; i >>= 1)
6338 {
6339 if (epilogue_size & i)
6340 {
6341 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6342 destmem = emit_memset (destmem, destptr, vec_value, i);
6343 else
6344 destmem = emit_memset (destmem, destptr, value, i);
6345 }
6346 }
6347 return;
6348 }
6349 if (max_size > 32)
6350 {
6351 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6352 return;
6353 }
6354 if (max_size > 16)
6355 {
6356 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6357 if (TARGET_64BIT)
6358 {
6359 dest = change_address (destmem, DImode, destptr);
6360 emit_insn (gen_strset (destptr, dest, value));
6361 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6362 emit_insn (gen_strset (destptr, dest, value));
6363 }
6364 else
6365 {
6366 dest = change_address (destmem, SImode, destptr);
6367 emit_insn (gen_strset (destptr, dest, value));
6368 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6369 emit_insn (gen_strset (destptr, dest, value));
6370 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6371 emit_insn (gen_strset (destptr, dest, value));
6372 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6373 emit_insn (gen_strset (destptr, dest, value));
6374 }
6375 emit_label (label);
6376 LABEL_NUSES (label) = 1;
6377 }
6378 if (max_size > 8)
6379 {
6380 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6381 if (TARGET_64BIT)
6382 {
6383 dest = change_address (destmem, DImode, destptr);
6384 emit_insn (gen_strset (destptr, dest, value));
6385 }
6386 else
6387 {
6388 dest = change_address (destmem, SImode, destptr);
6389 emit_insn (gen_strset (destptr, dest, value));
6390 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6391 emit_insn (gen_strset (destptr, dest, value));
6392 }
6393 emit_label (label);
6394 LABEL_NUSES (label) = 1;
6395 }
6396 if (max_size > 4)
6397 {
6398 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6399 dest = change_address (destmem, SImode, destptr);
6400 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6401 emit_label (label);
6402 LABEL_NUSES (label) = 1;
6403 }
6404 if (max_size > 2)
6405 {
6406 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6407 dest = change_address (destmem, HImode, destptr);
6408 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6409 emit_label (label);
6410 LABEL_NUSES (label) = 1;
6411 }
6412 if (max_size > 1)
6413 {
6414 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6415 dest = change_address (destmem, QImode, destptr);
6416 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6417 emit_label (label);
6418 LABEL_NUSES (label) = 1;
6419 }
6420 }
6421
6422 /* Adjust COUNTER by the VALUE. */
6423 static void
6424 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6425 {
6426 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6427 }
6428
6429 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6430 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6431 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6432 ignored.
6433 Return value is updated DESTMEM. */
6434
6435 static rtx
6436 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6437 rtx destptr, rtx srcptr, rtx value,
6438 rtx vec_value, rtx count, int align,
6439 int desired_alignment, bool issetmem)
6440 {
6441 int i;
6442 for (i = 1; i < desired_alignment; i <<= 1)
6443 {
6444 if (align <= i)
6445 {
6446 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6447 if (issetmem)
6448 {
6449 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6450 destmem = emit_memset (destmem, destptr, vec_value, i);
6451 else
6452 destmem = emit_memset (destmem, destptr, value, i);
6453 }
6454 else
6455 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6456 ix86_adjust_counter (count, i);
6457 emit_label (label);
6458 LABEL_NUSES (label) = 1;
6459 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6460 }
6461 }
6462 return destmem;
6463 }
6464
6465 /* Test if COUNT&SIZE is nonzero and if so, expand movme
6466 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6467 and jump to DONE_LABEL. */
6468 static void
6469 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6470 rtx destptr, rtx srcptr,
6471 rtx value, rtx vec_value,
6472 rtx count, int size,
6473 rtx done_label, bool issetmem)
6474 {
6475 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6476 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6477 rtx modesize;
6478 int n;
6479
6480 /* If we do not have vector value to copy, we must reduce size. */
6481 if (issetmem)
6482 {
6483 if (!vec_value)
6484 {
6485 if (GET_MODE (value) == VOIDmode && size > 8)
6486 mode = Pmode;
6487 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6488 mode = GET_MODE (value);
6489 }
6490 else
6491 mode = GET_MODE (vec_value), value = vec_value;
6492 }
6493 else
6494 {
6495 /* Choose appropriate vector mode. */
6496 if (size >= 32)
6497 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6498 else if (size >= 16)
6499 mode = TARGET_SSE ? V16QImode : DImode;
6500 srcmem = change_address (srcmem, mode, srcptr);
6501 }
6502 destmem = change_address (destmem, mode, destptr);
6503 modesize = GEN_INT (GET_MODE_SIZE (mode));
6504 gcc_assert (GET_MODE_SIZE (mode) <= size);
6505 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6506 {
6507 if (issetmem)
6508 emit_move_insn (destmem, gen_lowpart (mode, value));
6509 else
6510 {
6511 emit_move_insn (destmem, srcmem);
6512 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6513 }
6514 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6515 }
6516
6517 destmem = offset_address (destmem, count, 1);
6518 destmem = offset_address (destmem, GEN_INT (-2 * size),
6519 GET_MODE_SIZE (mode));
6520 if (!issetmem)
6521 {
6522 srcmem = offset_address (srcmem, count, 1);
6523 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6524 GET_MODE_SIZE (mode));
6525 }
6526 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6527 {
6528 if (issetmem)
6529 emit_move_insn (destmem, gen_lowpart (mode, value));
6530 else
6531 {
6532 emit_move_insn (destmem, srcmem);
6533 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6534 }
6535 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6536 }
6537 emit_jump_insn (gen_jump (done_label));
6538 emit_barrier ();
6539
6540 emit_label (label);
6541 LABEL_NUSES (label) = 1;
6542 }
6543
6544 /* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
6545 and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
6546 bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
6547 proceed with an loop copying SIZE bytes at once. Do moves in MODE.
6548 DONE_LABEL is a label after the whole copying sequence. The label is created
6549 on demand if *DONE_LABEL is NULL.
6550 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6551 bounds after the initial copies.
6552
6553 DESTMEM/SRCMEM are memory expressions pointing to the copies block,
6554 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
6555 we will dispatch to a library call for large blocks.
6556
6557 In pseudocode we do:
6558
6559 if (COUNT < SIZE)
6560 {
6561 Assume that SIZE is 4. Bigger sizes are handled analogously
6562 if (COUNT & 4)
6563 {
6564 copy 4 bytes from SRCPTR to DESTPTR
6565 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6566 goto done_label
6567 }
6568 if (!COUNT)
6569 goto done_label;
6570 copy 1 byte from SRCPTR to DESTPTR
6571 if (COUNT & 2)
6572 {
6573 copy 2 bytes from SRCPTR to DESTPTR
6574 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6575 }
6576 }
6577 else
6578 {
6579 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6580 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6581
6582 OLD_DESPTR = DESTPTR;
6583 Align DESTPTR up to DESIRED_ALIGN
6584 SRCPTR += DESTPTR - OLD_DESTPTR
6585 COUNT -= DEST_PTR - OLD_DESTPTR
6586 if (DYNAMIC_CHECK)
6587 Round COUNT down to multiple of SIZE
6588 << optional caller supplied zero size guard is here >>
6589 << optional caller supplied dynamic check is here >>
6590 << caller supplied main copy loop is here >>
6591 }
6592 done_label:
6593 */
6594 static void
6595 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6596 rtx *destptr, rtx *srcptr,
6597 machine_mode mode,
6598 rtx value, rtx vec_value,
6599 rtx *count,
6600 rtx_code_label **done_label,
6601 int size,
6602 int desired_align,
6603 int align,
6604 unsigned HOST_WIDE_INT *min_size,
6605 bool dynamic_check,
6606 bool issetmem)
6607 {
6608 rtx_code_label *loop_label = NULL, *label;
6609 int n;
6610 rtx modesize;
6611 int prolog_size = 0;
6612 rtx mode_value;
6613
6614 /* Chose proper value to copy. */
6615 if (issetmem && VECTOR_MODE_P (mode))
6616 mode_value = vec_value;
6617 else
6618 mode_value = value;
6619 gcc_assert (GET_MODE_SIZE (mode) <= size);
6620
6621 /* See if block is big or small, handle small blocks. */
6622 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6623 {
6624 int size2 = size;
6625 loop_label = gen_label_rtx ();
6626
6627 if (!*done_label)
6628 *done_label = gen_label_rtx ();
6629
6630 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6631 1, loop_label);
6632 size2 >>= 1;
6633
6634 /* Handle sizes > 3. */
6635 for (;size2 > 2; size2 >>= 1)
6636 expand_small_cpymem_or_setmem (destmem, srcmem,
6637 *destptr, *srcptr,
6638 value, vec_value,
6639 *count,
6640 size2, *done_label, issetmem);
6641 /* Nothing to copy? Jump to DONE_LABEL if so */
6642 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6643 1, *done_label);
6644
6645 /* Do a byte copy. */
6646 destmem = change_address (destmem, QImode, *destptr);
6647 if (issetmem)
6648 emit_move_insn (destmem, gen_lowpart (QImode, value));
6649 else
6650 {
6651 srcmem = change_address (srcmem, QImode, *srcptr);
6652 emit_move_insn (destmem, srcmem);
6653 }
6654
6655 /* Handle sizes 2 and 3. */
6656 label = ix86_expand_aligntest (*count, 2, false);
6657 destmem = change_address (destmem, HImode, *destptr);
6658 destmem = offset_address (destmem, *count, 1);
6659 destmem = offset_address (destmem, GEN_INT (-2), 2);
6660 if (issetmem)
6661 emit_move_insn (destmem, gen_lowpart (HImode, value));
6662 else
6663 {
6664 srcmem = change_address (srcmem, HImode, *srcptr);
6665 srcmem = offset_address (srcmem, *count, 1);
6666 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6667 emit_move_insn (destmem, srcmem);
6668 }
6669
6670 emit_label (label);
6671 LABEL_NUSES (label) = 1;
6672 emit_jump_insn (gen_jump (*done_label));
6673 emit_barrier ();
6674 }
6675 else
6676 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6677 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6678
6679 /* Start memcpy for COUNT >= SIZE. */
6680 if (loop_label)
6681 {
6682 emit_label (loop_label);
6683 LABEL_NUSES (loop_label) = 1;
6684 }
6685
6686 /* Copy first desired_align bytes. */
6687 if (!issetmem)
6688 srcmem = change_address (srcmem, mode, *srcptr);
6689 destmem = change_address (destmem, mode, *destptr);
6690 modesize = GEN_INT (GET_MODE_SIZE (mode));
6691 for (n = 0; prolog_size < desired_align - align; n++)
6692 {
6693 if (issetmem)
6694 emit_move_insn (destmem, mode_value);
6695 else
6696 {
6697 emit_move_insn (destmem, srcmem);
6698 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6699 }
6700 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6701 prolog_size += GET_MODE_SIZE (mode);
6702 }
6703
6704
6705 /* Copy last SIZE bytes. */
6706 destmem = offset_address (destmem, *count, 1);
6707 destmem = offset_address (destmem,
6708 GEN_INT (-size - prolog_size),
6709 1);
6710 if (issetmem)
6711 emit_move_insn (destmem, mode_value);
6712 else
6713 {
6714 srcmem = offset_address (srcmem, *count, 1);
6715 srcmem = offset_address (srcmem,
6716 GEN_INT (-size - prolog_size),
6717 1);
6718 emit_move_insn (destmem, srcmem);
6719 }
6720 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6721 {
6722 destmem = offset_address (destmem, modesize, 1);
6723 if (issetmem)
6724 emit_move_insn (destmem, mode_value);
6725 else
6726 {
6727 srcmem = offset_address (srcmem, modesize, 1);
6728 emit_move_insn (destmem, srcmem);
6729 }
6730 }
6731
6732 /* Align destination. */
6733 if (desired_align > 1 && desired_align > align)
6734 {
6735 rtx saveddest = *destptr;
6736
6737 gcc_assert (desired_align <= size);
6738 /* Align destptr up, place it to new register. */
6739 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6740 GEN_INT (prolog_size),
6741 NULL_RTX, 1, OPTAB_DIRECT);
6742 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6743 REG_POINTER (*destptr) = 1;
6744 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6745 GEN_INT (-desired_align),
6746 *destptr, 1, OPTAB_DIRECT);
6747 /* See how many bytes we skipped. */
6748 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6749 *destptr,
6750 saveddest, 1, OPTAB_DIRECT);
6751 /* Adjust srcptr and count. */
6752 if (!issetmem)
6753 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6754 saveddest, *srcptr, 1, OPTAB_DIRECT);
6755 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6756 saveddest, *count, 1, OPTAB_DIRECT);
6757 /* We copied at most size + prolog_size. */
6758 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6759 *min_size
6760 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6761 else
6762 *min_size = 0;
6763
6764 /* Our loops always round down the block size, but for dispatch to
6765 library we need precise value. */
6766 if (dynamic_check)
6767 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6768 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6769 }
6770 else
6771 {
6772 gcc_assert (prolog_size == 0);
6773 /* Decrease count, so we won't end up copying last word twice. */
6774 if (!CONST_INT_P (*count))
6775 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6776 constm1_rtx, *count, 1, OPTAB_DIRECT);
6777 else
6778 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6779 (unsigned HOST_WIDE_INT)size));
6780 if (*min_size)
6781 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6782 }
6783 }
6784
6785
6786 /* This function is like the previous one, except here we know how many bytes
6787 need to be copied. That allows us to update alignment not only of DST, which
6788 is returned, but also of SRC, which is passed as a pointer for that
6789 reason. */
6790 static rtx
6791 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6792 rtx srcreg, rtx value, rtx vec_value,
6793 int desired_align, int align_bytes,
6794 bool issetmem)
6795 {
6796 rtx src = NULL;
6797 rtx orig_dst = dst;
6798 rtx orig_src = NULL;
6799 int piece_size = 1;
6800 int copied_bytes = 0;
6801
6802 if (!issetmem)
6803 {
6804 gcc_assert (srcp != NULL);
6805 src = *srcp;
6806 orig_src = src;
6807 }
6808
6809 for (piece_size = 1;
6810 piece_size <= desired_align && copied_bytes < align_bytes;
6811 piece_size <<= 1)
6812 {
6813 if (align_bytes & piece_size)
6814 {
6815 if (issetmem)
6816 {
6817 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6818 dst = emit_memset (dst, destreg, vec_value, piece_size);
6819 else
6820 dst = emit_memset (dst, destreg, value, piece_size);
6821 }
6822 else
6823 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6824 copied_bytes += piece_size;
6825 }
6826 }
6827 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6828 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6829 if (MEM_SIZE_KNOWN_P (orig_dst))
6830 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6831
6832 if (!issetmem)
6833 {
6834 int src_align_bytes = get_mem_align_offset (src, desired_align
6835 * BITS_PER_UNIT);
6836 if (src_align_bytes >= 0)
6837 src_align_bytes = desired_align - src_align_bytes;
6838 if (src_align_bytes >= 0)
6839 {
6840 unsigned int src_align;
6841 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6842 {
6843 if ((src_align_bytes & (src_align - 1))
6844 == (align_bytes & (src_align - 1)))
6845 break;
6846 }
6847 if (src_align > (unsigned int) desired_align)
6848 src_align = desired_align;
6849 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6850 set_mem_align (src, src_align * BITS_PER_UNIT);
6851 }
6852 if (MEM_SIZE_KNOWN_P (orig_src))
6853 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6854 *srcp = src;
6855 }
6856
6857 return dst;
6858 }
6859
6860 /* Return true if ALG can be used in current context.
6861 Assume we expand memset if MEMSET is true. */
6862 static bool
6863 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6864 {
6865 if (alg == no_stringop)
6866 return false;
6867 if (alg == vector_loop)
6868 return TARGET_SSE || TARGET_AVX;
6869 /* Algorithms using the rep prefix want at least edi and ecx;
6870 additionally, memset wants eax and memcpy wants esi. Don't
6871 consider such algorithms if the user has appropriated those
6872 registers for their own purposes, or if we have a non-default
6873 address space, since some string insns cannot override the segment. */
6874 if (alg == rep_prefix_1_byte
6875 || alg == rep_prefix_4_byte
6876 || alg == rep_prefix_8_byte)
6877 {
6878 if (have_as)
6879 return false;
6880 if (fixed_regs[CX_REG]
6881 || fixed_regs[DI_REG]
6882 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6883 return false;
6884 }
6885 return true;
6886 }
6887
6888 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6889 static enum stringop_alg
6890 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6891 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6892 bool memset, bool zero_memset, bool have_as,
6893 int *dynamic_check, bool *noalign, bool recur)
6894 {
6895 const struct stringop_algs *algs;
6896 bool optimize_for_speed;
6897 int max = 0;
6898 const struct processor_costs *cost;
6899 int i;
6900 bool any_alg_usable_p = false;
6901
6902 *noalign = false;
6903 *dynamic_check = -1;
6904
6905 /* Even if the string operation call is cold, we still might spend a lot
6906 of time processing large blocks. */
6907 if (optimize_function_for_size_p (cfun)
6908 || (optimize_insn_for_size_p ()
6909 && (max_size < 256
6910 || (expected_size != -1 && expected_size < 256))))
6911 optimize_for_speed = false;
6912 else
6913 optimize_for_speed = true;
6914
6915 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6916 if (memset)
6917 algs = &cost->memset[TARGET_64BIT != 0];
6918 else
6919 algs = &cost->memcpy[TARGET_64BIT != 0];
6920
6921 /* See maximal size for user defined algorithm. */
6922 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6923 {
6924 enum stringop_alg candidate = algs->size[i].alg;
6925 bool usable = alg_usable_p (candidate, memset, have_as);
6926 any_alg_usable_p |= usable;
6927
6928 if (candidate != libcall && candidate && usable)
6929 max = algs->size[i].max;
6930 }
6931
6932 /* If expected size is not known but max size is small enough
6933 so inline version is a win, set expected size into
6934 the range. */
6935 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6936 && expected_size == -1)
6937 expected_size = min_size / 2 + max_size / 2;
6938
6939 /* If user specified the algorithm, honor it if possible. */
6940 if (ix86_stringop_alg != no_stringop
6941 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6942 return ix86_stringop_alg;
6943 /* rep; movq or rep; movl is the smallest variant. */
6944 else if (!optimize_for_speed)
6945 {
6946 *noalign = true;
6947 if (!count || (count & 3) || (memset && !zero_memset))
6948 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6949 ? rep_prefix_1_byte : loop_1_byte;
6950 else
6951 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6952 ? rep_prefix_4_byte : loop;
6953 }
6954 /* Very tiny blocks are best handled via the loop, REP is expensive to
6955 setup. */
6956 else if (expected_size != -1 && expected_size < 4)
6957 return loop_1_byte;
6958 else if (expected_size != -1)
6959 {
6960 enum stringop_alg alg = libcall;
6961 bool alg_noalign = false;
6962 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6963 {
6964 /* We get here if the algorithms that were not libcall-based
6965 were rep-prefix based and we are unable to use rep prefixes
6966 based on global register usage. Break out of the loop and
6967 use the heuristic below. */
6968 if (algs->size[i].max == 0)
6969 break;
6970 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6971 {
6972 enum stringop_alg candidate = algs->size[i].alg;
6973
6974 if (candidate != libcall
6975 && alg_usable_p (candidate, memset, have_as))
6976 {
6977 alg = candidate;
6978 alg_noalign = algs->size[i].noalign;
6979 }
6980 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6981 last non-libcall inline algorithm. */
6982 if (TARGET_INLINE_ALL_STRINGOPS)
6983 {
6984 /* When the current size is best to be copied by a libcall,
6985 but we are still forced to inline, run the heuristic below
6986 that will pick code for medium sized blocks. */
6987 if (alg != libcall)
6988 {
6989 *noalign = alg_noalign;
6990 return alg;
6991 }
6992 else if (!any_alg_usable_p)
6993 break;
6994 }
6995 else if (alg_usable_p (candidate, memset, have_as)
6996 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
6997 && candidate == rep_prefix_1_byte
6998 /* NB: If min_size != max_size, size is
6999 unknown. */
7000 && min_size != max_size))
7001 {
7002 *noalign = algs->size[i].noalign;
7003 return candidate;
7004 }
7005 }
7006 }
7007 }
7008 /* When asked to inline the call anyway, try to pick meaningful choice.
7009 We look for maximal size of block that is faster to copy by hand and
7010 take blocks of at most of that size guessing that average size will
7011 be roughly half of the block.
7012
7013 If this turns out to be bad, we might simply specify the preferred
7014 choice in ix86_costs. */
7015 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7016 && (algs->unknown_size == libcall
7017 || !alg_usable_p (algs->unknown_size, memset, have_as)))
7018 {
7019 enum stringop_alg alg;
7020 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7021
7022 /* If there aren't any usable algorithms or if recursing already,
7023 then recursing on smaller sizes or same size isn't going to
7024 find anything. Just return the simple byte-at-a-time copy loop. */
7025 if (!any_alg_usable_p || recur)
7026 {
7027 /* Pick something reasonable. */
7028 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7029 *dynamic_check = 128;
7030 return loop_1_byte;
7031 }
7032 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7033 zero_memset, have_as, dynamic_check, noalign, true);
7034 gcc_assert (*dynamic_check == -1);
7035 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7036 *dynamic_check = max;
7037 else
7038 gcc_assert (alg != libcall);
7039 return alg;
7040 }
7041 return (alg_usable_p (algs->unknown_size, memset, have_as)
7042 ? algs->unknown_size : libcall);
7043 }
7044
7045 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7046 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7047 static int
7048 decide_alignment (int align,
7049 enum stringop_alg alg,
7050 int expected_size,
7051 machine_mode move_mode)
7052 {
7053 int desired_align = 0;
7054
7055 gcc_assert (alg != no_stringop);
7056
7057 if (alg == libcall)
7058 return 0;
7059 if (move_mode == VOIDmode)
7060 return 0;
7061
7062 desired_align = GET_MODE_SIZE (move_mode);
7063 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
7064 copying whole cacheline at once. */
7065 if (TARGET_CPU_P (PENTIUMPRO)
7066 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7067 desired_align = 8;
7068
7069 if (optimize_size)
7070 desired_align = 1;
7071 if (desired_align < align)
7072 desired_align = align;
7073 if (expected_size != -1 && expected_size < 4)
7074 desired_align = align;
7075
7076 return desired_align;
7077 }
7078
7079
7080 /* Helper function for memcpy. For QImode value 0xXY produce
7081 0xXYXYXYXY of wide specified by MODE. This is essentially
7082 a * 0x10101010, but we can do slightly better than
7083 synth_mult by unwinding the sequence by hand on CPUs with
7084 slow multiply. */
7085 static rtx
7086 promote_duplicated_reg (machine_mode mode, rtx val)
7087 {
7088 machine_mode valmode = GET_MODE (val);
7089 rtx tmp;
7090 int nops = mode == DImode ? 3 : 2;
7091
7092 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7093 if (val == const0_rtx)
7094 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7095 if (CONST_INT_P (val))
7096 {
7097 HOST_WIDE_INT v = INTVAL (val) & 255;
7098
7099 v |= v << 8;
7100 v |= v << 16;
7101 if (mode == DImode)
7102 v |= (v << 16) << 16;
7103 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7104 }
7105
7106 if (valmode == VOIDmode)
7107 valmode = QImode;
7108 if (valmode != QImode)
7109 val = gen_lowpart (QImode, val);
7110 if (mode == QImode)
7111 return val;
7112 if (!TARGET_PARTIAL_REG_STALL)
7113 nops--;
7114 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7115 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7116 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7117 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7118 {
7119 rtx reg = convert_modes (mode, QImode, val, true);
7120 tmp = promote_duplicated_reg (mode, const1_rtx);
7121 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7122 OPTAB_DIRECT);
7123 }
7124 else
7125 {
7126 rtx reg = convert_modes (mode, QImode, val, true);
7127
7128 if (!TARGET_PARTIAL_REG_STALL)
7129 emit_insn (gen_insv_1 (mode, reg, reg));
7130 else
7131 {
7132 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7133 NULL, 1, OPTAB_DIRECT);
7134 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7135 OPTAB_DIRECT);
7136 }
7137 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7138 NULL, 1, OPTAB_DIRECT);
7139 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7140 if (mode == SImode)
7141 return reg;
7142 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7143 NULL, 1, OPTAB_DIRECT);
7144 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7145 return reg;
7146 }
7147 }
7148
7149 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7150 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7151 alignment from ALIGN to DESIRED_ALIGN. */
7152 static rtx
7153 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7154 int align)
7155 {
7156 rtx promoted_val;
7157
7158 if (TARGET_64BIT
7159 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7160 promoted_val = promote_duplicated_reg (DImode, val);
7161 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7162 promoted_val = promote_duplicated_reg (SImode, val);
7163 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7164 promoted_val = promote_duplicated_reg (HImode, val);
7165 else
7166 promoted_val = val;
7167
7168 return promoted_val;
7169 }
7170
7171 /* Copy the address to a Pmode register. This is used for x32 to
7172 truncate DImode TLS address to a SImode register. */
7173
7174 static rtx
7175 ix86_copy_addr_to_reg (rtx addr)
7176 {
7177 rtx reg;
7178 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7179 {
7180 reg = copy_addr_to_reg (addr);
7181 REG_POINTER (reg) = 1;
7182 return reg;
7183 }
7184 else
7185 {
7186 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7187 reg = copy_to_mode_reg (DImode, addr);
7188 REG_POINTER (reg) = 1;
7189 return gen_rtx_SUBREG (SImode, reg, 0);
7190 }
7191 }
7192
7193 /* Expand string move (memcpy) ot store (memset) operation. Use i386 string
7194 operations when profitable. The code depends upon architecture, block size
7195 and alignment, but always has one of the following overall structures:
7196
7197 Aligned move sequence:
7198
7199 1) Prologue guard: Conditional that jumps up to epilogues for small
7200 blocks that can be handled by epilogue alone. This is faster
7201 but also needed for correctness, since prologue assume the block
7202 is larger than the desired alignment.
7203
7204 Optional dynamic check for size and libcall for large
7205 blocks is emitted here too, with -minline-stringops-dynamically.
7206
7207 2) Prologue: copy first few bytes in order to get destination
7208 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7209 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7210 copied. We emit either a jump tree on power of two sized
7211 blocks, or a byte loop.
7212
7213 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7214 with specified algorithm.
7215
7216 4) Epilogue: code copying tail of the block that is too small to be
7217 handled by main body (or up to size guarded by prologue guard).
7218
7219 Misaligned move sequence
7220
7221 1) missaligned move prologue/epilogue containing:
7222 a) Prologue handling small memory blocks and jumping to done_label
7223 (skipped if blocks are known to be large enough)
7224 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7225 needed by single possibly misaligned move
7226 (skipped if alignment is not needed)
7227 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7228
7229 2) Zero size guard dispatching to done_label, if needed
7230
7231 3) dispatch to library call, if needed,
7232
7233 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7234 with specified algorithm. */
7235 bool
7236 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7237 rtx align_exp, rtx expected_align_exp,
7238 rtx expected_size_exp, rtx min_size_exp,
7239 rtx max_size_exp, rtx probable_max_size_exp,
7240 bool issetmem)
7241 {
7242 rtx destreg;
7243 rtx srcreg = NULL;
7244 rtx_code_label *label = NULL;
7245 rtx tmp;
7246 rtx_code_label *jump_around_label = NULL;
7247 HOST_WIDE_INT align = 1;
7248 unsigned HOST_WIDE_INT count = 0;
7249 HOST_WIDE_INT expected_size = -1;
7250 int size_needed = 0, epilogue_size_needed;
7251 int desired_align = 0, align_bytes = 0;
7252 enum stringop_alg alg;
7253 rtx promoted_val = NULL;
7254 rtx vec_promoted_val = NULL;
7255 bool force_loopy_epilogue = false;
7256 int dynamic_check;
7257 bool need_zero_guard = false;
7258 bool noalign;
7259 machine_mode move_mode = VOIDmode;
7260 machine_mode wider_mode;
7261 int unroll_factor = 1;
7262 /* TODO: Once value ranges are available, fill in proper data. */
7263 unsigned HOST_WIDE_INT min_size = 0;
7264 unsigned HOST_WIDE_INT max_size = -1;
7265 unsigned HOST_WIDE_INT probable_max_size = -1;
7266 bool misaligned_prologue_used = false;
7267 bool have_as;
7268
7269 if (CONST_INT_P (align_exp))
7270 align = INTVAL (align_exp);
7271 /* i386 can do misaligned access on reasonably increased cost. */
7272 if (CONST_INT_P (expected_align_exp)
7273 && INTVAL (expected_align_exp) > align)
7274 align = INTVAL (expected_align_exp);
7275 /* ALIGN is the minimum of destination and source alignment, but we care here
7276 just about destination alignment. */
7277 else if (!issetmem
7278 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7279 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7280
7281 if (CONST_INT_P (count_exp))
7282 {
7283 min_size = max_size = probable_max_size = count = expected_size
7284 = INTVAL (count_exp);
7285 /* When COUNT is 0, there is nothing to do. */
7286 if (!count)
7287 return true;
7288 }
7289 else
7290 {
7291 if (min_size_exp)
7292 min_size = INTVAL (min_size_exp);
7293 if (max_size_exp)
7294 max_size = INTVAL (max_size_exp);
7295 if (probable_max_size_exp)
7296 probable_max_size = INTVAL (probable_max_size_exp);
7297 if (CONST_INT_P (expected_size_exp))
7298 expected_size = INTVAL (expected_size_exp);
7299 }
7300
7301 /* Make sure we don't need to care about overflow later on. */
7302 if (count > (HOST_WIDE_INT_1U << 30))
7303 return false;
7304
7305 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7306 if (!issetmem)
7307 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7308
7309 /* Step 0: Decide on preferred algorithm, desired alignment and
7310 size of chunks to be copied by main loop. */
7311 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7312 issetmem,
7313 issetmem && val_exp == const0_rtx, have_as,
7314 &dynamic_check, &noalign, false);
7315
7316 if (dump_file)
7317 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7318 stringop_alg_names[alg]);
7319
7320 if (alg == libcall)
7321 return false;
7322 gcc_assert (alg != no_stringop);
7323
7324 /* For now vector-version of memset is generated only for memory zeroing, as
7325 creating of promoted vector value is very cheap in this case. */
7326 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7327 alg = unrolled_loop;
7328
7329 if (!count)
7330 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7331 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7332 if (!issetmem)
7333 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7334
7335 unroll_factor = 1;
7336 move_mode = word_mode;
7337 switch (alg)
7338 {
7339 case libcall:
7340 case no_stringop:
7341 case last_alg:
7342 gcc_unreachable ();
7343 case loop_1_byte:
7344 need_zero_guard = true;
7345 move_mode = QImode;
7346 break;
7347 case loop:
7348 need_zero_guard = true;
7349 break;
7350 case unrolled_loop:
7351 need_zero_guard = true;
7352 unroll_factor = (TARGET_64BIT ? 4 : 2);
7353 break;
7354 case vector_loop:
7355 need_zero_guard = true;
7356 unroll_factor = 4;
7357 /* Find the widest supported mode. */
7358 move_mode = word_mode;
7359 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7360 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7361 move_mode = wider_mode;
7362
7363 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7364 move_mode = TImode;
7365
7366 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7367 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7368 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7369 {
7370 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7371 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7372 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7373 move_mode = word_mode;
7374 }
7375 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7376 break;
7377 case rep_prefix_8_byte:
7378 move_mode = DImode;
7379 break;
7380 case rep_prefix_4_byte:
7381 move_mode = SImode;
7382 break;
7383 case rep_prefix_1_byte:
7384 move_mode = QImode;
7385 break;
7386 }
7387 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7388 epilogue_size_needed = size_needed;
7389
7390 /* If we are going to call any library calls conditionally, make sure any
7391 pending stack adjustment happen before the first conditional branch,
7392 otherwise they will be emitted before the library call only and won't
7393 happen from the other branches. */
7394 if (dynamic_check != -1)
7395 do_pending_stack_adjust ();
7396
7397 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7398 if (!TARGET_ALIGN_STRINGOPS || noalign)
7399 align = desired_align;
7400
7401 /* Step 1: Prologue guard. */
7402
7403 /* Alignment code needs count to be in register. */
7404 if (CONST_INT_P (count_exp) && desired_align > align)
7405 {
7406 if (INTVAL (count_exp) > desired_align
7407 && INTVAL (count_exp) > size_needed)
7408 {
7409 align_bytes
7410 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7411 if (align_bytes <= 0)
7412 align_bytes = 0;
7413 else
7414 align_bytes = desired_align - align_bytes;
7415 }
7416 if (align_bytes == 0)
7417 count_exp = force_reg (counter_mode (count_exp), count_exp);
7418 }
7419 gcc_assert (desired_align >= 1 && align >= 1);
7420
7421 /* Misaligned move sequences handle both prologue and epilogue at once.
7422 Default code generation results in a smaller code for large alignments
7423 and also avoids redundant job when sizes are known precisely. */
7424 misaligned_prologue_used
7425 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7426 && MAX (desired_align, epilogue_size_needed) <= 32
7427 && desired_align <= epilogue_size_needed
7428 && ((desired_align > align && !align_bytes)
7429 || (!count && epilogue_size_needed > 1)));
7430
7431 /* Do the cheap promotion to allow better CSE across the
7432 main loop and epilogue (ie one load of the big constant in the
7433 front of all code.
7434 For now the misaligned move sequences do not have fast path
7435 without broadcasting. */
7436 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7437 {
7438 if (alg == vector_loop)
7439 {
7440 gcc_assert (val_exp == const0_rtx);
7441 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7442 promoted_val = promote_duplicated_reg_to_size (val_exp,
7443 GET_MODE_SIZE (word_mode),
7444 desired_align, align);
7445 }
7446 else
7447 {
7448 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7449 desired_align, align);
7450 }
7451 }
7452 /* Misaligned move sequences handles both prologues and epilogues at once.
7453 Default code generation results in smaller code for large alignments and
7454 also avoids redundant job when sizes are known precisely. */
7455 if (misaligned_prologue_used)
7456 {
7457 /* Misaligned move prologue handled small blocks by itself. */
7458 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7459 (dst, src, &destreg, &srcreg,
7460 move_mode, promoted_val, vec_promoted_val,
7461 &count_exp,
7462 &jump_around_label,
7463 desired_align < align
7464 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7465 desired_align, align, &min_size, dynamic_check, issetmem);
7466 if (!issetmem)
7467 src = change_address (src, BLKmode, srcreg);
7468 dst = change_address (dst, BLKmode, destreg);
7469 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7470 epilogue_size_needed = 0;
7471 if (need_zero_guard
7472 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7473 {
7474 /* It is possible that we copied enough so the main loop will not
7475 execute. */
7476 gcc_assert (size_needed > 1);
7477 if (jump_around_label == NULL_RTX)
7478 jump_around_label = gen_label_rtx ();
7479 emit_cmp_and_jump_insns (count_exp,
7480 GEN_INT (size_needed),
7481 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7482 if (expected_size == -1
7483 || expected_size < (desired_align - align) / 2 + size_needed)
7484 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7485 else
7486 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7487 }
7488 }
7489 /* Ensure that alignment prologue won't copy past end of block. */
7490 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7491 {
7492 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7493 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7494 Make sure it is power of 2. */
7495 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7496
7497 /* To improve performance of small blocks, we jump around the VAL
7498 promoting mode. This mean that if the promoted VAL is not constant,
7499 we might not use it in the epilogue and have to use byte
7500 loop variant. */
7501 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7502 force_loopy_epilogue = true;
7503 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7504 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7505 {
7506 /* If main algorithm works on QImode, no epilogue is needed.
7507 For small sizes just don't align anything. */
7508 if (size_needed == 1)
7509 desired_align = align;
7510 else
7511 goto epilogue;
7512 }
7513 else if (!count
7514 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7515 {
7516 label = gen_label_rtx ();
7517 emit_cmp_and_jump_insns (count_exp,
7518 GEN_INT (epilogue_size_needed),
7519 LTU, 0, counter_mode (count_exp), 1, label);
7520 if (expected_size == -1 || expected_size < epilogue_size_needed)
7521 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7522 else
7523 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7524 }
7525 }
7526
7527 /* Emit code to decide on runtime whether library call or inline should be
7528 used. */
7529 if (dynamic_check != -1)
7530 {
7531 if (!issetmem && CONST_INT_P (count_exp))
7532 {
7533 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7534 {
7535 emit_block_copy_via_libcall (dst, src, count_exp);
7536 count_exp = const0_rtx;
7537 goto epilogue;
7538 }
7539 }
7540 else
7541 {
7542 rtx_code_label *hot_label = gen_label_rtx ();
7543 if (jump_around_label == NULL_RTX)
7544 jump_around_label = gen_label_rtx ();
7545 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7546 LEU, 0, counter_mode (count_exp),
7547 1, hot_label);
7548 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7549 if (issetmem)
7550 set_storage_via_libcall (dst, count_exp, val_exp);
7551 else
7552 emit_block_copy_via_libcall (dst, src, count_exp);
7553 emit_jump (jump_around_label);
7554 emit_label (hot_label);
7555 }
7556 }
7557
7558 /* Step 2: Alignment prologue. */
7559 /* Do the expensive promotion once we branched off the small blocks. */
7560 if (issetmem && !promoted_val)
7561 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7562 desired_align, align);
7563
7564 if (desired_align > align && !misaligned_prologue_used)
7565 {
7566 if (align_bytes == 0)
7567 {
7568 /* Except for the first move in prologue, we no longer know
7569 constant offset in aliasing info. It don't seems to worth
7570 the pain to maintain it for the first move, so throw away
7571 the info early. */
7572 dst = change_address (dst, BLKmode, destreg);
7573 if (!issetmem)
7574 src = change_address (src, BLKmode, srcreg);
7575 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7576 promoted_val, vec_promoted_val,
7577 count_exp, align, desired_align,
7578 issetmem);
7579 /* At most desired_align - align bytes are copied. */
7580 if (min_size < (unsigned)(desired_align - align))
7581 min_size = 0;
7582 else
7583 min_size -= desired_align - align;
7584 }
7585 else
7586 {
7587 /* If we know how many bytes need to be stored before dst is
7588 sufficiently aligned, maintain aliasing info accurately. */
7589 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7590 srcreg,
7591 promoted_val,
7592 vec_promoted_val,
7593 desired_align,
7594 align_bytes,
7595 issetmem);
7596
7597 count_exp = plus_constant (counter_mode (count_exp),
7598 count_exp, -align_bytes);
7599 count -= align_bytes;
7600 min_size -= align_bytes;
7601 max_size -= align_bytes;
7602 }
7603 if (need_zero_guard
7604 && min_size < (unsigned HOST_WIDE_INT) size_needed
7605 && (count < (unsigned HOST_WIDE_INT) size_needed
7606 || (align_bytes == 0
7607 && count < ((unsigned HOST_WIDE_INT) size_needed
7608 + desired_align - align))))
7609 {
7610 /* It is possible that we copied enough so the main loop will not
7611 execute. */
7612 gcc_assert (size_needed > 1);
7613 if (label == NULL_RTX)
7614 label = gen_label_rtx ();
7615 emit_cmp_and_jump_insns (count_exp,
7616 GEN_INT (size_needed),
7617 LTU, 0, counter_mode (count_exp), 1, label);
7618 if (expected_size == -1
7619 || expected_size < (desired_align - align) / 2 + size_needed)
7620 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7621 else
7622 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7623 }
7624 }
7625 if (label && size_needed == 1)
7626 {
7627 emit_label (label);
7628 LABEL_NUSES (label) = 1;
7629 label = NULL;
7630 epilogue_size_needed = 1;
7631 if (issetmem)
7632 promoted_val = val_exp;
7633 }
7634 else if (label == NULL_RTX && !misaligned_prologue_used)
7635 epilogue_size_needed = size_needed;
7636
7637 /* Step 3: Main loop. */
7638
7639 switch (alg)
7640 {
7641 case libcall:
7642 case no_stringop:
7643 case last_alg:
7644 gcc_unreachable ();
7645 case loop_1_byte:
7646 case loop:
7647 case unrolled_loop:
7648 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7649 count_exp, move_mode, unroll_factor,
7650 expected_size, issetmem);
7651 break;
7652 case vector_loop:
7653 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7654 vec_promoted_val, count_exp, move_mode,
7655 unroll_factor, expected_size, issetmem);
7656 break;
7657 case rep_prefix_8_byte:
7658 case rep_prefix_4_byte:
7659 case rep_prefix_1_byte:
7660 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7661 val_exp, count_exp, move_mode, issetmem);
7662 break;
7663 }
7664 /* Adjust properly the offset of src and dest memory for aliasing. */
7665 if (CONST_INT_P (count_exp))
7666 {
7667 if (!issetmem)
7668 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7669 (count / size_needed) * size_needed);
7670 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7671 (count / size_needed) * size_needed);
7672 }
7673 else
7674 {
7675 if (!issetmem)
7676 src = change_address (src, BLKmode, srcreg);
7677 dst = change_address (dst, BLKmode, destreg);
7678 }
7679
7680 /* Step 4: Epilogue to copy the remaining bytes. */
7681 epilogue:
7682 if (label)
7683 {
7684 /* When the main loop is done, COUNT_EXP might hold original count,
7685 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7686 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7687 bytes. Compensate if needed. */
7688
7689 if (size_needed < epilogue_size_needed)
7690 {
7691 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7692 GEN_INT (size_needed - 1), count_exp, 1,
7693 OPTAB_DIRECT);
7694 if (tmp != count_exp)
7695 emit_move_insn (count_exp, tmp);
7696 }
7697 emit_label (label);
7698 LABEL_NUSES (label) = 1;
7699 }
7700
7701 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7702 {
7703 if (force_loopy_epilogue)
7704 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7705 epilogue_size_needed);
7706 else
7707 {
7708 if (issetmem)
7709 expand_setmem_epilogue (dst, destreg, promoted_val,
7710 vec_promoted_val, count_exp,
7711 epilogue_size_needed);
7712 else
7713 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7714 epilogue_size_needed);
7715 }
7716 }
7717 if (jump_around_label)
7718 emit_label (jump_around_label);
7719 return true;
7720 }
7721
7722 /* Expand cmpstrn or memcmp. */
7723
7724 bool
7725 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
7726 rtx length, rtx align, bool is_cmpstrn)
7727 {
7728 /* Expand strncmp and memcmp only with -minline-all-stringops since
7729 "repz cmpsb" can be much slower than strncmp and memcmp functions
7730 implemented with vector instructions, see
7731
7732 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7733 */
7734 if (!TARGET_INLINE_ALL_STRINGOPS)
7735 return false;
7736
7737 /* Can't use this if the user has appropriated ecx, esi or edi. */
7738 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
7739 return false;
7740
7741 if (is_cmpstrn)
7742 {
7743 /* For strncmp, length is the maximum length, which can be larger
7744 than actual string lengths. We can expand the cmpstrn pattern
7745 to "repz cmpsb" only if one of the strings is a constant so
7746 that expand_builtin_strncmp() can write the length argument to
7747 be the minimum of the const string length and the actual length
7748 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
7749 tree t1 = MEM_EXPR (src1);
7750 tree t2 = MEM_EXPR (src2);
7751 if (!((t1 && TREE_CODE (t1) == MEM_REF
7752 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
7753 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
7754 == STRING_CST))
7755 || (t2 && TREE_CODE (t2) == MEM_REF
7756 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
7757 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
7758 == STRING_CST))))
7759 return false;
7760 }
7761
7762 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
7763 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
7764 if (addr1 != XEXP (src1, 0))
7765 src1 = replace_equiv_address_nv (src1, addr1);
7766 if (addr2 != XEXP (src2, 0))
7767 src2 = replace_equiv_address_nv (src2, addr2);
7768
7769 /* NB: Make a copy of the data length to avoid changing the original
7770 data length by cmpstrnqi patterns. */
7771 length = ix86_zero_extend_to_Pmode (length);
7772 rtx lengthreg = gen_reg_rtx (Pmode);
7773 emit_move_insn (lengthreg, length);
7774
7775 /* If we are testing strict equality, we can use known alignment to
7776 good advantage. This may be possible with combine, particularly
7777 once cc0 is dead. */
7778 if (CONST_INT_P (length))
7779 {
7780 if (length == const0_rtx)
7781 {
7782 emit_move_insn (result, const0_rtx);
7783 return true;
7784 }
7785 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
7786 src1, src2));
7787 }
7788 else
7789 {
7790 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
7791 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
7792 src1, src2));
7793 }
7794
7795 rtx out = gen_lowpart (QImode, result);
7796 emit_insn (gen_cmpintqi (out));
7797 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
7798
7799 return true;
7800 }
7801
7802 /* Expand the appropriate insns for doing strlen if not just doing
7803 repnz; scasb
7804
7805 out = result, initialized with the start address
7806 align_rtx = alignment of the address.
7807 scratch = scratch register, initialized with the startaddress when
7808 not aligned, otherwise undefined
7809
7810 This is just the body. It needs the initializations mentioned above and
7811 some address computing at the end. These things are done in i386.md. */
7812
7813 static void
7814 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7815 {
7816 int align;
7817 rtx tmp;
7818 rtx_code_label *align_2_label = NULL;
7819 rtx_code_label *align_3_label = NULL;
7820 rtx_code_label *align_4_label = gen_label_rtx ();
7821 rtx_code_label *end_0_label = gen_label_rtx ();
7822 rtx mem;
7823 rtx tmpreg = gen_reg_rtx (SImode);
7824 rtx scratch = gen_reg_rtx (SImode);
7825 rtx cmp;
7826
7827 align = 0;
7828 if (CONST_INT_P (align_rtx))
7829 align = INTVAL (align_rtx);
7830
7831 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7832
7833 /* Is there a known alignment and is it less than 4? */
7834 if (align < 4)
7835 {
7836 rtx scratch1 = gen_reg_rtx (Pmode);
7837 emit_move_insn (scratch1, out);
7838 /* Is there a known alignment and is it not 2? */
7839 if (align != 2)
7840 {
7841 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7842 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7843
7844 /* Leave just the 3 lower bits. */
7845 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7846 NULL_RTX, 0, OPTAB_WIDEN);
7847
7848 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7849 Pmode, 1, align_4_label);
7850 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7851 Pmode, 1, align_2_label);
7852 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7853 Pmode, 1, align_3_label);
7854 }
7855 else
7856 {
7857 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7858 check if is aligned to 4 - byte. */
7859
7860 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7861 NULL_RTX, 0, OPTAB_WIDEN);
7862
7863 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7864 Pmode, 1, align_4_label);
7865 }
7866
7867 mem = change_address (src, QImode, out);
7868
7869 /* Now compare the bytes. */
7870
7871 /* Compare the first n unaligned byte on a byte per byte basis. */
7872 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7873 QImode, 1, end_0_label);
7874
7875 /* Increment the address. */
7876 emit_insn (gen_add2_insn (out, const1_rtx));
7877
7878 /* Not needed with an alignment of 2 */
7879 if (align != 2)
7880 {
7881 emit_label (align_2_label);
7882
7883 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7884 end_0_label);
7885
7886 emit_insn (gen_add2_insn (out, const1_rtx));
7887
7888 emit_label (align_3_label);
7889 }
7890
7891 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7892 end_0_label);
7893
7894 emit_insn (gen_add2_insn (out, const1_rtx));
7895 }
7896
7897 /* Generate loop to check 4 bytes at a time. It is not a good idea to
7898 align this loop. It gives only huge programs, but does not help to
7899 speed up. */
7900 emit_label (align_4_label);
7901
7902 mem = change_address (src, SImode, out);
7903 emit_move_insn (scratch, mem);
7904 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7905
7906 /* This formula yields a nonzero result iff one of the bytes is zero.
7907 This saves three branches inside loop and many cycles. */
7908
7909 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7910 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7911 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7912 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7913 gen_int_mode (0x80808080, SImode)));
7914 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7915 align_4_label);
7916
7917 if (TARGET_CMOVE)
7918 {
7919 rtx reg = gen_reg_rtx (SImode);
7920 rtx reg2 = gen_reg_rtx (Pmode);
7921 emit_move_insn (reg, tmpreg);
7922 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7923
7924 /* If zero is not in the first two bytes, move two bytes forward. */
7925 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7926 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7927 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7928 emit_insn (gen_rtx_SET (tmpreg,
7929 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7930 reg,
7931 tmpreg)));
7932 /* Emit lea manually to avoid clobbering of flags. */
7933 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7934
7935 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7936 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7937 emit_insn (gen_rtx_SET (out,
7938 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7939 reg2,
7940 out)));
7941 }
7942 else
7943 {
7944 rtx_code_label *end_2_label = gen_label_rtx ();
7945 /* Is zero in the first two bytes? */
7946
7947 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7948 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7949 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7950 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7951 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7952 pc_rtx);
7953 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7954 JUMP_LABEL (tmp) = end_2_label;
7955
7956 /* Not in the first two. Move two bytes forward. */
7957 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7958 emit_insn (gen_add2_insn (out, const2_rtx));
7959
7960 emit_label (end_2_label);
7961
7962 }
7963
7964 /* Avoid branch in fixing the byte. */
7965 tmpreg = gen_lowpart (QImode, tmpreg);
7966 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7967 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7968 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7969 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7970
7971 emit_label (end_0_label);
7972 }
7973
7974 /* Expand strlen. */
7975
7976 bool
7977 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7978 {
7979 if (TARGET_UNROLL_STRLEN
7980 && TARGET_INLINE_ALL_STRINGOPS
7981 && eoschar == const0_rtx
7982 && optimize > 1)
7983 {
7984 /* The generic case of strlen expander is long. Avoid it's
7985 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
7986 rtx addr = force_reg (Pmode, XEXP (src, 0));
7987 /* Well it seems that some optimizer does not combine a call like
7988 foo(strlen(bar), strlen(bar));
7989 when the move and the subtraction is done here. It does calculate
7990 the length just once when these instructions are done inside of
7991 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7992 often used and I use one fewer register for the lifetime of
7993 output_strlen_unroll() this is better. */
7994
7995 emit_move_insn (out, addr);
7996
7997 ix86_expand_strlensi_unroll_1 (out, src, align);
7998
7999 /* strlensi_unroll_1 returns the address of the zero at the end of
8000 the string, like memchr(), so compute the length by subtracting
8001 the start address. */
8002 emit_insn (gen_sub2_insn (out, addr));
8003 return true;
8004 }
8005 else
8006 return false;
8007 }
8008
8009 /* For given symbol (function) construct code to compute address of it's PLT
8010 entry in large x86-64 PIC model. */
8011
8012 static rtx
8013 construct_plt_address (rtx symbol)
8014 {
8015 rtx tmp, unspec;
8016
8017 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
8018 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
8019 gcc_assert (Pmode == DImode);
8020
8021 tmp = gen_reg_rtx (Pmode);
8022 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
8023
8024 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
8025 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
8026 return tmp;
8027 }
8028
8029 /* Additional registers that are clobbered by SYSV calls. */
8030
8031 static int const x86_64_ms_sysv_extra_clobbered_registers
8032 [NUM_X86_64_MS_CLOBBERED_REGS] =
8033 {
8034 SI_REG, DI_REG,
8035 XMM6_REG, XMM7_REG,
8036 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8037 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8038 };
8039
8040 rtx_insn *
8041 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8042 rtx callarg2,
8043 rtx pop, bool sibcall)
8044 {
8045 rtx vec[3];
8046 rtx use = NULL, call;
8047 unsigned int vec_len = 0;
8048 tree fndecl;
8049
8050 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8051 {
8052 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8053 if (fndecl
8054 && (lookup_attribute ("interrupt",
8055 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
8056 error ("interrupt service routine cannot be called directly");
8057 }
8058 else
8059 fndecl = NULL_TREE;
8060
8061 if (pop == const0_rtx)
8062 pop = NULL;
8063 gcc_assert (!TARGET_64BIT || !pop);
8064
8065 if (TARGET_MACHO && !TARGET_64BIT)
8066 {
8067 #if TARGET_MACHO
8068 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8069 fnaddr = machopic_indirect_call_target (fnaddr);
8070 #endif
8071 }
8072 else
8073 {
8074 /* Static functions and indirect calls don't need the pic register. Also,
8075 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8076 it an indirect call. */
8077 rtx addr = XEXP (fnaddr, 0);
8078 if (flag_pic
8079 && GET_CODE (addr) == SYMBOL_REF
8080 && !SYMBOL_REF_LOCAL_P (addr))
8081 {
8082 if (flag_plt
8083 && (SYMBOL_REF_DECL (addr) == NULL_TREE
8084 || !lookup_attribute ("noplt",
8085 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8086 {
8087 if (!TARGET_64BIT
8088 || (ix86_cmodel == CM_LARGE_PIC
8089 && DEFAULT_ABI != MS_ABI))
8090 {
8091 use_reg (&use, gen_rtx_REG (Pmode,
8092 REAL_PIC_OFFSET_TABLE_REGNUM));
8093 if (ix86_use_pseudo_pic_reg ())
8094 emit_move_insn (gen_rtx_REG (Pmode,
8095 REAL_PIC_OFFSET_TABLE_REGNUM),
8096 pic_offset_table_rtx);
8097 }
8098 }
8099 else if (!TARGET_PECOFF && !TARGET_MACHO)
8100 {
8101 if (TARGET_64BIT
8102 && ix86_cmodel == CM_LARGE_PIC
8103 && DEFAULT_ABI != MS_ABI)
8104 {
8105 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8106 UNSPEC_GOT);
8107 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8108 fnaddr = force_reg (Pmode, fnaddr);
8109 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
8110 }
8111 else if (TARGET_64BIT)
8112 {
8113 fnaddr = gen_rtx_UNSPEC (Pmode,
8114 gen_rtvec (1, addr),
8115 UNSPEC_GOTPCREL);
8116 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8117 }
8118 else
8119 {
8120 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8121 UNSPEC_GOT);
8122 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8123 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8124 fnaddr);
8125 }
8126 fnaddr = gen_const_mem (Pmode, fnaddr);
8127 /* Pmode may not be the same as word_mode for x32, which
8128 doesn't support indirect branch via 32-bit memory slot.
8129 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8130 indirect branch via x32 GOT slot is OK. */
8131 if (GET_MODE (fnaddr) != word_mode)
8132 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8133 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8134 }
8135 }
8136 }
8137
8138 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8139 parameters passed in vector registers. */
8140 if (TARGET_64BIT
8141 && (INTVAL (callarg2) > 0
8142 || (INTVAL (callarg2) == 0
8143 && (TARGET_SSE || !flag_skip_rax_setup))))
8144 {
8145 rtx al = gen_rtx_REG (QImode, AX_REG);
8146 emit_move_insn (al, callarg2);
8147 use_reg (&use, al);
8148 }
8149
8150 if (ix86_cmodel == CM_LARGE_PIC
8151 && !TARGET_PECOFF
8152 && MEM_P (fnaddr)
8153 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8154 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8155 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8156 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8157 branch via x32 GOT slot is OK. */
8158 else if (!(TARGET_X32
8159 && MEM_P (fnaddr)
8160 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8161 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8162 && (sibcall
8163 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8164 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8165 {
8166 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8167 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8168 }
8169
8170 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8171
8172 if (retval)
8173 call = gen_rtx_SET (retval, call);
8174 vec[vec_len++] = call;
8175
8176 if (pop)
8177 {
8178 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8179 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8180 vec[vec_len++] = pop;
8181 }
8182
8183 if (cfun->machine->no_caller_saved_registers
8184 && (!fndecl
8185 || (!TREE_THIS_VOLATILE (fndecl)
8186 && !lookup_attribute ("no_caller_saved_registers",
8187 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8188 {
8189 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8190 bool is_64bit_ms_abi = (TARGET_64BIT
8191 && ix86_function_abi (fndecl) == MS_ABI);
8192 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8193
8194 /* If there are no caller-saved registers, add all registers
8195 that are clobbered by the call which returns. */
8196 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8197 if (!fixed_regs[i]
8198 && (ix86_call_used_regs[i] == 1
8199 || (ix86_call_used_regs[i] & c_mask))
8200 && !STACK_REGNO_P (i)
8201 && !MMX_REGNO_P (i))
8202 clobber_reg (&use,
8203 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8204 }
8205 else if (TARGET_64BIT_MS_ABI
8206 && (!callarg2 || INTVAL (callarg2) != -2))
8207 {
8208 unsigned i;
8209
8210 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8211 {
8212 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8213 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8214
8215 clobber_reg (&use, gen_rtx_REG (mode, regno));
8216 }
8217
8218 /* Set here, but it may get cleared later. */
8219 if (TARGET_CALL_MS2SYSV_XLOGUES)
8220 {
8221 if (!TARGET_SSE)
8222 ;
8223
8224 /* Don't break hot-patched functions. */
8225 else if (ix86_function_ms_hook_prologue (current_function_decl))
8226 ;
8227
8228 /* TODO: Cases not yet examined. */
8229 else if (flag_split_stack)
8230 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8231
8232 else
8233 {
8234 gcc_assert (!reload_completed);
8235 cfun->machine->call_ms2sysv = true;
8236 }
8237 }
8238 }
8239
8240 if (vec_len > 1)
8241 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8242 rtx_insn *call_insn = emit_call_insn (call);
8243 if (use)
8244 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8245
8246 return call_insn;
8247 }
8248
8249 /* Split simple return with popping POPC bytes from stack to indirect
8250 branch with stack adjustment . */
8251
8252 void
8253 ix86_split_simple_return_pop_internal (rtx popc)
8254 {
8255 struct machine_function *m = cfun->machine;
8256 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8257 rtx_insn *insn;
8258
8259 /* There is no "pascal" calling convention in any 64bit ABI. */
8260 gcc_assert (!TARGET_64BIT);
8261
8262 insn = emit_insn (gen_pop (ecx));
8263 m->fs.cfa_offset -= UNITS_PER_WORD;
8264 m->fs.sp_offset -= UNITS_PER_WORD;
8265
8266 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8267 x = gen_rtx_SET (stack_pointer_rtx, x);
8268 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8269 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8270 RTX_FRAME_RELATED_P (insn) = 1;
8271
8272 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8273 x = gen_rtx_SET (stack_pointer_rtx, x);
8274 insn = emit_insn (x);
8275 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8276 RTX_FRAME_RELATED_P (insn) = 1;
8277
8278 /* Now return address is in ECX. */
8279 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8280 }
8281
8282 /* Errors in the source file can cause expand_expr to return const0_rtx
8283 where we expect a vector. To avoid crashing, use one of the vector
8284 clear instructions. */
8285
8286 static rtx
8287 safe_vector_operand (rtx x, machine_mode mode)
8288 {
8289 if (x == const0_rtx)
8290 x = CONST0_RTX (mode);
8291 return x;
8292 }
8293
8294 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8295
8296 static rtx
8297 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8298 {
8299 rtx pat;
8300 tree arg0 = CALL_EXPR_ARG (exp, 0);
8301 tree arg1 = CALL_EXPR_ARG (exp, 1);
8302 rtx op0 = expand_normal (arg0);
8303 rtx op1 = expand_normal (arg1);
8304 machine_mode tmode = insn_data[icode].operand[0].mode;
8305 machine_mode mode0 = insn_data[icode].operand[1].mode;
8306 machine_mode mode1 = insn_data[icode].operand[2].mode;
8307
8308 if (VECTOR_MODE_P (mode0))
8309 op0 = safe_vector_operand (op0, mode0);
8310 if (VECTOR_MODE_P (mode1))
8311 op1 = safe_vector_operand (op1, mode1);
8312
8313 if (optimize || !target
8314 || GET_MODE (target) != tmode
8315 || !insn_data[icode].operand[0].predicate (target, tmode))
8316 target = gen_reg_rtx (tmode);
8317
8318 if (GET_MODE (op1) == SImode && mode1 == TImode)
8319 {
8320 rtx x = gen_reg_rtx (V4SImode);
8321 emit_insn (gen_sse2_loadd (x, op1));
8322 op1 = gen_lowpart (TImode, x);
8323 }
8324
8325 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8326 op0 = copy_to_mode_reg (mode0, op0);
8327 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8328 op1 = copy_to_mode_reg (mode1, op1);
8329
8330 pat = GEN_FCN (icode) (target, op0, op1);
8331 if (! pat)
8332 return 0;
8333
8334 emit_insn (pat);
8335
8336 return target;
8337 }
8338
8339 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8340
8341 static rtx
8342 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8343 enum ix86_builtin_func_type m_type,
8344 enum rtx_code sub_code)
8345 {
8346 rtx pat;
8347 unsigned int i, nargs;
8348 bool comparison_p = false;
8349 bool tf_p = false;
8350 bool last_arg_constant = false;
8351 int num_memory = 0;
8352 rtx xops[4];
8353
8354 machine_mode tmode = insn_data[icode].operand[0].mode;
8355
8356 switch (m_type)
8357 {
8358 case MULTI_ARG_4_DF2_DI_I:
8359 case MULTI_ARG_4_DF2_DI_I1:
8360 case MULTI_ARG_4_SF2_SI_I:
8361 case MULTI_ARG_4_SF2_SI_I1:
8362 nargs = 4;
8363 last_arg_constant = true;
8364 break;
8365
8366 case MULTI_ARG_3_SF:
8367 case MULTI_ARG_3_DF:
8368 case MULTI_ARG_3_SF2:
8369 case MULTI_ARG_3_DF2:
8370 case MULTI_ARG_3_DI:
8371 case MULTI_ARG_3_SI:
8372 case MULTI_ARG_3_SI_DI:
8373 case MULTI_ARG_3_HI:
8374 case MULTI_ARG_3_HI_SI:
8375 case MULTI_ARG_3_QI:
8376 case MULTI_ARG_3_DI2:
8377 case MULTI_ARG_3_SI2:
8378 case MULTI_ARG_3_HI2:
8379 case MULTI_ARG_3_QI2:
8380 nargs = 3;
8381 break;
8382
8383 case MULTI_ARG_2_SF:
8384 case MULTI_ARG_2_DF:
8385 case MULTI_ARG_2_DI:
8386 case MULTI_ARG_2_SI:
8387 case MULTI_ARG_2_HI:
8388 case MULTI_ARG_2_QI:
8389 nargs = 2;
8390 break;
8391
8392 case MULTI_ARG_2_DI_IMM:
8393 case MULTI_ARG_2_SI_IMM:
8394 case MULTI_ARG_2_HI_IMM:
8395 case MULTI_ARG_2_QI_IMM:
8396 nargs = 2;
8397 last_arg_constant = true;
8398 break;
8399
8400 case MULTI_ARG_1_SF:
8401 case MULTI_ARG_1_DF:
8402 case MULTI_ARG_1_SF2:
8403 case MULTI_ARG_1_DF2:
8404 case MULTI_ARG_1_DI:
8405 case MULTI_ARG_1_SI:
8406 case MULTI_ARG_1_HI:
8407 case MULTI_ARG_1_QI:
8408 case MULTI_ARG_1_SI_DI:
8409 case MULTI_ARG_1_HI_DI:
8410 case MULTI_ARG_1_HI_SI:
8411 case MULTI_ARG_1_QI_DI:
8412 case MULTI_ARG_1_QI_SI:
8413 case MULTI_ARG_1_QI_HI:
8414 nargs = 1;
8415 break;
8416
8417 case MULTI_ARG_2_DI_CMP:
8418 case MULTI_ARG_2_SI_CMP:
8419 case MULTI_ARG_2_HI_CMP:
8420 case MULTI_ARG_2_QI_CMP:
8421 nargs = 2;
8422 comparison_p = true;
8423 break;
8424
8425 case MULTI_ARG_2_SF_TF:
8426 case MULTI_ARG_2_DF_TF:
8427 case MULTI_ARG_2_DI_TF:
8428 case MULTI_ARG_2_SI_TF:
8429 case MULTI_ARG_2_HI_TF:
8430 case MULTI_ARG_2_QI_TF:
8431 nargs = 2;
8432 tf_p = true;
8433 break;
8434
8435 default:
8436 gcc_unreachable ();
8437 }
8438
8439 if (optimize || !target
8440 || GET_MODE (target) != tmode
8441 || !insn_data[icode].operand[0].predicate (target, tmode))
8442 target = gen_reg_rtx (tmode);
8443 else if (memory_operand (target, tmode))
8444 num_memory++;
8445
8446 gcc_assert (nargs <= ARRAY_SIZE (xops));
8447
8448 for (i = 0; i < nargs; i++)
8449 {
8450 tree arg = CALL_EXPR_ARG (exp, i);
8451 rtx op = expand_normal (arg);
8452 int adjust = (comparison_p) ? 1 : 0;
8453 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8454
8455 if (last_arg_constant && i == nargs - 1)
8456 {
8457 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8458 {
8459 enum insn_code new_icode = icode;
8460 switch (icode)
8461 {
8462 case CODE_FOR_xop_vpermil2v2df3:
8463 case CODE_FOR_xop_vpermil2v4sf3:
8464 case CODE_FOR_xop_vpermil2v4df3:
8465 case CODE_FOR_xop_vpermil2v8sf3:
8466 error ("the last argument must be a 2-bit immediate");
8467 return gen_reg_rtx (tmode);
8468 case CODE_FOR_xop_rotlv2di3:
8469 new_icode = CODE_FOR_rotlv2di3;
8470 goto xop_rotl;
8471 case CODE_FOR_xop_rotlv4si3:
8472 new_icode = CODE_FOR_rotlv4si3;
8473 goto xop_rotl;
8474 case CODE_FOR_xop_rotlv8hi3:
8475 new_icode = CODE_FOR_rotlv8hi3;
8476 goto xop_rotl;
8477 case CODE_FOR_xop_rotlv16qi3:
8478 new_icode = CODE_FOR_rotlv16qi3;
8479 xop_rotl:
8480 if (CONST_INT_P (op))
8481 {
8482 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8483 op = GEN_INT (INTVAL (op) & mask);
8484 gcc_checking_assert
8485 (insn_data[icode].operand[i + 1].predicate (op, mode));
8486 }
8487 else
8488 {
8489 gcc_checking_assert
8490 (nargs == 2
8491 && insn_data[new_icode].operand[0].mode == tmode
8492 && insn_data[new_icode].operand[1].mode == tmode
8493 && insn_data[new_icode].operand[2].mode == mode
8494 && insn_data[new_icode].operand[0].predicate
8495 == insn_data[icode].operand[0].predicate
8496 && insn_data[new_icode].operand[1].predicate
8497 == insn_data[icode].operand[1].predicate);
8498 icode = new_icode;
8499 goto non_constant;
8500 }
8501 break;
8502 default:
8503 gcc_unreachable ();
8504 }
8505 }
8506 }
8507 else
8508 {
8509 non_constant:
8510 if (VECTOR_MODE_P (mode))
8511 op = safe_vector_operand (op, mode);
8512
8513 /* If we aren't optimizing, only allow one memory operand to be
8514 generated. */
8515 if (memory_operand (op, mode))
8516 num_memory++;
8517
8518 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8519
8520 if (optimize
8521 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8522 || num_memory > 1)
8523 op = force_reg (mode, op);
8524 }
8525
8526 xops[i] = op;
8527 }
8528
8529 switch (nargs)
8530 {
8531 case 1:
8532 pat = GEN_FCN (icode) (target, xops[0]);
8533 break;
8534
8535 case 2:
8536 if (tf_p)
8537 pat = GEN_FCN (icode) (target, xops[0], xops[1],
8538 GEN_INT ((int)sub_code));
8539 else if (! comparison_p)
8540 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
8541 else
8542 {
8543 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8544 xops[0], xops[1]);
8545
8546 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
8547 }
8548 break;
8549
8550 case 3:
8551 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
8552 break;
8553
8554 case 4:
8555 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
8556 break;
8557
8558 default:
8559 gcc_unreachable ();
8560 }
8561
8562 if (! pat)
8563 return 0;
8564
8565 emit_insn (pat);
8566 return target;
8567 }
8568
8569 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8570 insns with vec_merge. */
8571
8572 static rtx
8573 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8574 rtx target)
8575 {
8576 rtx pat;
8577 tree arg0 = CALL_EXPR_ARG (exp, 0);
8578 rtx op1, op0 = expand_normal (arg0);
8579 machine_mode tmode = insn_data[icode].operand[0].mode;
8580 machine_mode mode0 = insn_data[icode].operand[1].mode;
8581
8582 if (optimize || !target
8583 || GET_MODE (target) != tmode
8584 || !insn_data[icode].operand[0].predicate (target, tmode))
8585 target = gen_reg_rtx (tmode);
8586
8587 if (VECTOR_MODE_P (mode0))
8588 op0 = safe_vector_operand (op0, mode0);
8589
8590 if ((optimize && !register_operand (op0, mode0))
8591 || !insn_data[icode].operand[1].predicate (op0, mode0))
8592 op0 = copy_to_mode_reg (mode0, op0);
8593
8594 op1 = op0;
8595 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8596 op1 = copy_to_mode_reg (mode0, op1);
8597
8598 pat = GEN_FCN (icode) (target, op0, op1);
8599 if (! pat)
8600 return 0;
8601 emit_insn (pat);
8602 return target;
8603 }
8604
8605 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8606
8607 static rtx
8608 ix86_expand_sse_compare (const struct builtin_description *d,
8609 tree exp, rtx target, bool swap)
8610 {
8611 rtx pat;
8612 tree arg0 = CALL_EXPR_ARG (exp, 0);
8613 tree arg1 = CALL_EXPR_ARG (exp, 1);
8614 rtx op0 = expand_normal (arg0);
8615 rtx op1 = expand_normal (arg1);
8616 rtx op2;
8617 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8618 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8619 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8620 enum rtx_code comparison = d->comparison;
8621
8622 if (VECTOR_MODE_P (mode0))
8623 op0 = safe_vector_operand (op0, mode0);
8624 if (VECTOR_MODE_P (mode1))
8625 op1 = safe_vector_operand (op1, mode1);
8626
8627 /* Swap operands if we have a comparison that isn't available in
8628 hardware. */
8629 if (swap)
8630 std::swap (op0, op1);
8631
8632 if (optimize || !target
8633 || GET_MODE (target) != tmode
8634 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8635 target = gen_reg_rtx (tmode);
8636
8637 if ((optimize && !register_operand (op0, mode0))
8638 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8639 op0 = copy_to_mode_reg (mode0, op0);
8640 if ((optimize && !register_operand (op1, mode1))
8641 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8642 op1 = copy_to_mode_reg (mode1, op1);
8643
8644 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8645 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8646 if (! pat)
8647 return 0;
8648 emit_insn (pat);
8649 return target;
8650 }
8651
8652 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8653
8654 static rtx
8655 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8656 rtx target)
8657 {
8658 rtx pat;
8659 tree arg0 = CALL_EXPR_ARG (exp, 0);
8660 tree arg1 = CALL_EXPR_ARG (exp, 1);
8661 rtx op0 = expand_normal (arg0);
8662 rtx op1 = expand_normal (arg1);
8663 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8664 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8665 enum rtx_code comparison = d->comparison;
8666
8667 if (VECTOR_MODE_P (mode0))
8668 op0 = safe_vector_operand (op0, mode0);
8669 if (VECTOR_MODE_P (mode1))
8670 op1 = safe_vector_operand (op1, mode1);
8671
8672 target = gen_reg_rtx (SImode);
8673 emit_move_insn (target, const0_rtx);
8674 target = gen_rtx_SUBREG (QImode, target, 0);
8675
8676 if ((optimize && !register_operand (op0, mode0))
8677 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8678 op0 = copy_to_mode_reg (mode0, op0);
8679 if ((optimize && !register_operand (op1, mode1))
8680 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8681 op1 = copy_to_mode_reg (mode1, op1);
8682
8683 pat = GEN_FCN (d->icode) (op0, op1);
8684 if (! pat)
8685 return 0;
8686 emit_insn (pat);
8687 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8688 gen_rtx_fmt_ee (comparison, QImode,
8689 SET_DEST (pat),
8690 const0_rtx)));
8691
8692 return SUBREG_REG (target);
8693 }
8694
8695 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8696
8697 static rtx
8698 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8699 rtx target)
8700 {
8701 rtx pat;
8702 tree arg0 = CALL_EXPR_ARG (exp, 0);
8703 rtx op1, op0 = expand_normal (arg0);
8704 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8705 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8706
8707 if (optimize || target == 0
8708 || GET_MODE (target) != tmode
8709 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8710 target = gen_reg_rtx (tmode);
8711
8712 if (VECTOR_MODE_P (mode0))
8713 op0 = safe_vector_operand (op0, mode0);
8714
8715 if ((optimize && !register_operand (op0, mode0))
8716 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8717 op0 = copy_to_mode_reg (mode0, op0);
8718
8719 op1 = GEN_INT (d->comparison);
8720
8721 pat = GEN_FCN (d->icode) (target, op0, op1);
8722 if (! pat)
8723 return 0;
8724 emit_insn (pat);
8725 return target;
8726 }
8727
8728 static rtx
8729 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8730 tree exp, rtx target)
8731 {
8732 rtx pat;
8733 tree arg0 = CALL_EXPR_ARG (exp, 0);
8734 tree arg1 = CALL_EXPR_ARG (exp, 1);
8735 rtx op0 = expand_normal (arg0);
8736 rtx op1 = expand_normal (arg1);
8737 rtx op2;
8738 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8739 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8740 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8741
8742 if (optimize || target == 0
8743 || GET_MODE (target) != tmode
8744 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8745 target = gen_reg_rtx (tmode);
8746
8747 op0 = safe_vector_operand (op0, mode0);
8748 op1 = safe_vector_operand (op1, mode1);
8749
8750 if ((optimize && !register_operand (op0, mode0))
8751 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8752 op0 = copy_to_mode_reg (mode0, op0);
8753 if ((optimize && !register_operand (op1, mode1))
8754 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8755 op1 = copy_to_mode_reg (mode1, op1);
8756
8757 op2 = GEN_INT (d->comparison);
8758
8759 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8760 if (! pat)
8761 return 0;
8762 emit_insn (pat);
8763 return target;
8764 }
8765
8766 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8767
8768 static rtx
8769 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8770 rtx target)
8771 {
8772 rtx pat;
8773 tree arg0 = CALL_EXPR_ARG (exp, 0);
8774 tree arg1 = CALL_EXPR_ARG (exp, 1);
8775 rtx op0 = expand_normal (arg0);
8776 rtx op1 = expand_normal (arg1);
8777 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8778 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8779 enum rtx_code comparison = d->comparison;
8780
8781 if (VECTOR_MODE_P (mode0))
8782 op0 = safe_vector_operand (op0, mode0);
8783 if (VECTOR_MODE_P (mode1))
8784 op1 = safe_vector_operand (op1, mode1);
8785
8786 target = gen_reg_rtx (SImode);
8787 emit_move_insn (target, const0_rtx);
8788 target = gen_rtx_SUBREG (QImode, target, 0);
8789
8790 if ((optimize && !register_operand (op0, mode0))
8791 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8792 op0 = copy_to_mode_reg (mode0, op0);
8793 if ((optimize && !register_operand (op1, mode1))
8794 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8795 op1 = copy_to_mode_reg (mode1, op1);
8796
8797 pat = GEN_FCN (d->icode) (op0, op1);
8798 if (! pat)
8799 return 0;
8800 emit_insn (pat);
8801 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8802 gen_rtx_fmt_ee (comparison, QImode,
8803 SET_DEST (pat),
8804 const0_rtx)));
8805
8806 return SUBREG_REG (target);
8807 }
8808
8809 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8810
8811 static rtx
8812 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8813 tree exp, rtx target)
8814 {
8815 rtx pat;
8816 tree arg0 = CALL_EXPR_ARG (exp, 0);
8817 tree arg1 = CALL_EXPR_ARG (exp, 1);
8818 tree arg2 = CALL_EXPR_ARG (exp, 2);
8819 tree arg3 = CALL_EXPR_ARG (exp, 3);
8820 tree arg4 = CALL_EXPR_ARG (exp, 4);
8821 rtx scratch0, scratch1;
8822 rtx op0 = expand_normal (arg0);
8823 rtx op1 = expand_normal (arg1);
8824 rtx op2 = expand_normal (arg2);
8825 rtx op3 = expand_normal (arg3);
8826 rtx op4 = expand_normal (arg4);
8827 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8828
8829 tmode0 = insn_data[d->icode].operand[0].mode;
8830 tmode1 = insn_data[d->icode].operand[1].mode;
8831 modev2 = insn_data[d->icode].operand[2].mode;
8832 modei3 = insn_data[d->icode].operand[3].mode;
8833 modev4 = insn_data[d->icode].operand[4].mode;
8834 modei5 = insn_data[d->icode].operand[5].mode;
8835 modeimm = insn_data[d->icode].operand[6].mode;
8836
8837 if (VECTOR_MODE_P (modev2))
8838 op0 = safe_vector_operand (op0, modev2);
8839 if (VECTOR_MODE_P (modev4))
8840 op2 = safe_vector_operand (op2, modev4);
8841
8842 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8843 op0 = copy_to_mode_reg (modev2, op0);
8844 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8845 op1 = copy_to_mode_reg (modei3, op1);
8846 if ((optimize && !register_operand (op2, modev4))
8847 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8848 op2 = copy_to_mode_reg (modev4, op2);
8849 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8850 op3 = copy_to_mode_reg (modei5, op3);
8851
8852 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8853 {
8854 error ("the fifth argument must be an 8-bit immediate");
8855 return const0_rtx;
8856 }
8857
8858 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8859 {
8860 if (optimize || !target
8861 || GET_MODE (target) != tmode0
8862 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8863 target = gen_reg_rtx (tmode0);
8864
8865 scratch1 = gen_reg_rtx (tmode1);
8866
8867 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8868 }
8869 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8870 {
8871 if (optimize || !target
8872 || GET_MODE (target) != tmode1
8873 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8874 target = gen_reg_rtx (tmode1);
8875
8876 scratch0 = gen_reg_rtx (tmode0);
8877
8878 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8879 }
8880 else
8881 {
8882 gcc_assert (d->flag);
8883
8884 scratch0 = gen_reg_rtx (tmode0);
8885 scratch1 = gen_reg_rtx (tmode1);
8886
8887 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8888 }
8889
8890 if (! pat)
8891 return 0;
8892
8893 emit_insn (pat);
8894
8895 if (d->flag)
8896 {
8897 target = gen_reg_rtx (SImode);
8898 emit_move_insn (target, const0_rtx);
8899 target = gen_rtx_SUBREG (QImode, target, 0);
8900
8901 emit_insn
8902 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8903 gen_rtx_fmt_ee (EQ, QImode,
8904 gen_rtx_REG ((machine_mode) d->flag,
8905 FLAGS_REG),
8906 const0_rtx)));
8907 return SUBREG_REG (target);
8908 }
8909 else
8910 return target;
8911 }
8912
8913
8914 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8915
8916 static rtx
8917 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8918 tree exp, rtx target)
8919 {
8920 rtx pat;
8921 tree arg0 = CALL_EXPR_ARG (exp, 0);
8922 tree arg1 = CALL_EXPR_ARG (exp, 1);
8923 tree arg2 = CALL_EXPR_ARG (exp, 2);
8924 rtx scratch0, scratch1;
8925 rtx op0 = expand_normal (arg0);
8926 rtx op1 = expand_normal (arg1);
8927 rtx op2 = expand_normal (arg2);
8928 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8929
8930 tmode0 = insn_data[d->icode].operand[0].mode;
8931 tmode1 = insn_data[d->icode].operand[1].mode;
8932 modev2 = insn_data[d->icode].operand[2].mode;
8933 modev3 = insn_data[d->icode].operand[3].mode;
8934 modeimm = insn_data[d->icode].operand[4].mode;
8935
8936 if (VECTOR_MODE_P (modev2))
8937 op0 = safe_vector_operand (op0, modev2);
8938 if (VECTOR_MODE_P (modev3))
8939 op1 = safe_vector_operand (op1, modev3);
8940
8941 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8942 op0 = copy_to_mode_reg (modev2, op0);
8943 if ((optimize && !register_operand (op1, modev3))
8944 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8945 op1 = copy_to_mode_reg (modev3, op1);
8946
8947 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8948 {
8949 error ("the third argument must be an 8-bit immediate");
8950 return const0_rtx;
8951 }
8952
8953 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8954 {
8955 if (optimize || !target
8956 || GET_MODE (target) != tmode0
8957 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8958 target = gen_reg_rtx (tmode0);
8959
8960 scratch1 = gen_reg_rtx (tmode1);
8961
8962 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8963 }
8964 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8965 {
8966 if (optimize || !target
8967 || GET_MODE (target) != tmode1
8968 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8969 target = gen_reg_rtx (tmode1);
8970
8971 scratch0 = gen_reg_rtx (tmode0);
8972
8973 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8974 }
8975 else
8976 {
8977 gcc_assert (d->flag);
8978
8979 scratch0 = gen_reg_rtx (tmode0);
8980 scratch1 = gen_reg_rtx (tmode1);
8981
8982 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8983 }
8984
8985 if (! pat)
8986 return 0;
8987
8988 emit_insn (pat);
8989
8990 if (d->flag)
8991 {
8992 target = gen_reg_rtx (SImode);
8993 emit_move_insn (target, const0_rtx);
8994 target = gen_rtx_SUBREG (QImode, target, 0);
8995
8996 emit_insn
8997 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8998 gen_rtx_fmt_ee (EQ, QImode,
8999 gen_rtx_REG ((machine_mode) d->flag,
9000 FLAGS_REG),
9001 const0_rtx)));
9002 return SUBREG_REG (target);
9003 }
9004 else
9005 return target;
9006 }
9007
9008 /* Fixup modeless constants to fit required mode. */
9009
9010 static rtx
9011 fixup_modeless_constant (rtx x, machine_mode mode)
9012 {
9013 if (GET_MODE (x) == VOIDmode)
9014 x = convert_to_mode (mode, x, 1);
9015 return x;
9016 }
9017
9018 /* Subroutine of ix86_expand_builtin to take care of insns with
9019 variable number of operands. */
9020
9021 static rtx
9022 ix86_expand_args_builtin (const struct builtin_description *d,
9023 tree exp, rtx target)
9024 {
9025 rtx pat, real_target;
9026 unsigned int i, nargs;
9027 unsigned int nargs_constant = 0;
9028 unsigned int mask_pos = 0;
9029 int num_memory = 0;
9030 rtx xops[6];
9031 bool second_arg_count = false;
9032 enum insn_code icode = d->icode;
9033 const struct insn_data_d *insn_p = &insn_data[icode];
9034 machine_mode tmode = insn_p->operand[0].mode;
9035 machine_mode rmode = VOIDmode;
9036 bool swap = false;
9037 enum rtx_code comparison = d->comparison;
9038
9039 switch ((enum ix86_builtin_func_type) d->flag)
9040 {
9041 case V2DF_FTYPE_V2DF_ROUND:
9042 case V4DF_FTYPE_V4DF_ROUND:
9043 case V8DF_FTYPE_V8DF_ROUND:
9044 case V4SF_FTYPE_V4SF_ROUND:
9045 case V8SF_FTYPE_V8SF_ROUND:
9046 case V16SF_FTYPE_V16SF_ROUND:
9047 case V4SI_FTYPE_V4SF_ROUND:
9048 case V8SI_FTYPE_V8SF_ROUND:
9049 case V16SI_FTYPE_V16SF_ROUND:
9050 return ix86_expand_sse_round (d, exp, target);
9051 case V4SI_FTYPE_V2DF_V2DF_ROUND:
9052 case V8SI_FTYPE_V4DF_V4DF_ROUND:
9053 case V16SI_FTYPE_V8DF_V8DF_ROUND:
9054 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9055 case INT_FTYPE_V8SF_V8SF_PTEST:
9056 case INT_FTYPE_V4DI_V4DI_PTEST:
9057 case INT_FTYPE_V4DF_V4DF_PTEST:
9058 case INT_FTYPE_V4SF_V4SF_PTEST:
9059 case INT_FTYPE_V2DI_V2DI_PTEST:
9060 case INT_FTYPE_V2DF_V2DF_PTEST:
9061 return ix86_expand_sse_ptest (d, exp, target);
9062 case FLOAT128_FTYPE_FLOAT128:
9063 case FLOAT_FTYPE_FLOAT:
9064 case INT_FTYPE_INT:
9065 case UINT_FTYPE_UINT:
9066 case UINT16_FTYPE_UINT16:
9067 case UINT64_FTYPE_INT:
9068 case UINT64_FTYPE_UINT64:
9069 case INT64_FTYPE_INT64:
9070 case INT64_FTYPE_V4SF:
9071 case INT64_FTYPE_V2DF:
9072 case INT_FTYPE_V16QI:
9073 case INT_FTYPE_V8QI:
9074 case INT_FTYPE_V8SF:
9075 case INT_FTYPE_V4DF:
9076 case INT_FTYPE_V4SF:
9077 case INT_FTYPE_V2DF:
9078 case INT_FTYPE_V32QI:
9079 case V16QI_FTYPE_V16QI:
9080 case V8SI_FTYPE_V8SF:
9081 case V8SI_FTYPE_V4SI:
9082 case V8HI_FTYPE_V8HI:
9083 case V8HI_FTYPE_V16QI:
9084 case V8QI_FTYPE_V8QI:
9085 case V8SF_FTYPE_V8SF:
9086 case V8SF_FTYPE_V8SI:
9087 case V8SF_FTYPE_V4SF:
9088 case V8SF_FTYPE_V8HI:
9089 case V4SI_FTYPE_V4SI:
9090 case V4SI_FTYPE_V16QI:
9091 case V4SI_FTYPE_V4SF:
9092 case V4SI_FTYPE_V8SI:
9093 case V4SI_FTYPE_V8HI:
9094 case V4SI_FTYPE_V4DF:
9095 case V4SI_FTYPE_V2DF:
9096 case V4HI_FTYPE_V4HI:
9097 case V4DF_FTYPE_V4DF:
9098 case V4DF_FTYPE_V4SI:
9099 case V4DF_FTYPE_V4SF:
9100 case V4DF_FTYPE_V2DF:
9101 case V4SF_FTYPE_V4SF:
9102 case V4SF_FTYPE_V4SI:
9103 case V4SF_FTYPE_V8SF:
9104 case V4SF_FTYPE_V4DF:
9105 case V4SF_FTYPE_V8HI:
9106 case V4SF_FTYPE_V2DF:
9107 case V2DI_FTYPE_V2DI:
9108 case V2DI_FTYPE_V16QI:
9109 case V2DI_FTYPE_V8HI:
9110 case V2DI_FTYPE_V4SI:
9111 case V2DF_FTYPE_V2DF:
9112 case V2DF_FTYPE_V4SI:
9113 case V2DF_FTYPE_V4DF:
9114 case V2DF_FTYPE_V4SF:
9115 case V2DF_FTYPE_V2SI:
9116 case V2SI_FTYPE_V2SI:
9117 case V2SI_FTYPE_V4SF:
9118 case V2SI_FTYPE_V2SF:
9119 case V2SI_FTYPE_V2DF:
9120 case V2SF_FTYPE_V2SF:
9121 case V2SF_FTYPE_V2SI:
9122 case V32QI_FTYPE_V32QI:
9123 case V32QI_FTYPE_V16QI:
9124 case V16HI_FTYPE_V16HI:
9125 case V16HI_FTYPE_V8HI:
9126 case V8SI_FTYPE_V8SI:
9127 case V16HI_FTYPE_V16QI:
9128 case V8SI_FTYPE_V16QI:
9129 case V4DI_FTYPE_V16QI:
9130 case V8SI_FTYPE_V8HI:
9131 case V4DI_FTYPE_V8HI:
9132 case V4DI_FTYPE_V4SI:
9133 case V4DI_FTYPE_V2DI:
9134 case UQI_FTYPE_UQI:
9135 case UHI_FTYPE_UHI:
9136 case USI_FTYPE_USI:
9137 case USI_FTYPE_UQI:
9138 case USI_FTYPE_UHI:
9139 case UDI_FTYPE_UDI:
9140 case UHI_FTYPE_V16QI:
9141 case USI_FTYPE_V32QI:
9142 case UDI_FTYPE_V64QI:
9143 case V16QI_FTYPE_UHI:
9144 case V32QI_FTYPE_USI:
9145 case V64QI_FTYPE_UDI:
9146 case V8HI_FTYPE_UQI:
9147 case V16HI_FTYPE_UHI:
9148 case V32HI_FTYPE_USI:
9149 case V4SI_FTYPE_UQI:
9150 case V8SI_FTYPE_UQI:
9151 case V4SI_FTYPE_UHI:
9152 case V8SI_FTYPE_UHI:
9153 case UQI_FTYPE_V8HI:
9154 case UHI_FTYPE_V16HI:
9155 case USI_FTYPE_V32HI:
9156 case UQI_FTYPE_V4SI:
9157 case UQI_FTYPE_V8SI:
9158 case UHI_FTYPE_V16SI:
9159 case UQI_FTYPE_V2DI:
9160 case UQI_FTYPE_V4DI:
9161 case UQI_FTYPE_V8DI:
9162 case V16SI_FTYPE_UHI:
9163 case V2DI_FTYPE_UQI:
9164 case V4DI_FTYPE_UQI:
9165 case V16SI_FTYPE_INT:
9166 case V16SF_FTYPE_V8SF:
9167 case V16SI_FTYPE_V8SI:
9168 case V16SF_FTYPE_V4SF:
9169 case V16SI_FTYPE_V4SI:
9170 case V16SI_FTYPE_V16SF:
9171 case V16SI_FTYPE_V16SI:
9172 case V64QI_FTYPE_V64QI:
9173 case V32HI_FTYPE_V32HI:
9174 case V16SF_FTYPE_V16SF:
9175 case V8DI_FTYPE_UQI:
9176 case V8DI_FTYPE_V8DI:
9177 case V8DF_FTYPE_V4DF:
9178 case V8DF_FTYPE_V2DF:
9179 case V8DF_FTYPE_V8DF:
9180 case V4DI_FTYPE_V4DI:
9181 case V16HI_FTYPE_V16SF:
9182 case V8HI_FTYPE_V8SF:
9183 case V8HI_FTYPE_V4SF:
9184 nargs = 1;
9185 break;
9186 case V4SF_FTYPE_V4SF_VEC_MERGE:
9187 case V2DF_FTYPE_V2DF_VEC_MERGE:
9188 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9189 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9190 case V16QI_FTYPE_V16QI_V16QI:
9191 case V16QI_FTYPE_V8HI_V8HI:
9192 case V16SF_FTYPE_V16SF_V16SF:
9193 case V8QI_FTYPE_V8QI_V8QI:
9194 case V8QI_FTYPE_V4HI_V4HI:
9195 case V8HI_FTYPE_V8HI_V8HI:
9196 case V8HI_FTYPE_V16QI_V16QI:
9197 case V8HI_FTYPE_V4SI_V4SI:
9198 case V8SF_FTYPE_V8SF_V8SF:
9199 case V8SF_FTYPE_V8SF_V8SI:
9200 case V8DF_FTYPE_V8DF_V8DF:
9201 case V4SI_FTYPE_V4SI_V4SI:
9202 case V4SI_FTYPE_V8HI_V8HI:
9203 case V4SI_FTYPE_V2DF_V2DF:
9204 case V4HI_FTYPE_V4HI_V4HI:
9205 case V4HI_FTYPE_V8QI_V8QI:
9206 case V4HI_FTYPE_V2SI_V2SI:
9207 case V4DF_FTYPE_V4DF_V4DF:
9208 case V4DF_FTYPE_V4DF_V4DI:
9209 case V4SF_FTYPE_V4SF_V4SF:
9210 case V4SF_FTYPE_V4SF_V4SI:
9211 case V4SF_FTYPE_V4SF_V2SI:
9212 case V4SF_FTYPE_V4SF_V2DF:
9213 case V4SF_FTYPE_V4SF_UINT:
9214 case V4SF_FTYPE_V4SF_DI:
9215 case V4SF_FTYPE_V4SF_SI:
9216 case V2DI_FTYPE_V2DI_V2DI:
9217 case V2DI_FTYPE_V16QI_V16QI:
9218 case V2DI_FTYPE_V4SI_V4SI:
9219 case V2DI_FTYPE_V2DI_V16QI:
9220 case V2SI_FTYPE_V2SI_V2SI:
9221 case V2SI_FTYPE_V4HI_V4HI:
9222 case V2SI_FTYPE_V2SF_V2SF:
9223 case V2DF_FTYPE_V2DF_V2DF:
9224 case V2DF_FTYPE_V2DF_V4SF:
9225 case V2DF_FTYPE_V2DF_V2DI:
9226 case V2DF_FTYPE_V2DF_DI:
9227 case V2DF_FTYPE_V2DF_SI:
9228 case V2DF_FTYPE_V2DF_UINT:
9229 case V2SF_FTYPE_V2SF_V2SF:
9230 case V1DI_FTYPE_V1DI_V1DI:
9231 case V1DI_FTYPE_V8QI_V8QI:
9232 case V1DI_FTYPE_V2SI_V2SI:
9233 case V32QI_FTYPE_V16HI_V16HI:
9234 case V16HI_FTYPE_V8SI_V8SI:
9235 case V64QI_FTYPE_V64QI_V64QI:
9236 case V32QI_FTYPE_V32QI_V32QI:
9237 case V16HI_FTYPE_V32QI_V32QI:
9238 case V16HI_FTYPE_V16HI_V16HI:
9239 case V8SI_FTYPE_V4DF_V4DF:
9240 case V8SI_FTYPE_V8SI_V8SI:
9241 case V8SI_FTYPE_V16HI_V16HI:
9242 case V4DI_FTYPE_V4DI_V4DI:
9243 case V4DI_FTYPE_V8SI_V8SI:
9244 case V8DI_FTYPE_V64QI_V64QI:
9245 if (comparison == UNKNOWN)
9246 return ix86_expand_binop_builtin (icode, exp, target);
9247 nargs = 2;
9248 break;
9249 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9250 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9251 gcc_assert (comparison != UNKNOWN);
9252 nargs = 2;
9253 swap = true;
9254 break;
9255 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9256 case V16HI_FTYPE_V16HI_SI_COUNT:
9257 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9258 case V8SI_FTYPE_V8SI_SI_COUNT:
9259 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9260 case V4DI_FTYPE_V4DI_INT_COUNT:
9261 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9262 case V8HI_FTYPE_V8HI_SI_COUNT:
9263 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9264 case V4SI_FTYPE_V4SI_SI_COUNT:
9265 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9266 case V4HI_FTYPE_V4HI_SI_COUNT:
9267 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9268 case V2DI_FTYPE_V2DI_SI_COUNT:
9269 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9270 case V2SI_FTYPE_V2SI_SI_COUNT:
9271 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9272 case V1DI_FTYPE_V1DI_SI_COUNT:
9273 nargs = 2;
9274 second_arg_count = true;
9275 break;
9276 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9277 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9278 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9279 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9280 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9281 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9282 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9283 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9284 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9285 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9286 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9287 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9288 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9289 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9290 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9291 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9292 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9293 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9294 nargs = 4;
9295 second_arg_count = true;
9296 break;
9297 case UINT64_FTYPE_UINT64_UINT64:
9298 case UINT_FTYPE_UINT_UINT:
9299 case UINT_FTYPE_UINT_USHORT:
9300 case UINT_FTYPE_UINT_UCHAR:
9301 case UINT16_FTYPE_UINT16_INT:
9302 case UINT8_FTYPE_UINT8_INT:
9303 case UQI_FTYPE_UQI_UQI:
9304 case UHI_FTYPE_UHI_UHI:
9305 case USI_FTYPE_USI_USI:
9306 case UDI_FTYPE_UDI_UDI:
9307 case V16SI_FTYPE_V8DF_V8DF:
9308 case V32HI_FTYPE_V16SF_V16SF:
9309 case V16HI_FTYPE_V8SF_V8SF:
9310 case V8HI_FTYPE_V4SF_V4SF:
9311 case V16HI_FTYPE_V16SF_UHI:
9312 case V8HI_FTYPE_V8SF_UQI:
9313 case V8HI_FTYPE_V4SF_UQI:
9314 nargs = 2;
9315 break;
9316 case V2DI_FTYPE_V2DI_INT_CONVERT:
9317 nargs = 2;
9318 rmode = V1TImode;
9319 nargs_constant = 1;
9320 break;
9321 case V4DI_FTYPE_V4DI_INT_CONVERT:
9322 nargs = 2;
9323 rmode = V2TImode;
9324 nargs_constant = 1;
9325 break;
9326 case V8DI_FTYPE_V8DI_INT_CONVERT:
9327 nargs = 2;
9328 rmode = V4TImode;
9329 nargs_constant = 1;
9330 break;
9331 case V8HI_FTYPE_V8HI_INT:
9332 case V8HI_FTYPE_V8SF_INT:
9333 case V16HI_FTYPE_V16SF_INT:
9334 case V8HI_FTYPE_V4SF_INT:
9335 case V8SF_FTYPE_V8SF_INT:
9336 case V4SF_FTYPE_V16SF_INT:
9337 case V16SF_FTYPE_V16SF_INT:
9338 case V4SI_FTYPE_V4SI_INT:
9339 case V4SI_FTYPE_V8SI_INT:
9340 case V4HI_FTYPE_V4HI_INT:
9341 case V4DF_FTYPE_V4DF_INT:
9342 case V4DF_FTYPE_V8DF_INT:
9343 case V4SF_FTYPE_V4SF_INT:
9344 case V4SF_FTYPE_V8SF_INT:
9345 case V2DI_FTYPE_V2DI_INT:
9346 case V2DF_FTYPE_V2DF_INT:
9347 case V2DF_FTYPE_V4DF_INT:
9348 case V16HI_FTYPE_V16HI_INT:
9349 case V8SI_FTYPE_V8SI_INT:
9350 case V16SI_FTYPE_V16SI_INT:
9351 case V4SI_FTYPE_V16SI_INT:
9352 case V4DI_FTYPE_V4DI_INT:
9353 case V2DI_FTYPE_V4DI_INT:
9354 case V4DI_FTYPE_V8DI_INT:
9355 case UQI_FTYPE_UQI_UQI_CONST:
9356 case UHI_FTYPE_UHI_UQI:
9357 case USI_FTYPE_USI_UQI:
9358 case UDI_FTYPE_UDI_UQI:
9359 nargs = 2;
9360 nargs_constant = 1;
9361 break;
9362 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9363 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9364 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9365 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9366 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9367 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9368 case UHI_FTYPE_V16SI_V16SI_UHI:
9369 case UQI_FTYPE_V8DI_V8DI_UQI:
9370 case V16HI_FTYPE_V16SI_V16HI_UHI:
9371 case V16QI_FTYPE_V16SI_V16QI_UHI:
9372 case V16QI_FTYPE_V8DI_V16QI_UQI:
9373 case V16SF_FTYPE_V16SF_V16SF_UHI:
9374 case V16SF_FTYPE_V4SF_V16SF_UHI:
9375 case V16SI_FTYPE_SI_V16SI_UHI:
9376 case V16SI_FTYPE_V16HI_V16SI_UHI:
9377 case V16SI_FTYPE_V16QI_V16SI_UHI:
9378 case V8SF_FTYPE_V4SF_V8SF_UQI:
9379 case V4DF_FTYPE_V2DF_V4DF_UQI:
9380 case V8SI_FTYPE_V4SI_V8SI_UQI:
9381 case V8SI_FTYPE_SI_V8SI_UQI:
9382 case V4SI_FTYPE_V4SI_V4SI_UQI:
9383 case V4SI_FTYPE_SI_V4SI_UQI:
9384 case V4DI_FTYPE_V2DI_V4DI_UQI:
9385 case V4DI_FTYPE_DI_V4DI_UQI:
9386 case V2DI_FTYPE_V2DI_V2DI_UQI:
9387 case V2DI_FTYPE_DI_V2DI_UQI:
9388 case V64QI_FTYPE_V64QI_V64QI_UDI:
9389 case V64QI_FTYPE_V16QI_V64QI_UDI:
9390 case V64QI_FTYPE_QI_V64QI_UDI:
9391 case V32QI_FTYPE_V32QI_V32QI_USI:
9392 case V32QI_FTYPE_V16QI_V32QI_USI:
9393 case V32QI_FTYPE_QI_V32QI_USI:
9394 case V16QI_FTYPE_V16QI_V16QI_UHI:
9395 case V16QI_FTYPE_QI_V16QI_UHI:
9396 case V32HI_FTYPE_V8HI_V32HI_USI:
9397 case V32HI_FTYPE_HI_V32HI_USI:
9398 case V16HI_FTYPE_V8HI_V16HI_UHI:
9399 case V16HI_FTYPE_HI_V16HI_UHI:
9400 case V8HI_FTYPE_V8HI_V8HI_UQI:
9401 case V8HI_FTYPE_HI_V8HI_UQI:
9402 case V8SF_FTYPE_V8HI_V8SF_UQI:
9403 case V4SF_FTYPE_V8HI_V4SF_UQI:
9404 case V8SI_FTYPE_V8SF_V8SI_UQI:
9405 case V4SI_FTYPE_V4SF_V4SI_UQI:
9406 case V4DI_FTYPE_V4SF_V4DI_UQI:
9407 case V2DI_FTYPE_V4SF_V2DI_UQI:
9408 case V4SF_FTYPE_V4DI_V4SF_UQI:
9409 case V4SF_FTYPE_V2DI_V4SF_UQI:
9410 case V4DF_FTYPE_V4DI_V4DF_UQI:
9411 case V2DF_FTYPE_V2DI_V2DF_UQI:
9412 case V16QI_FTYPE_V8HI_V16QI_UQI:
9413 case V16QI_FTYPE_V16HI_V16QI_UHI:
9414 case V16QI_FTYPE_V4SI_V16QI_UQI:
9415 case V16QI_FTYPE_V8SI_V16QI_UQI:
9416 case V8HI_FTYPE_V4SI_V8HI_UQI:
9417 case V8HI_FTYPE_V8SI_V8HI_UQI:
9418 case V16QI_FTYPE_V2DI_V16QI_UQI:
9419 case V16QI_FTYPE_V4DI_V16QI_UQI:
9420 case V8HI_FTYPE_V2DI_V8HI_UQI:
9421 case V8HI_FTYPE_V4DI_V8HI_UQI:
9422 case V4SI_FTYPE_V2DI_V4SI_UQI:
9423 case V4SI_FTYPE_V4DI_V4SI_UQI:
9424 case V32QI_FTYPE_V32HI_V32QI_USI:
9425 case UHI_FTYPE_V16QI_V16QI_UHI:
9426 case USI_FTYPE_V32QI_V32QI_USI:
9427 case UDI_FTYPE_V64QI_V64QI_UDI:
9428 case UQI_FTYPE_V8HI_V8HI_UQI:
9429 case UHI_FTYPE_V16HI_V16HI_UHI:
9430 case USI_FTYPE_V32HI_V32HI_USI:
9431 case UQI_FTYPE_V4SI_V4SI_UQI:
9432 case UQI_FTYPE_V8SI_V8SI_UQI:
9433 case UQI_FTYPE_V2DI_V2DI_UQI:
9434 case UQI_FTYPE_V4DI_V4DI_UQI:
9435 case V4SF_FTYPE_V2DF_V4SF_UQI:
9436 case V4SF_FTYPE_V4DF_V4SF_UQI:
9437 case V16SI_FTYPE_V16SI_V16SI_UHI:
9438 case V16SI_FTYPE_V4SI_V16SI_UHI:
9439 case V2DI_FTYPE_V4SI_V2DI_UQI:
9440 case V2DI_FTYPE_V8HI_V2DI_UQI:
9441 case V2DI_FTYPE_V16QI_V2DI_UQI:
9442 case V4DI_FTYPE_V4DI_V4DI_UQI:
9443 case V4DI_FTYPE_V4SI_V4DI_UQI:
9444 case V4DI_FTYPE_V8HI_V4DI_UQI:
9445 case V4DI_FTYPE_V16QI_V4DI_UQI:
9446 case V4DI_FTYPE_V4DF_V4DI_UQI:
9447 case V2DI_FTYPE_V2DF_V2DI_UQI:
9448 case V4SI_FTYPE_V4DF_V4SI_UQI:
9449 case V4SI_FTYPE_V2DF_V4SI_UQI:
9450 case V4SI_FTYPE_V8HI_V4SI_UQI:
9451 case V4SI_FTYPE_V16QI_V4SI_UQI:
9452 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9453 case V8DF_FTYPE_V2DF_V8DF_UQI:
9454 case V8DF_FTYPE_V4DF_V8DF_UQI:
9455 case V8DF_FTYPE_V8DF_V8DF_UQI:
9456 case V8SF_FTYPE_V8SF_V8SF_UQI:
9457 case V8SF_FTYPE_V8SI_V8SF_UQI:
9458 case V4DF_FTYPE_V4DF_V4DF_UQI:
9459 case V4SF_FTYPE_V4SF_V4SF_UQI:
9460 case V2DF_FTYPE_V2DF_V2DF_UQI:
9461 case V2DF_FTYPE_V4SF_V2DF_UQI:
9462 case V2DF_FTYPE_V4SI_V2DF_UQI:
9463 case V4SF_FTYPE_V4SI_V4SF_UQI:
9464 case V4DF_FTYPE_V4SF_V4DF_UQI:
9465 case V4DF_FTYPE_V4SI_V4DF_UQI:
9466 case V8SI_FTYPE_V8SI_V8SI_UQI:
9467 case V8SI_FTYPE_V8HI_V8SI_UQI:
9468 case V8SI_FTYPE_V16QI_V8SI_UQI:
9469 case V8DF_FTYPE_V8SI_V8DF_UQI:
9470 case V8DI_FTYPE_DI_V8DI_UQI:
9471 case V16SF_FTYPE_V8SF_V16SF_UHI:
9472 case V16SI_FTYPE_V8SI_V16SI_UHI:
9473 case V16HI_FTYPE_V16HI_V16HI_UHI:
9474 case V8HI_FTYPE_V16QI_V8HI_UQI:
9475 case V16HI_FTYPE_V16QI_V16HI_UHI:
9476 case V32HI_FTYPE_V32HI_V32HI_USI:
9477 case V32HI_FTYPE_V32QI_V32HI_USI:
9478 case V8DI_FTYPE_V16QI_V8DI_UQI:
9479 case V8DI_FTYPE_V2DI_V8DI_UQI:
9480 case V8DI_FTYPE_V4DI_V8DI_UQI:
9481 case V8DI_FTYPE_V8DI_V8DI_UQI:
9482 case V8DI_FTYPE_V8HI_V8DI_UQI:
9483 case V8DI_FTYPE_V8SI_V8DI_UQI:
9484 case V8HI_FTYPE_V8DI_V8HI_UQI:
9485 case V8SI_FTYPE_V8DI_V8SI_UQI:
9486 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9487 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9488 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9489 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9490 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9491 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9492 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9493 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9494 case V32HI_FTYPE_V16SF_V16SF_USI:
9495 case V16HI_FTYPE_V8SF_V8SF_UHI:
9496 case V8HI_FTYPE_V4SF_V4SF_UQI:
9497 case V16HI_FTYPE_V16SF_V16HI_UHI:
9498 case V8HI_FTYPE_V8SF_V8HI_UQI:
9499 case V8HI_FTYPE_V4SF_V8HI_UQI:
9500 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9501 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9502 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9503 nargs = 3;
9504 break;
9505 case V32QI_FTYPE_V32QI_V32QI_INT:
9506 case V16HI_FTYPE_V16HI_V16HI_INT:
9507 case V16QI_FTYPE_V16QI_V16QI_INT:
9508 case V4DI_FTYPE_V4DI_V4DI_INT:
9509 case V8HI_FTYPE_V8HI_V8HI_INT:
9510 case V8SI_FTYPE_V8SI_V8SI_INT:
9511 case V8SI_FTYPE_V8SI_V4SI_INT:
9512 case V8SF_FTYPE_V8SF_V8SF_INT:
9513 case V8SF_FTYPE_V8SF_V4SF_INT:
9514 case V4SI_FTYPE_V4SI_V4SI_INT:
9515 case V4DF_FTYPE_V4DF_V4DF_INT:
9516 case V16SF_FTYPE_V16SF_V16SF_INT:
9517 case V16SF_FTYPE_V16SF_V4SF_INT:
9518 case V16SI_FTYPE_V16SI_V4SI_INT:
9519 case V4DF_FTYPE_V4DF_V2DF_INT:
9520 case V4SF_FTYPE_V4SF_V4SF_INT:
9521 case V2DI_FTYPE_V2DI_V2DI_INT:
9522 case V4DI_FTYPE_V4DI_V2DI_INT:
9523 case V2DF_FTYPE_V2DF_V2DF_INT:
9524 case UQI_FTYPE_V8DI_V8UDI_INT:
9525 case UQI_FTYPE_V8DF_V8DF_INT:
9526 case UQI_FTYPE_V2DF_V2DF_INT:
9527 case UQI_FTYPE_V4SF_V4SF_INT:
9528 case UHI_FTYPE_V16SI_V16SI_INT:
9529 case UHI_FTYPE_V16SF_V16SF_INT:
9530 case V64QI_FTYPE_V64QI_V64QI_INT:
9531 case V32HI_FTYPE_V32HI_V32HI_INT:
9532 case V16SI_FTYPE_V16SI_V16SI_INT:
9533 case V8DI_FTYPE_V8DI_V8DI_INT:
9534 nargs = 3;
9535 nargs_constant = 1;
9536 break;
9537 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9538 nargs = 3;
9539 rmode = V4DImode;
9540 nargs_constant = 1;
9541 break;
9542 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9543 nargs = 3;
9544 rmode = V2DImode;
9545 nargs_constant = 1;
9546 break;
9547 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9548 nargs = 3;
9549 rmode = DImode;
9550 nargs_constant = 1;
9551 break;
9552 case V2DI_FTYPE_V2DI_UINT_UINT:
9553 nargs = 3;
9554 nargs_constant = 2;
9555 break;
9556 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9557 nargs = 3;
9558 rmode = V8DImode;
9559 nargs_constant = 1;
9560 break;
9561 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9562 nargs = 5;
9563 rmode = V8DImode;
9564 mask_pos = 2;
9565 nargs_constant = 1;
9566 break;
9567 case QI_FTYPE_V8DF_INT_UQI:
9568 case QI_FTYPE_V4DF_INT_UQI:
9569 case QI_FTYPE_V2DF_INT_UQI:
9570 case HI_FTYPE_V16SF_INT_UHI:
9571 case QI_FTYPE_V8SF_INT_UQI:
9572 case QI_FTYPE_V4SF_INT_UQI:
9573 case V4SI_FTYPE_V4SI_V4SI_UHI:
9574 case V8SI_FTYPE_V8SI_V8SI_UHI:
9575 nargs = 3;
9576 mask_pos = 1;
9577 nargs_constant = 1;
9578 break;
9579 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9580 nargs = 5;
9581 rmode = V4DImode;
9582 mask_pos = 2;
9583 nargs_constant = 1;
9584 break;
9585 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9586 nargs = 5;
9587 rmode = V2DImode;
9588 mask_pos = 2;
9589 nargs_constant = 1;
9590 break;
9591 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9592 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9593 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9594 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9595 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9596 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9597 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9598 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9599 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9600 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9601 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9602 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9603 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9604 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9605 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9606 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9607 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9608 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9609 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9610 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9611 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9612 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9613 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9614 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9615 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9616 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9617 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9618 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9619 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9620 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9621 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9622 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9623 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9624 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9625 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9626 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9627 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9628 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9629 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9630 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9631 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9632 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9633 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9634 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9635 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9636 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9637 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9638 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9639 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9640 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9641 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9642 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9643 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9644 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9645 nargs = 4;
9646 break;
9647 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9648 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9649 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9650 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9651 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9652 nargs = 4;
9653 nargs_constant = 1;
9654 break;
9655 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9656 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9657 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9658 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9659 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9660 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9661 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9662 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9663 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9664 case USI_FTYPE_V32QI_V32QI_INT_USI:
9665 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9666 case USI_FTYPE_V32HI_V32HI_INT_USI:
9667 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9668 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9669 nargs = 4;
9670 mask_pos = 1;
9671 nargs_constant = 1;
9672 break;
9673 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9674 nargs = 4;
9675 nargs_constant = 2;
9676 break;
9677 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9678 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9679 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9680 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9681 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9682 nargs = 4;
9683 break;
9684 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9685 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9686 mask_pos = 1;
9687 nargs = 4;
9688 nargs_constant = 1;
9689 break;
9690 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9691 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9692 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9693 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9694 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9695 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9696 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9697 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9698 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9699 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9700 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9701 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9702 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9703 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9704 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9705 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9706 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9707 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9708 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9709 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9710 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9711 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9712 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9713 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9714 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9715 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9716 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9717 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9718 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9719 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9720 nargs = 4;
9721 mask_pos = 2;
9722 nargs_constant = 1;
9723 break;
9724 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9725 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9726 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9727 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9728 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9729 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9730 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9731 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9732 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9733 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9734 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9735 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9736 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9737 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9738 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9739 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9740 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9741 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9742 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9743 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9744 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9745 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9746 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9747 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9748 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9749 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9750 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9751 nargs = 5;
9752 mask_pos = 2;
9753 nargs_constant = 1;
9754 break;
9755 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9756 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9757 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9758 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9759 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9760 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9761 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9762 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9763 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9764 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9765 nargs = 5;
9766 mask_pos = 1;
9767 nargs_constant = 1;
9768 break;
9769 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9770 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9771 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9772 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9773 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9774 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9775 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9776 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9777 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9778 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9779 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9780 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9781 nargs = 5;
9782 mask_pos = 1;
9783 nargs_constant = 2;
9784 break;
9785
9786 default:
9787 gcc_unreachable ();
9788 }
9789
9790 gcc_assert (nargs <= ARRAY_SIZE (xops));
9791
9792 if (comparison != UNKNOWN)
9793 {
9794 gcc_assert (nargs == 2);
9795 return ix86_expand_sse_compare (d, exp, target, swap);
9796 }
9797
9798 if (rmode == VOIDmode || rmode == tmode)
9799 {
9800 if (optimize
9801 || target == 0
9802 || GET_MODE (target) != tmode
9803 || !insn_p->operand[0].predicate (target, tmode))
9804 target = gen_reg_rtx (tmode);
9805 else if (memory_operand (target, tmode))
9806 num_memory++;
9807 real_target = target;
9808 }
9809 else
9810 {
9811 real_target = gen_reg_rtx (tmode);
9812 target = lowpart_subreg (rmode, real_target, tmode);
9813 }
9814
9815 for (i = 0; i < nargs; i++)
9816 {
9817 tree arg = CALL_EXPR_ARG (exp, i);
9818 rtx op = expand_normal (arg);
9819 machine_mode mode = insn_p->operand[i + 1].mode;
9820 bool match = insn_p->operand[i + 1].predicate (op, mode);
9821
9822 if (second_arg_count && i == 1)
9823 {
9824 /* SIMD shift insns take either an 8-bit immediate or
9825 register as count. But builtin functions take int as
9826 count. If count doesn't match, we put it in register.
9827 The instructions are using 64-bit count, if op is just
9828 32-bit, zero-extend it, as negative shift counts
9829 are undefined behavior and zero-extension is more
9830 efficient. */
9831 if (!match)
9832 {
9833 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9834 op = convert_modes (mode, GET_MODE (op), op, 1);
9835 else
9836 op = lowpart_subreg (mode, op, GET_MODE (op));
9837 if (!insn_p->operand[i + 1].predicate (op, mode))
9838 op = copy_to_reg (op);
9839 }
9840 }
9841 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9842 (!mask_pos && (nargs - i) <= nargs_constant))
9843 {
9844 if (!match)
9845 switch (icode)
9846 {
9847 case CODE_FOR_avx_vinsertf128v4di:
9848 case CODE_FOR_avx_vextractf128v4di:
9849 error ("the last argument must be an 1-bit immediate");
9850 return const0_rtx;
9851
9852 case CODE_FOR_avx512f_cmpv8di3_mask:
9853 case CODE_FOR_avx512f_cmpv16si3_mask:
9854 case CODE_FOR_avx512f_ucmpv8di3_mask:
9855 case CODE_FOR_avx512f_ucmpv16si3_mask:
9856 case CODE_FOR_avx512vl_cmpv4di3_mask:
9857 case CODE_FOR_avx512vl_cmpv8si3_mask:
9858 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9859 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9860 case CODE_FOR_avx512vl_cmpv2di3_mask:
9861 case CODE_FOR_avx512vl_cmpv4si3_mask:
9862 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9863 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9864 error ("the last argument must be a 3-bit immediate");
9865 return const0_rtx;
9866
9867 case CODE_FOR_sse4_1_roundsd:
9868 case CODE_FOR_sse4_1_roundss:
9869
9870 case CODE_FOR_sse4_1_roundpd:
9871 case CODE_FOR_sse4_1_roundps:
9872 case CODE_FOR_avx_roundpd256:
9873 case CODE_FOR_avx_roundps256:
9874
9875 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9876 case CODE_FOR_sse4_1_roundps_sfix:
9877 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9878 case CODE_FOR_avx_roundps_sfix256:
9879
9880 case CODE_FOR_sse4_1_blendps:
9881 case CODE_FOR_avx_blendpd256:
9882 case CODE_FOR_avx_vpermilv4df:
9883 case CODE_FOR_avx_vpermilv4df_mask:
9884 case CODE_FOR_avx512f_getmantv8df_mask:
9885 case CODE_FOR_avx512f_getmantv16sf_mask:
9886 case CODE_FOR_avx512vl_getmantv8sf_mask:
9887 case CODE_FOR_avx512vl_getmantv4df_mask:
9888 case CODE_FOR_avx512vl_getmantv4sf_mask:
9889 case CODE_FOR_avx512vl_getmantv2df_mask:
9890 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9891 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9892 case CODE_FOR_avx512dq_rangepv4df_mask:
9893 case CODE_FOR_avx512dq_rangepv8sf_mask:
9894 case CODE_FOR_avx512dq_rangepv2df_mask:
9895 case CODE_FOR_avx512dq_rangepv4sf_mask:
9896 case CODE_FOR_avx_shufpd256_mask:
9897 error ("the last argument must be a 4-bit immediate");
9898 return const0_rtx;
9899
9900 case CODE_FOR_sha1rnds4:
9901 case CODE_FOR_sse4_1_blendpd:
9902 case CODE_FOR_avx_vpermilv2df:
9903 case CODE_FOR_avx_vpermilv2df_mask:
9904 case CODE_FOR_xop_vpermil2v2df3:
9905 case CODE_FOR_xop_vpermil2v4sf3:
9906 case CODE_FOR_xop_vpermil2v4df3:
9907 case CODE_FOR_xop_vpermil2v8sf3:
9908 case CODE_FOR_avx512f_vinsertf32x4_mask:
9909 case CODE_FOR_avx512f_vinserti32x4_mask:
9910 case CODE_FOR_avx512f_vextractf32x4_mask:
9911 case CODE_FOR_avx512f_vextracti32x4_mask:
9912 case CODE_FOR_sse2_shufpd:
9913 case CODE_FOR_sse2_shufpd_mask:
9914 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9915 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9916 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9917 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9918 error ("the last argument must be a 2-bit immediate");
9919 return const0_rtx;
9920
9921 case CODE_FOR_avx_vextractf128v4df:
9922 case CODE_FOR_avx_vextractf128v8sf:
9923 case CODE_FOR_avx_vextractf128v8si:
9924 case CODE_FOR_avx_vinsertf128v4df:
9925 case CODE_FOR_avx_vinsertf128v8sf:
9926 case CODE_FOR_avx_vinsertf128v8si:
9927 case CODE_FOR_avx512f_vinsertf64x4_mask:
9928 case CODE_FOR_avx512f_vinserti64x4_mask:
9929 case CODE_FOR_avx512f_vextractf64x4_mask:
9930 case CODE_FOR_avx512f_vextracti64x4_mask:
9931 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9932 case CODE_FOR_avx512dq_vinserti32x8_mask:
9933 case CODE_FOR_avx512vl_vinsertv4df:
9934 case CODE_FOR_avx512vl_vinsertv4di:
9935 case CODE_FOR_avx512vl_vinsertv8sf:
9936 case CODE_FOR_avx512vl_vinsertv8si:
9937 error ("the last argument must be a 1-bit immediate");
9938 return const0_rtx;
9939
9940 case CODE_FOR_avx_vmcmpv2df3:
9941 case CODE_FOR_avx_vmcmpv4sf3:
9942 case CODE_FOR_avx_cmpv2df3:
9943 case CODE_FOR_avx_cmpv4sf3:
9944 case CODE_FOR_avx_cmpv4df3:
9945 case CODE_FOR_avx_cmpv8sf3:
9946 case CODE_FOR_avx512f_cmpv8df3_mask:
9947 case CODE_FOR_avx512f_cmpv16sf3_mask:
9948 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9949 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9950 error ("the last argument must be a 5-bit immediate");
9951 return const0_rtx;
9952
9953 default:
9954 switch (nargs_constant)
9955 {
9956 case 2:
9957 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9958 (!mask_pos && (nargs - i) == nargs_constant))
9959 {
9960 error ("the next to last argument must be an 8-bit immediate");
9961 break;
9962 }
9963 /* FALLTHRU */
9964 case 1:
9965 error ("the last argument must be an 8-bit immediate");
9966 break;
9967 default:
9968 gcc_unreachable ();
9969 }
9970 return const0_rtx;
9971 }
9972 }
9973 else
9974 {
9975 if (VECTOR_MODE_P (mode))
9976 op = safe_vector_operand (op, mode);
9977
9978 /* If we aren't optimizing, only allow one memory operand to
9979 be generated. */
9980 if (memory_operand (op, mode))
9981 num_memory++;
9982
9983 op = fixup_modeless_constant (op, mode);
9984
9985 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9986 {
9987 if (optimize || !match || num_memory > 1)
9988 op = copy_to_mode_reg (mode, op);
9989 }
9990 else
9991 {
9992 op = copy_to_reg (op);
9993 op = lowpart_subreg (mode, op, GET_MODE (op));
9994 }
9995 }
9996
9997 xops[i] = op;
9998 }
9999
10000 switch (nargs)
10001 {
10002 case 1:
10003 pat = GEN_FCN (icode) (real_target, xops[0]);
10004 break;
10005 case 2:
10006 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
10007 break;
10008 case 3:
10009 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
10010 break;
10011 case 4:
10012 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10013 xops[2], xops[3]);
10014 break;
10015 case 5:
10016 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10017 xops[2], xops[3], xops[4]);
10018 break;
10019 case 6:
10020 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10021 xops[2], xops[3], xops[4], xops[5]);
10022 break;
10023 default:
10024 gcc_unreachable ();
10025 }
10026
10027 if (! pat)
10028 return 0;
10029
10030 emit_insn (pat);
10031 return target;
10032 }
10033
10034 /* Transform pattern of following layout:
10035 (set A
10036 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10037 )
10038 into:
10039 (set (A B)) */
10040
10041 static rtx
10042 ix86_erase_embedded_rounding (rtx pat)
10043 {
10044 if (GET_CODE (pat) == INSN)
10045 pat = PATTERN (pat);
10046
10047 gcc_assert (GET_CODE (pat) == SET);
10048 rtx src = SET_SRC (pat);
10049 gcc_assert (XVECLEN (src, 0) == 2);
10050 rtx p0 = XVECEXP (src, 0, 0);
10051 gcc_assert (GET_CODE (src) == UNSPEC
10052 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10053 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10054 return res;
10055 }
10056
10057 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
10058 with rounding. */
10059 static rtx
10060 ix86_expand_sse_comi_round (const struct builtin_description *d,
10061 tree exp, rtx target)
10062 {
10063 rtx pat, set_dst;
10064 tree arg0 = CALL_EXPR_ARG (exp, 0);
10065 tree arg1 = CALL_EXPR_ARG (exp, 1);
10066 tree arg2 = CALL_EXPR_ARG (exp, 2);
10067 tree arg3 = CALL_EXPR_ARG (exp, 3);
10068 rtx op0 = expand_normal (arg0);
10069 rtx op1 = expand_normal (arg1);
10070 rtx op2 = expand_normal (arg2);
10071 rtx op3 = expand_normal (arg3);
10072 enum insn_code icode = d->icode;
10073 const struct insn_data_d *insn_p = &insn_data[icode];
10074 machine_mode mode0 = insn_p->operand[0].mode;
10075 machine_mode mode1 = insn_p->operand[1].mode;
10076
10077 /* See avxintrin.h for values. */
10078 static const enum rtx_code comparisons[32] =
10079 {
10080 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10081 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10082 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10083 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10084 };
10085 static const bool ordereds[32] =
10086 {
10087 true, true, true, false, false, false, false, true,
10088 false, false, false, true, true, true, true, false,
10089 true, true, true, false, false, false, false, true,
10090 false, false, false, true, true, true, true, false
10091 };
10092 static const bool non_signalings[32] =
10093 {
10094 true, false, false, true, true, false, false, true,
10095 true, false, false, true, true, false, false, true,
10096 false, true, true, false, false, true, true, false,
10097 false, true, true, false, false, true, true, false
10098 };
10099
10100 if (!CONST_INT_P (op2))
10101 {
10102 error ("the third argument must be comparison constant");
10103 return const0_rtx;
10104 }
10105 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10106 {
10107 error ("incorrect comparison mode");
10108 return const0_rtx;
10109 }
10110
10111 if (!insn_p->operand[2].predicate (op3, SImode))
10112 {
10113 error ("incorrect rounding operand");
10114 return const0_rtx;
10115 }
10116
10117 if (VECTOR_MODE_P (mode0))
10118 op0 = safe_vector_operand (op0, mode0);
10119 if (VECTOR_MODE_P (mode1))
10120 op1 = safe_vector_operand (op1, mode1);
10121
10122 enum rtx_code comparison = comparisons[INTVAL (op2)];
10123 bool ordered = ordereds[INTVAL (op2)];
10124 bool non_signaling = non_signalings[INTVAL (op2)];
10125 rtx const_val = const0_rtx;
10126
10127 bool check_unordered = false;
10128 machine_mode mode = CCFPmode;
10129 switch (comparison)
10130 {
10131 case ORDERED:
10132 if (!ordered)
10133 {
10134 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10135 if (!non_signaling)
10136 ordered = true;
10137 mode = CCSmode;
10138 }
10139 else
10140 {
10141 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10142 if (non_signaling)
10143 ordered = false;
10144 mode = CCPmode;
10145 }
10146 comparison = NE;
10147 break;
10148 case UNORDERED:
10149 if (ordered)
10150 {
10151 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10152 if (non_signaling)
10153 ordered = false;
10154 mode = CCSmode;
10155 }
10156 else
10157 {
10158 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10159 if (!non_signaling)
10160 ordered = true;
10161 mode = CCPmode;
10162 }
10163 comparison = EQ;
10164 break;
10165
10166 case LE: /* -> GE */
10167 case LT: /* -> GT */
10168 case UNGE: /* -> UNLE */
10169 case UNGT: /* -> UNLT */
10170 std::swap (op0, op1);
10171 comparison = swap_condition (comparison);
10172 /* FALLTHRU */
10173 case GT:
10174 case GE:
10175 case UNEQ:
10176 case UNLT:
10177 case UNLE:
10178 case LTGT:
10179 /* These are supported by CCFPmode. NB: Use ordered/signaling
10180 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10181 with NAN operands. */
10182 if (ordered == non_signaling)
10183 ordered = !ordered;
10184 break;
10185 case EQ:
10186 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10187 _CMP_EQ_OQ/_CMP_EQ_OS. */
10188 check_unordered = true;
10189 mode = CCZmode;
10190 break;
10191 case NE:
10192 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10193 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10194 gcc_assert (!ordered);
10195 check_unordered = true;
10196 mode = CCZmode;
10197 const_val = const1_rtx;
10198 break;
10199 default:
10200 gcc_unreachable ();
10201 }
10202
10203 target = gen_reg_rtx (SImode);
10204 emit_move_insn (target, const_val);
10205 target = gen_rtx_SUBREG (QImode, target, 0);
10206
10207 if ((optimize && !register_operand (op0, mode0))
10208 || !insn_p->operand[0].predicate (op0, mode0))
10209 op0 = copy_to_mode_reg (mode0, op0);
10210 if ((optimize && !register_operand (op1, mode1))
10211 || !insn_p->operand[1].predicate (op1, mode1))
10212 op1 = copy_to_mode_reg (mode1, op1);
10213
10214 /*
10215 1. COMI: ordered and signaling.
10216 2. UCOMI: unordered and non-signaling.
10217 */
10218 if (non_signaling)
10219 icode = (icode == CODE_FOR_sse_comi_round
10220 ? CODE_FOR_sse_ucomi_round
10221 : CODE_FOR_sse2_ucomi_round);
10222
10223 pat = GEN_FCN (icode) (op0, op1, op3);
10224 if (! pat)
10225 return 0;
10226
10227 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10228 if (INTVAL (op3) == NO_ROUND)
10229 {
10230 pat = ix86_erase_embedded_rounding (pat);
10231 if (! pat)
10232 return 0;
10233
10234 set_dst = SET_DEST (pat);
10235 }
10236 else
10237 {
10238 gcc_assert (GET_CODE (pat) == SET);
10239 set_dst = SET_DEST (pat);
10240 }
10241
10242 emit_insn (pat);
10243
10244 rtx_code_label *label = NULL;
10245
10246 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10247 with NAN operands. */
10248 if (check_unordered)
10249 {
10250 gcc_assert (comparison == EQ || comparison == NE);
10251
10252 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10253 label = gen_label_rtx ();
10254 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10255 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10256 gen_rtx_LABEL_REF (VOIDmode, label),
10257 pc_rtx);
10258 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10259 }
10260
10261 /* NB: Set CCFPmode and check a different CCmode which is in subset
10262 of CCFPmode. */
10263 if (GET_MODE (set_dst) != mode)
10264 {
10265 gcc_assert (mode == CCAmode || mode == CCCmode
10266 || mode == CCOmode || mode == CCPmode
10267 || mode == CCSmode || mode == CCZmode);
10268 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10269 }
10270
10271 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10272 gen_rtx_fmt_ee (comparison, QImode,
10273 set_dst,
10274 const0_rtx)));
10275
10276 if (label)
10277 emit_label (label);
10278
10279 return SUBREG_REG (target);
10280 }
10281
10282 static rtx
10283 ix86_expand_round_builtin (const struct builtin_description *d,
10284 tree exp, rtx target)
10285 {
10286 rtx pat;
10287 unsigned int i, nargs;
10288 rtx xops[6];
10289 enum insn_code icode = d->icode;
10290 const struct insn_data_d *insn_p = &insn_data[icode];
10291 machine_mode tmode = insn_p->operand[0].mode;
10292 unsigned int nargs_constant = 0;
10293 unsigned int redundant_embed_rnd = 0;
10294
10295 switch ((enum ix86_builtin_func_type) d->flag)
10296 {
10297 case UINT64_FTYPE_V2DF_INT:
10298 case UINT64_FTYPE_V4SF_INT:
10299 case UINT_FTYPE_V2DF_INT:
10300 case UINT_FTYPE_V4SF_INT:
10301 case INT64_FTYPE_V2DF_INT:
10302 case INT64_FTYPE_V4SF_INT:
10303 case INT_FTYPE_V2DF_INT:
10304 case INT_FTYPE_V4SF_INT:
10305 nargs = 2;
10306 break;
10307 case V4SF_FTYPE_V4SF_UINT_INT:
10308 case V4SF_FTYPE_V4SF_UINT64_INT:
10309 case V2DF_FTYPE_V2DF_UINT64_INT:
10310 case V4SF_FTYPE_V4SF_INT_INT:
10311 case V4SF_FTYPE_V4SF_INT64_INT:
10312 case V2DF_FTYPE_V2DF_INT64_INT:
10313 case V4SF_FTYPE_V4SF_V4SF_INT:
10314 case V2DF_FTYPE_V2DF_V2DF_INT:
10315 case V4SF_FTYPE_V4SF_V2DF_INT:
10316 case V2DF_FTYPE_V2DF_V4SF_INT:
10317 nargs = 3;
10318 break;
10319 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10320 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10321 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10322 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10323 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10324 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10325 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10326 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10327 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10328 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10329 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10330 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10331 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10332 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10333 nargs = 4;
10334 break;
10335 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10336 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10337 nargs_constant = 2;
10338 nargs = 4;
10339 break;
10340 case INT_FTYPE_V4SF_V4SF_INT_INT:
10341 case INT_FTYPE_V2DF_V2DF_INT_INT:
10342 return ix86_expand_sse_comi_round (d, exp, target);
10343 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10344 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10345 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10346 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10347 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10348 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10349 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
10350 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10351 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10352 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
10353 nargs = 5;
10354 break;
10355 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10356 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10357 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10358 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
10359 nargs_constant = 4;
10360 nargs = 5;
10361 break;
10362 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10363 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10364 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10365 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10366 nargs_constant = 3;
10367 nargs = 5;
10368 break;
10369 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10370 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10371 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10372 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10373 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10374 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10375 nargs = 6;
10376 nargs_constant = 4;
10377 break;
10378 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10379 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10380 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10381 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10382 nargs = 6;
10383 nargs_constant = 3;
10384 break;
10385 default:
10386 gcc_unreachable ();
10387 }
10388 gcc_assert (nargs <= ARRAY_SIZE (xops));
10389
10390 if (optimize
10391 || target == 0
10392 || GET_MODE (target) != tmode
10393 || !insn_p->operand[0].predicate (target, tmode))
10394 target = gen_reg_rtx (tmode);
10395
10396 for (i = 0; i < nargs; i++)
10397 {
10398 tree arg = CALL_EXPR_ARG (exp, i);
10399 rtx op = expand_normal (arg);
10400 machine_mode mode = insn_p->operand[i + 1].mode;
10401 bool match = insn_p->operand[i + 1].predicate (op, mode);
10402
10403 if (i == nargs - nargs_constant)
10404 {
10405 if (!match)
10406 {
10407 switch (icode)
10408 {
10409 case CODE_FOR_avx512f_getmantv8df_mask_round:
10410 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10411 case CODE_FOR_avx512f_vgetmantv2df_round:
10412 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10413 case CODE_FOR_avx512f_vgetmantv4sf_round:
10414 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10415 error ("the immediate argument must be a 4-bit immediate");
10416 return const0_rtx;
10417 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10418 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10419 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10420 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10421 error ("the immediate argument must be a 5-bit immediate");
10422 return const0_rtx;
10423 default:
10424 error ("the immediate argument must be an 8-bit immediate");
10425 return const0_rtx;
10426 }
10427 }
10428 }
10429 else if (i == nargs-1)
10430 {
10431 if (!insn_p->operand[nargs].predicate (op, SImode))
10432 {
10433 error ("incorrect rounding operand");
10434 return const0_rtx;
10435 }
10436
10437 /* If there is no rounding use normal version of the pattern. */
10438 if (INTVAL (op) == NO_ROUND)
10439 redundant_embed_rnd = 1;
10440 }
10441 else
10442 {
10443 if (VECTOR_MODE_P (mode))
10444 op = safe_vector_operand (op, mode);
10445
10446 op = fixup_modeless_constant (op, mode);
10447
10448 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10449 {
10450 if (optimize || !match)
10451 op = copy_to_mode_reg (mode, op);
10452 }
10453 else
10454 {
10455 op = copy_to_reg (op);
10456 op = lowpart_subreg (mode, op, GET_MODE (op));
10457 }
10458 }
10459
10460 xops[i] = op;
10461 }
10462
10463 switch (nargs)
10464 {
10465 case 1:
10466 pat = GEN_FCN (icode) (target, xops[0]);
10467 break;
10468 case 2:
10469 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10470 break;
10471 case 3:
10472 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10473 break;
10474 case 4:
10475 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10476 xops[2], xops[3]);
10477 break;
10478 case 5:
10479 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10480 xops[2], xops[3], xops[4]);
10481 break;
10482 case 6:
10483 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10484 xops[2], xops[3], xops[4], xops[5]);
10485 break;
10486 default:
10487 gcc_unreachable ();
10488 }
10489
10490 if (!pat)
10491 return 0;
10492
10493 if (redundant_embed_rnd)
10494 pat = ix86_erase_embedded_rounding (pat);
10495
10496 emit_insn (pat);
10497 return target;
10498 }
10499
10500 /* Subroutine of ix86_expand_builtin to take care of special insns
10501 with variable number of operands. */
10502
10503 static rtx
10504 ix86_expand_special_args_builtin (const struct builtin_description *d,
10505 tree exp, rtx target)
10506 {
10507 tree arg;
10508 rtx pat, op;
10509 unsigned int i, nargs, arg_adjust, memory;
10510 bool aligned_mem = false;
10511 rtx xops[3];
10512 enum insn_code icode = d->icode;
10513 const struct insn_data_d *insn_p = &insn_data[icode];
10514 machine_mode tmode = insn_p->operand[0].mode;
10515 enum { load, store } klass;
10516
10517 switch ((enum ix86_builtin_func_type) d->flag)
10518 {
10519 case VOID_FTYPE_VOID:
10520 emit_insn (GEN_FCN (icode) (target));
10521 return 0;
10522 case VOID_FTYPE_UINT64:
10523 case VOID_FTYPE_UNSIGNED:
10524 nargs = 0;
10525 klass = store;
10526 memory = 0;
10527 break;
10528
10529 case INT_FTYPE_VOID:
10530 case USHORT_FTYPE_VOID:
10531 case UINT64_FTYPE_VOID:
10532 case UINT_FTYPE_VOID:
10533 case UINT8_FTYPE_VOID:
10534 case UNSIGNED_FTYPE_VOID:
10535 nargs = 0;
10536 klass = load;
10537 memory = 0;
10538 break;
10539 case UINT64_FTYPE_PUNSIGNED:
10540 case V2DI_FTYPE_PV2DI:
10541 case V4DI_FTYPE_PV4DI:
10542 case V32QI_FTYPE_PCCHAR:
10543 case V16QI_FTYPE_PCCHAR:
10544 case V8SF_FTYPE_PCV4SF:
10545 case V8SF_FTYPE_PCFLOAT:
10546 case V4SF_FTYPE_PCFLOAT:
10547 case V4DF_FTYPE_PCV2DF:
10548 case V4DF_FTYPE_PCDOUBLE:
10549 case V2DF_FTYPE_PCDOUBLE:
10550 case VOID_FTYPE_PVOID:
10551 case V8DI_FTYPE_PV8DI:
10552 nargs = 1;
10553 klass = load;
10554 memory = 0;
10555 switch (icode)
10556 {
10557 case CODE_FOR_sse4_1_movntdqa:
10558 case CODE_FOR_avx2_movntdqa:
10559 case CODE_FOR_avx512f_movntdqa:
10560 aligned_mem = true;
10561 break;
10562 default:
10563 break;
10564 }
10565 break;
10566 case VOID_FTYPE_PV2SF_V4SF:
10567 case VOID_FTYPE_PV8DI_V8DI:
10568 case VOID_FTYPE_PV4DI_V4DI:
10569 case VOID_FTYPE_PV2DI_V2DI:
10570 case VOID_FTYPE_PCHAR_V32QI:
10571 case VOID_FTYPE_PCHAR_V16QI:
10572 case VOID_FTYPE_PFLOAT_V16SF:
10573 case VOID_FTYPE_PFLOAT_V8SF:
10574 case VOID_FTYPE_PFLOAT_V4SF:
10575 case VOID_FTYPE_PDOUBLE_V8DF:
10576 case VOID_FTYPE_PDOUBLE_V4DF:
10577 case VOID_FTYPE_PDOUBLE_V2DF:
10578 case VOID_FTYPE_PLONGLONG_LONGLONG:
10579 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10580 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10581 case VOID_FTYPE_PINT_INT:
10582 nargs = 1;
10583 klass = store;
10584 /* Reserve memory operand for target. */
10585 memory = ARRAY_SIZE (xops);
10586 switch (icode)
10587 {
10588 /* These builtins and instructions require the memory
10589 to be properly aligned. */
10590 case CODE_FOR_avx_movntv4di:
10591 case CODE_FOR_sse2_movntv2di:
10592 case CODE_FOR_avx_movntv8sf:
10593 case CODE_FOR_sse_movntv4sf:
10594 case CODE_FOR_sse4a_vmmovntv4sf:
10595 case CODE_FOR_avx_movntv4df:
10596 case CODE_FOR_sse2_movntv2df:
10597 case CODE_FOR_sse4a_vmmovntv2df:
10598 case CODE_FOR_sse2_movntidi:
10599 case CODE_FOR_sse_movntq:
10600 case CODE_FOR_sse2_movntisi:
10601 case CODE_FOR_avx512f_movntv16sf:
10602 case CODE_FOR_avx512f_movntv8df:
10603 case CODE_FOR_avx512f_movntv8di:
10604 aligned_mem = true;
10605 break;
10606 default:
10607 break;
10608 }
10609 break;
10610 case VOID_FTYPE_PVOID_PCVOID:
10611 nargs = 1;
10612 klass = store;
10613 memory = 0;
10614
10615 break;
10616 case V4SF_FTYPE_V4SF_PCV2SF:
10617 case V2DF_FTYPE_V2DF_PCDOUBLE:
10618 nargs = 2;
10619 klass = load;
10620 memory = 1;
10621 break;
10622 case V8SF_FTYPE_PCV8SF_V8SI:
10623 case V4DF_FTYPE_PCV4DF_V4DI:
10624 case V4SF_FTYPE_PCV4SF_V4SI:
10625 case V2DF_FTYPE_PCV2DF_V2DI:
10626 case V8SI_FTYPE_PCV8SI_V8SI:
10627 case V4DI_FTYPE_PCV4DI_V4DI:
10628 case V4SI_FTYPE_PCV4SI_V4SI:
10629 case V2DI_FTYPE_PCV2DI_V2DI:
10630 case VOID_FTYPE_INT_INT64:
10631 nargs = 2;
10632 klass = load;
10633 memory = 0;
10634 break;
10635 case VOID_FTYPE_PV8DF_V8DF_UQI:
10636 case VOID_FTYPE_PV4DF_V4DF_UQI:
10637 case VOID_FTYPE_PV2DF_V2DF_UQI:
10638 case VOID_FTYPE_PV16SF_V16SF_UHI:
10639 case VOID_FTYPE_PV8SF_V8SF_UQI:
10640 case VOID_FTYPE_PV4SF_V4SF_UQI:
10641 case VOID_FTYPE_PV8DI_V8DI_UQI:
10642 case VOID_FTYPE_PV4DI_V4DI_UQI:
10643 case VOID_FTYPE_PV2DI_V2DI_UQI:
10644 case VOID_FTYPE_PV16SI_V16SI_UHI:
10645 case VOID_FTYPE_PV8SI_V8SI_UQI:
10646 case VOID_FTYPE_PV4SI_V4SI_UQI:
10647 case VOID_FTYPE_PV64QI_V64QI_UDI:
10648 case VOID_FTYPE_PV32HI_V32HI_USI:
10649 case VOID_FTYPE_PV32QI_V32QI_USI:
10650 case VOID_FTYPE_PV16QI_V16QI_UHI:
10651 case VOID_FTYPE_PV16HI_V16HI_UHI:
10652 case VOID_FTYPE_PV8HI_V8HI_UQI:
10653 switch (icode)
10654 {
10655 /* These builtins and instructions require the memory
10656 to be properly aligned. */
10657 case CODE_FOR_avx512f_storev16sf_mask:
10658 case CODE_FOR_avx512f_storev16si_mask:
10659 case CODE_FOR_avx512f_storev8df_mask:
10660 case CODE_FOR_avx512f_storev8di_mask:
10661 case CODE_FOR_avx512vl_storev8sf_mask:
10662 case CODE_FOR_avx512vl_storev8si_mask:
10663 case CODE_FOR_avx512vl_storev4df_mask:
10664 case CODE_FOR_avx512vl_storev4di_mask:
10665 case CODE_FOR_avx512vl_storev4sf_mask:
10666 case CODE_FOR_avx512vl_storev4si_mask:
10667 case CODE_FOR_avx512vl_storev2df_mask:
10668 case CODE_FOR_avx512vl_storev2di_mask:
10669 aligned_mem = true;
10670 break;
10671 default:
10672 break;
10673 }
10674 /* FALLTHRU */
10675 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10676 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10677 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10678 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10679 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10680 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10681 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10682 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10683 case VOID_FTYPE_PV8SI_V8DI_UQI:
10684 case VOID_FTYPE_PV8HI_V8DI_UQI:
10685 case VOID_FTYPE_PV16HI_V16SI_UHI:
10686 case VOID_FTYPE_PUDI_V8DI_UQI:
10687 case VOID_FTYPE_PV16QI_V16SI_UHI:
10688 case VOID_FTYPE_PV4SI_V4DI_UQI:
10689 case VOID_FTYPE_PUDI_V2DI_UQI:
10690 case VOID_FTYPE_PUDI_V4DI_UQI:
10691 case VOID_FTYPE_PUSI_V2DI_UQI:
10692 case VOID_FTYPE_PV8HI_V8SI_UQI:
10693 case VOID_FTYPE_PUDI_V4SI_UQI:
10694 case VOID_FTYPE_PUSI_V4DI_UQI:
10695 case VOID_FTYPE_PUHI_V2DI_UQI:
10696 case VOID_FTYPE_PUDI_V8SI_UQI:
10697 case VOID_FTYPE_PUSI_V4SI_UQI:
10698 case VOID_FTYPE_PCHAR_V64QI_UDI:
10699 case VOID_FTYPE_PCHAR_V32QI_USI:
10700 case VOID_FTYPE_PCHAR_V16QI_UHI:
10701 case VOID_FTYPE_PSHORT_V32HI_USI:
10702 case VOID_FTYPE_PSHORT_V16HI_UHI:
10703 case VOID_FTYPE_PSHORT_V8HI_UQI:
10704 case VOID_FTYPE_PINT_V16SI_UHI:
10705 case VOID_FTYPE_PINT_V8SI_UQI:
10706 case VOID_FTYPE_PINT_V4SI_UQI:
10707 case VOID_FTYPE_PINT64_V8DI_UQI:
10708 case VOID_FTYPE_PINT64_V4DI_UQI:
10709 case VOID_FTYPE_PINT64_V2DI_UQI:
10710 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10711 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10712 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10713 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10714 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10715 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10716 case VOID_FTYPE_PV32QI_V32HI_USI:
10717 case VOID_FTYPE_PV16QI_V16HI_UHI:
10718 case VOID_FTYPE_PUDI_V8HI_UQI:
10719 nargs = 2;
10720 klass = store;
10721 /* Reserve memory operand for target. */
10722 memory = ARRAY_SIZE (xops);
10723 break;
10724 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10725 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10726 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10727 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10728 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10729 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10730 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10731 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10732 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10733 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10734 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10735 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10736 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10737 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10738 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10739 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10740 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10741 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10742 switch (icode)
10743 {
10744 /* These builtins and instructions require the memory
10745 to be properly aligned. */
10746 case CODE_FOR_avx512f_loadv16sf_mask:
10747 case CODE_FOR_avx512f_loadv16si_mask:
10748 case CODE_FOR_avx512f_loadv8df_mask:
10749 case CODE_FOR_avx512f_loadv8di_mask:
10750 case CODE_FOR_avx512vl_loadv8sf_mask:
10751 case CODE_FOR_avx512vl_loadv8si_mask:
10752 case CODE_FOR_avx512vl_loadv4df_mask:
10753 case CODE_FOR_avx512vl_loadv4di_mask:
10754 case CODE_FOR_avx512vl_loadv4sf_mask:
10755 case CODE_FOR_avx512vl_loadv4si_mask:
10756 case CODE_FOR_avx512vl_loadv2df_mask:
10757 case CODE_FOR_avx512vl_loadv2di_mask:
10758 case CODE_FOR_avx512bw_loadv64qi_mask:
10759 case CODE_FOR_avx512vl_loadv32qi_mask:
10760 case CODE_FOR_avx512vl_loadv16qi_mask:
10761 case CODE_FOR_avx512bw_loadv32hi_mask:
10762 case CODE_FOR_avx512vl_loadv16hi_mask:
10763 case CODE_FOR_avx512vl_loadv8hi_mask:
10764 aligned_mem = true;
10765 break;
10766 default:
10767 break;
10768 }
10769 /* FALLTHRU */
10770 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10771 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10772 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10773 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10774 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10775 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10776 case V16SI_FTYPE_PCINT_V16SI_UHI:
10777 case V8SI_FTYPE_PCINT_V8SI_UQI:
10778 case V4SI_FTYPE_PCINT_V4SI_UQI:
10779 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10780 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10781 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10782 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10783 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10784 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10785 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10786 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10787 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10788 nargs = 3;
10789 klass = load;
10790 memory = 0;
10791 break;
10792 default:
10793 gcc_unreachable ();
10794 }
10795
10796 gcc_assert (nargs <= ARRAY_SIZE (xops));
10797
10798 if (klass == store)
10799 {
10800 arg = CALL_EXPR_ARG (exp, 0);
10801 op = expand_normal (arg);
10802 gcc_assert (target == 0);
10803 if (memory)
10804 {
10805 op = ix86_zero_extend_to_Pmode (op);
10806 target = gen_rtx_MEM (tmode, op);
10807 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10808 on it. Try to improve it using get_pointer_alignment,
10809 and if the special builtin is one that requires strict
10810 mode alignment, also from it's GET_MODE_ALIGNMENT.
10811 Failure to do so could lead to ix86_legitimate_combined_insn
10812 rejecting all changes to such insns. */
10813 unsigned int align = get_pointer_alignment (arg);
10814 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10815 align = GET_MODE_ALIGNMENT (tmode);
10816 if (MEM_ALIGN (target) < align)
10817 set_mem_align (target, align);
10818 }
10819 else
10820 target = force_reg (tmode, op);
10821 arg_adjust = 1;
10822 }
10823 else
10824 {
10825 arg_adjust = 0;
10826 if (optimize
10827 || target == 0
10828 || !register_operand (target, tmode)
10829 || GET_MODE (target) != tmode)
10830 target = gen_reg_rtx (tmode);
10831 }
10832
10833 for (i = 0; i < nargs; i++)
10834 {
10835 machine_mode mode = insn_p->operand[i + 1].mode;
10836
10837 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10838 op = expand_normal (arg);
10839
10840 if (i == memory)
10841 {
10842 /* This must be the memory operand. */
10843 op = ix86_zero_extend_to_Pmode (op);
10844 op = gen_rtx_MEM (mode, op);
10845 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10846 on it. Try to improve it using get_pointer_alignment,
10847 and if the special builtin is one that requires strict
10848 mode alignment, also from it's GET_MODE_ALIGNMENT.
10849 Failure to do so could lead to ix86_legitimate_combined_insn
10850 rejecting all changes to such insns. */
10851 unsigned int align = get_pointer_alignment (arg);
10852 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10853 align = GET_MODE_ALIGNMENT (mode);
10854 if (MEM_ALIGN (op) < align)
10855 set_mem_align (op, align);
10856 }
10857 else
10858 {
10859 /* This must be register. */
10860 if (VECTOR_MODE_P (mode))
10861 op = safe_vector_operand (op, mode);
10862
10863 op = fixup_modeless_constant (op, mode);
10864
10865 /* NB: 3-operands load implied it's a mask load,
10866 and that mask operand shoud be at the end.
10867 Keep all-ones mask which would be simplified by the expander. */
10868 if (nargs == 3 && i == 2 && klass == load
10869 && constm1_operand (op, mode))
10870 ;
10871 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10872 op = copy_to_mode_reg (mode, op);
10873 else
10874 {
10875 op = copy_to_reg (op);
10876 op = lowpart_subreg (mode, op, GET_MODE (op));
10877 }
10878 }
10879
10880 xops[i]= op;
10881 }
10882
10883 switch (nargs)
10884 {
10885 case 0:
10886 pat = GEN_FCN (icode) (target);
10887 break;
10888 case 1:
10889 pat = GEN_FCN (icode) (target, xops[0]);
10890 break;
10891 case 2:
10892 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10893 break;
10894 case 3:
10895 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10896 break;
10897 default:
10898 gcc_unreachable ();
10899 }
10900
10901 if (! pat)
10902 return 0;
10903
10904 emit_insn (pat);
10905 return klass == store ? 0 : target;
10906 }
10907
10908 /* Return the integer constant in ARG. Constrain it to be in the range
10909 of the subparts of VEC_TYPE; issue an error if not. */
10910
10911 static int
10912 get_element_number (tree vec_type, tree arg)
10913 {
10914 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10915
10916 if (!tree_fits_uhwi_p (arg)
10917 || (elt = tree_to_uhwi (arg), elt > max))
10918 {
10919 error ("selector must be an integer constant in the range "
10920 "[0, %wi]", max);
10921 return 0;
10922 }
10923
10924 return elt;
10925 }
10926
10927 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10928 ix86_expand_vector_init. We DO have language-level syntax for this, in
10929 the form of (type){ init-list }. Except that since we can't place emms
10930 instructions from inside the compiler, we can't allow the use of MMX
10931 registers unless the user explicitly asks for it. So we do *not* define
10932 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10933 we have builtins invoked by mmintrin.h that gives us license to emit
10934 these sorts of instructions. */
10935
10936 static rtx
10937 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10938 {
10939 machine_mode tmode = TYPE_MODE (type);
10940 machine_mode inner_mode = GET_MODE_INNER (tmode);
10941 int i, n_elt = GET_MODE_NUNITS (tmode);
10942 rtvec v = rtvec_alloc (n_elt);
10943
10944 gcc_assert (VECTOR_MODE_P (tmode));
10945 gcc_assert (call_expr_nargs (exp) == n_elt);
10946
10947 for (i = 0; i < n_elt; ++i)
10948 {
10949 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10950 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10951 }
10952
10953 if (!target || !register_operand (target, tmode))
10954 target = gen_reg_rtx (tmode);
10955
10956 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10957 return target;
10958 }
10959
10960 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10961 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10962 had a language-level syntax for referencing vector elements. */
10963
10964 static rtx
10965 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10966 {
10967 machine_mode tmode, mode0;
10968 tree arg0, arg1;
10969 int elt;
10970 rtx op0;
10971
10972 arg0 = CALL_EXPR_ARG (exp, 0);
10973 arg1 = CALL_EXPR_ARG (exp, 1);
10974
10975 op0 = expand_normal (arg0);
10976 elt = get_element_number (TREE_TYPE (arg0), arg1);
10977
10978 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10979 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10980 gcc_assert (VECTOR_MODE_P (mode0));
10981
10982 op0 = force_reg (mode0, op0);
10983
10984 if (optimize || !target || !register_operand (target, tmode))
10985 target = gen_reg_rtx (tmode);
10986
10987 ix86_expand_vector_extract (true, target, op0, elt);
10988
10989 return target;
10990 }
10991
10992 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10993 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10994 a language-level syntax for referencing vector elements. */
10995
10996 static rtx
10997 ix86_expand_vec_set_builtin (tree exp)
10998 {
10999 machine_mode tmode, mode1;
11000 tree arg0, arg1, arg2;
11001 int elt;
11002 rtx op0, op1, target;
11003
11004 arg0 = CALL_EXPR_ARG (exp, 0);
11005 arg1 = CALL_EXPR_ARG (exp, 1);
11006 arg2 = CALL_EXPR_ARG (exp, 2);
11007
11008 tmode = TYPE_MODE (TREE_TYPE (arg0));
11009 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
11010 gcc_assert (VECTOR_MODE_P (tmode));
11011
11012 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
11013 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
11014 elt = get_element_number (TREE_TYPE (arg0), arg2);
11015
11016 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
11017 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
11018
11019 op0 = force_reg (tmode, op0);
11020 op1 = force_reg (mode1, op1);
11021
11022 /* OP0 is the source of these builtin functions and shouldn't be
11023 modified. Create a copy, use it and return it as target. */
11024 target = gen_reg_rtx (tmode);
11025 emit_move_insn (target, op0);
11026 ix86_expand_vector_set (true, target, op1, elt);
11027
11028 return target;
11029 }
11030
11031 /* Expand an expression EXP that calls a built-in function,
11032 with result going to TARGET if that's convenient
11033 (and in mode MODE if that's convenient).
11034 SUBTARGET may be used as the target for computing one of EXP's operands.
11035 IGNORE is nonzero if the value is to be ignored. */
11036
11037 rtx
11038 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11039 machine_mode mode, int ignore)
11040 {
11041 size_t i;
11042 enum insn_code icode, icode2;
11043 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11044 tree arg0, arg1, arg2, arg3, arg4;
11045 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11046 machine_mode mode0, mode1, mode2, mode3, mode4;
11047 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
11048
11049 /* For CPU builtins that can be folded, fold first and expand the fold. */
11050 switch (fcode)
11051 {
11052 case IX86_BUILTIN_CPU_INIT:
11053 {
11054 /* Make it call __cpu_indicator_init in libgcc. */
11055 tree call_expr, fndecl, type;
11056 type = build_function_type_list (integer_type_node, NULL_TREE);
11057 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11058 call_expr = build_call_expr (fndecl, 0);
11059 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11060 }
11061 case IX86_BUILTIN_CPU_IS:
11062 case IX86_BUILTIN_CPU_SUPPORTS:
11063 {
11064 tree arg0 = CALL_EXPR_ARG (exp, 0);
11065 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11066 gcc_assert (fold_expr != NULL_TREE);
11067 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11068 }
11069 }
11070
11071 HOST_WIDE_INT isa = ix86_isa_flags;
11072 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11073 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11074 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11075 /* The general case is we require all the ISAs specified in bisa{,2}
11076 to be enabled.
11077 The exceptions are:
11078 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11079 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11080 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11081 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11082 OPTION_MASK_ISA2_AVXVNNI
11083 where for each such pair it is sufficient if either of the ISAs is
11084 enabled, plus if it is ored with other options also those others.
11085 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
11086 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11087 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11088 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11089 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11090
11091 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11092 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11093 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11094 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11095
11096 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11097 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11098 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11099 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11100
11101 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11102 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11103 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11104 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11105 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11106 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11107 {
11108 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11109 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11110 }
11111
11112 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11113 /* __builtin_ia32_maskmovq requires MMX registers. */
11114 && fcode != IX86_BUILTIN_MASKMOVQ)
11115 {
11116 bisa &= ~OPTION_MASK_ISA_MMX;
11117 bisa |= OPTION_MASK_ISA_SSE2;
11118 }
11119
11120 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11121 {
11122 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11123 if (TARGET_ABI_X32)
11124 bisa |= OPTION_MASK_ABI_X32;
11125 else
11126 bisa |= OPTION_MASK_ABI_64;
11127 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11128 (enum fpmath_unit) 0,
11129 (enum prefer_vector_width) 0,
11130 false, add_abi_p);
11131 if (!opts)
11132 error ("%qE needs unknown isa option", fndecl);
11133 else
11134 {
11135 gcc_assert (opts != NULL);
11136 error ("%qE needs isa option %s", fndecl, opts);
11137 free (opts);
11138 }
11139 return expand_call (exp, target, ignore);
11140 }
11141
11142 switch (fcode)
11143 {
11144 case IX86_BUILTIN_MASKMOVQ:
11145 case IX86_BUILTIN_MASKMOVDQU:
11146 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11147 ? CODE_FOR_mmx_maskmovq
11148 : CODE_FOR_sse2_maskmovdqu);
11149 /* Note the arg order is different from the operand order. */
11150 arg1 = CALL_EXPR_ARG (exp, 0);
11151 arg2 = CALL_EXPR_ARG (exp, 1);
11152 arg0 = CALL_EXPR_ARG (exp, 2);
11153 op0 = expand_normal (arg0);
11154 op1 = expand_normal (arg1);
11155 op2 = expand_normal (arg2);
11156 mode0 = insn_data[icode].operand[0].mode;
11157 mode1 = insn_data[icode].operand[1].mode;
11158 mode2 = insn_data[icode].operand[2].mode;
11159
11160 op0 = ix86_zero_extend_to_Pmode (op0);
11161 op0 = gen_rtx_MEM (mode1, op0);
11162
11163 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11164 op0 = copy_to_mode_reg (mode0, op0);
11165 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11166 op1 = copy_to_mode_reg (mode1, op1);
11167 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11168 op2 = copy_to_mode_reg (mode2, op2);
11169 pat = GEN_FCN (icode) (op0, op1, op2);
11170 if (! pat)
11171 return 0;
11172 emit_insn (pat);
11173 return 0;
11174
11175 case IX86_BUILTIN_LDMXCSR:
11176 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11177 target = assign_386_stack_local (SImode, SLOT_TEMP);
11178 emit_move_insn (target, op0);
11179 emit_insn (gen_sse_ldmxcsr (target));
11180 return 0;
11181
11182 case IX86_BUILTIN_STMXCSR:
11183 target = assign_386_stack_local (SImode, SLOT_TEMP);
11184 emit_insn (gen_sse_stmxcsr (target));
11185 return copy_to_mode_reg (SImode, target);
11186
11187 case IX86_BUILTIN_CLFLUSH:
11188 arg0 = CALL_EXPR_ARG (exp, 0);
11189 op0 = expand_normal (arg0);
11190 icode = CODE_FOR_sse2_clflush;
11191 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11192 op0 = ix86_zero_extend_to_Pmode (op0);
11193
11194 emit_insn (gen_sse2_clflush (op0));
11195 return 0;
11196
11197 case IX86_BUILTIN_CLWB:
11198 arg0 = CALL_EXPR_ARG (exp, 0);
11199 op0 = expand_normal (arg0);
11200 icode = CODE_FOR_clwb;
11201 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11202 op0 = ix86_zero_extend_to_Pmode (op0);
11203
11204 emit_insn (gen_clwb (op0));
11205 return 0;
11206
11207 case IX86_BUILTIN_CLFLUSHOPT:
11208 arg0 = CALL_EXPR_ARG (exp, 0);
11209 op0 = expand_normal (arg0);
11210 icode = CODE_FOR_clflushopt;
11211 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11212 op0 = ix86_zero_extend_to_Pmode (op0);
11213
11214 emit_insn (gen_clflushopt (op0));
11215 return 0;
11216
11217 case IX86_BUILTIN_MONITOR:
11218 case IX86_BUILTIN_MONITORX:
11219 arg0 = CALL_EXPR_ARG (exp, 0);
11220 arg1 = CALL_EXPR_ARG (exp, 1);
11221 arg2 = CALL_EXPR_ARG (exp, 2);
11222 op0 = expand_normal (arg0);
11223 op1 = expand_normal (arg1);
11224 op2 = expand_normal (arg2);
11225 if (!REG_P (op0))
11226 op0 = ix86_zero_extend_to_Pmode (op0);
11227 if (!REG_P (op1))
11228 op1 = copy_to_mode_reg (SImode, op1);
11229 if (!REG_P (op2))
11230 op2 = copy_to_mode_reg (SImode, op2);
11231
11232 emit_insn (fcode == IX86_BUILTIN_MONITOR
11233 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11234 : gen_monitorx (Pmode, op0, op1, op2));
11235 return 0;
11236
11237 case IX86_BUILTIN_MWAIT:
11238 arg0 = CALL_EXPR_ARG (exp, 0);
11239 arg1 = CALL_EXPR_ARG (exp, 1);
11240 op0 = expand_normal (arg0);
11241 op1 = expand_normal (arg1);
11242 if (!REG_P (op0))
11243 op0 = copy_to_mode_reg (SImode, op0);
11244 if (!REG_P (op1))
11245 op1 = copy_to_mode_reg (SImode, op1);
11246 emit_insn (gen_sse3_mwait (op0, op1));
11247 return 0;
11248
11249 case IX86_BUILTIN_MWAITX:
11250 arg0 = CALL_EXPR_ARG (exp, 0);
11251 arg1 = CALL_EXPR_ARG (exp, 1);
11252 arg2 = CALL_EXPR_ARG (exp, 2);
11253 op0 = expand_normal (arg0);
11254 op1 = expand_normal (arg1);
11255 op2 = expand_normal (arg2);
11256 if (!REG_P (op0))
11257 op0 = copy_to_mode_reg (SImode, op0);
11258 if (!REG_P (op1))
11259 op1 = copy_to_mode_reg (SImode, op1);
11260 if (!REG_P (op2))
11261 op2 = copy_to_mode_reg (SImode, op2);
11262 emit_insn (gen_mwaitx (op0, op1, op2));
11263 return 0;
11264
11265 case IX86_BUILTIN_UMONITOR:
11266 arg0 = CALL_EXPR_ARG (exp, 0);
11267 op0 = expand_normal (arg0);
11268
11269 op0 = ix86_zero_extend_to_Pmode (op0);
11270 emit_insn (gen_umonitor (Pmode, op0));
11271 return 0;
11272
11273 case IX86_BUILTIN_UMWAIT:
11274 case IX86_BUILTIN_TPAUSE:
11275 arg0 = CALL_EXPR_ARG (exp, 0);
11276 arg1 = CALL_EXPR_ARG (exp, 1);
11277 op0 = expand_normal (arg0);
11278 op1 = expand_normal (arg1);
11279
11280 if (!REG_P (op0))
11281 op0 = copy_to_mode_reg (SImode, op0);
11282
11283 op1 = force_reg (DImode, op1);
11284
11285 if (TARGET_64BIT)
11286 {
11287 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11288 NULL, 1, OPTAB_DIRECT);
11289 switch (fcode)
11290 {
11291 case IX86_BUILTIN_UMWAIT:
11292 icode = CODE_FOR_umwait_rex64;
11293 break;
11294 case IX86_BUILTIN_TPAUSE:
11295 icode = CODE_FOR_tpause_rex64;
11296 break;
11297 default:
11298 gcc_unreachable ();
11299 }
11300
11301 op2 = gen_lowpart (SImode, op2);
11302 op1 = gen_lowpart (SImode, op1);
11303 pat = GEN_FCN (icode) (op0, op1, op2);
11304 }
11305 else
11306 {
11307 switch (fcode)
11308 {
11309 case IX86_BUILTIN_UMWAIT:
11310 icode = CODE_FOR_umwait;
11311 break;
11312 case IX86_BUILTIN_TPAUSE:
11313 icode = CODE_FOR_tpause;
11314 break;
11315 default:
11316 gcc_unreachable ();
11317 }
11318 pat = GEN_FCN (icode) (op0, op1);
11319 }
11320
11321 if (!pat)
11322 return 0;
11323
11324 emit_insn (pat);
11325
11326 if (target == 0
11327 || !register_operand (target, QImode))
11328 target = gen_reg_rtx (QImode);
11329
11330 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11331 const0_rtx);
11332 emit_insn (gen_rtx_SET (target, pat));
11333
11334 return target;
11335
11336 case IX86_BUILTIN_TESTUI:
11337 emit_insn (gen_testui ());
11338
11339 if (target == 0
11340 || !register_operand (target, QImode))
11341 target = gen_reg_rtx (QImode);
11342
11343 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11344 const0_rtx);
11345 emit_insn (gen_rtx_SET (target, pat));
11346
11347 return target;
11348
11349 case IX86_BUILTIN_CLZERO:
11350 arg0 = CALL_EXPR_ARG (exp, 0);
11351 op0 = expand_normal (arg0);
11352 if (!REG_P (op0))
11353 op0 = ix86_zero_extend_to_Pmode (op0);
11354 emit_insn (gen_clzero (Pmode, op0));
11355 return 0;
11356
11357 case IX86_BUILTIN_CLDEMOTE:
11358 arg0 = CALL_EXPR_ARG (exp, 0);
11359 op0 = expand_normal (arg0);
11360 icode = CODE_FOR_cldemote;
11361 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11362 op0 = ix86_zero_extend_to_Pmode (op0);
11363
11364 emit_insn (gen_cldemote (op0));
11365 return 0;
11366
11367 case IX86_BUILTIN_LOADIWKEY:
11368 {
11369 arg0 = CALL_EXPR_ARG (exp, 0);
11370 arg1 = CALL_EXPR_ARG (exp, 1);
11371 arg2 = CALL_EXPR_ARG (exp, 2);
11372 arg3 = CALL_EXPR_ARG (exp, 3);
11373
11374 op0 = expand_normal (arg0);
11375 op1 = expand_normal (arg1);
11376 op2 = expand_normal (arg2);
11377 op3 = expand_normal (arg3);
11378
11379 if (!REG_P (op0))
11380 op0 = copy_to_mode_reg (V2DImode, op0);
11381 if (!REG_P (op1))
11382 op1 = copy_to_mode_reg (V2DImode, op1);
11383 if (!REG_P (op2))
11384 op2 = copy_to_mode_reg (V2DImode, op2);
11385 if (!REG_P (op3))
11386 op3 = copy_to_mode_reg (SImode, op3);
11387
11388 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11389
11390 return 0;
11391 }
11392
11393 case IX86_BUILTIN_AESDEC128KLU8:
11394 icode = CODE_FOR_aesdec128klu8;
11395 goto aesdecenc_expand;
11396
11397 case IX86_BUILTIN_AESDEC256KLU8:
11398 icode = CODE_FOR_aesdec256klu8;
11399 goto aesdecenc_expand;
11400
11401 case IX86_BUILTIN_AESENC128KLU8:
11402 icode = CODE_FOR_aesenc128klu8;
11403 goto aesdecenc_expand;
11404
11405 case IX86_BUILTIN_AESENC256KLU8:
11406 icode = CODE_FOR_aesenc256klu8;
11407
11408 aesdecenc_expand:
11409
11410 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11411 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11412 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11413
11414 op0 = expand_normal (arg0);
11415 op1 = expand_normal (arg1);
11416 op2 = expand_normal (arg2);
11417
11418 if (!address_operand (op0, V2DImode))
11419 {
11420 op0 = convert_memory_address (Pmode, op0);
11421 op0 = copy_addr_to_reg (op0);
11422 }
11423 op0 = gen_rtx_MEM (V2DImode, op0);
11424
11425 if (!REG_P (op1))
11426 op1 = copy_to_mode_reg (V2DImode, op1);
11427
11428 if (!address_operand (op2, VOIDmode))
11429 {
11430 op2 = convert_memory_address (Pmode, op2);
11431 op2 = copy_addr_to_reg (op2);
11432 }
11433 op2 = gen_rtx_MEM (BLKmode, op2);
11434
11435 emit_insn (GEN_FCN (icode) (op1, op1, op2));
11436
11437 if (target == 0)
11438 target = gen_reg_rtx (QImode);
11439
11440 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11441 const0_rtx);
11442 emit_insn (gen_rtx_SET (target, pat));
11443
11444 emit_insn (gen_rtx_SET (op0, op1));
11445
11446 return target;
11447
11448 case IX86_BUILTIN_AESDECWIDE128KLU8:
11449 icode = CODE_FOR_aesdecwide128klu8;
11450 goto wideaesdecenc_expand;
11451
11452 case IX86_BUILTIN_AESDECWIDE256KLU8:
11453 icode = CODE_FOR_aesdecwide256klu8;
11454 goto wideaesdecenc_expand;
11455
11456 case IX86_BUILTIN_AESENCWIDE128KLU8:
11457 icode = CODE_FOR_aesencwide128klu8;
11458 goto wideaesdecenc_expand;
11459
11460 case IX86_BUILTIN_AESENCWIDE256KLU8:
11461 icode = CODE_FOR_aesencwide256klu8;
11462
11463 wideaesdecenc_expand:
11464
11465 rtx xmm_regs[8];
11466 rtx op;
11467
11468 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11469 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11470 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11471
11472 op0 = expand_normal (arg0);
11473 op1 = expand_normal (arg1);
11474 op2 = expand_normal (arg2);
11475
11476 if (!address_operand (op2, VOIDmode))
11477 {
11478 op2 = convert_memory_address (Pmode, op2);
11479 op2 = copy_addr_to_reg (op2);
11480 }
11481 op2 = gen_rtx_MEM (BLKmode, op2);
11482
11483 for (i = 0; i < 8; i++)
11484 {
11485 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11486
11487 op = gen_rtx_MEM (V2DImode,
11488 plus_constant (Pmode, op1, (i * 16)));
11489
11490 emit_move_insn (xmm_regs[i], op);
11491 }
11492
11493 emit_insn (GEN_FCN (icode) (op2));
11494
11495 if (target == 0)
11496 target = gen_reg_rtx (QImode);
11497
11498 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11499 const0_rtx);
11500 emit_insn (gen_rtx_SET (target, pat));
11501
11502 for (i = 0; i < 8; i++)
11503 {
11504 op = gen_rtx_MEM (V2DImode,
11505 plus_constant (Pmode, op0, (i * 16)));
11506 emit_move_insn (op, xmm_regs[i]);
11507 }
11508
11509 return target;
11510
11511 case IX86_BUILTIN_ENCODEKEY128U32:
11512 {
11513 rtx op, xmm_regs[7];
11514
11515 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11516 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11517 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11518
11519 op0 = expand_normal (arg0);
11520 op1 = expand_normal (arg1);
11521 op2 = expand_normal (arg2);
11522
11523 if (!REG_P (op0))
11524 op0 = copy_to_mode_reg (SImode, op0);
11525
11526 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11527 emit_move_insn (op, op1);
11528
11529 for (i = 0; i < 3; i++)
11530 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11531
11532 if (target == 0)
11533 target = gen_reg_rtx (SImode);
11534
11535 emit_insn (gen_encodekey128u32 (target, op0));
11536
11537 for (i = 0; i < 3; i++)
11538 {
11539 op = gen_rtx_MEM (V2DImode,
11540 plus_constant (Pmode, op2, (i * 16)));
11541 emit_move_insn (op, xmm_regs[i]);
11542 }
11543
11544 return target;
11545 }
11546 case IX86_BUILTIN_ENCODEKEY256U32:
11547 {
11548 rtx op, xmm_regs[7];
11549
11550 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11551 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
11552 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
11553 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
11554
11555 op0 = expand_normal (arg0);
11556 op1 = expand_normal (arg1);
11557 op2 = expand_normal (arg2);
11558 op3 = expand_normal (arg3);
11559
11560 if (!REG_P (op0))
11561 op0 = copy_to_mode_reg (SImode, op0);
11562
11563 /* Force to use xmm0, xmm1 for keylow, keyhi*/
11564 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11565 emit_move_insn (op, op1);
11566 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
11567 emit_move_insn (op, op2);
11568
11569 for (i = 0; i < 4; i++)
11570 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11571
11572 if (target == 0)
11573 target = gen_reg_rtx (SImode);
11574
11575 emit_insn (gen_encodekey256u32 (target, op0));
11576
11577 for (i = 0; i < 4; i++)
11578 {
11579 op = gen_rtx_MEM (V2DImode,
11580 plus_constant (Pmode, op3, (i * 16)));
11581 emit_move_insn (op, xmm_regs[i]);
11582 }
11583
11584 return target;
11585 }
11586
11587 case IX86_BUILTIN_VEC_INIT_V2SI:
11588 case IX86_BUILTIN_VEC_INIT_V4HI:
11589 case IX86_BUILTIN_VEC_INIT_V8QI:
11590 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11591
11592 case IX86_BUILTIN_VEC_EXT_V2DF:
11593 case IX86_BUILTIN_VEC_EXT_V2DI:
11594 case IX86_BUILTIN_VEC_EXT_V4SF:
11595 case IX86_BUILTIN_VEC_EXT_V4SI:
11596 case IX86_BUILTIN_VEC_EXT_V8HI:
11597 case IX86_BUILTIN_VEC_EXT_V2SI:
11598 case IX86_BUILTIN_VEC_EXT_V4HI:
11599 case IX86_BUILTIN_VEC_EXT_V16QI:
11600 return ix86_expand_vec_ext_builtin (exp, target);
11601
11602 case IX86_BUILTIN_VEC_SET_V2DI:
11603 case IX86_BUILTIN_VEC_SET_V4SF:
11604 case IX86_BUILTIN_VEC_SET_V4SI:
11605 case IX86_BUILTIN_VEC_SET_V8HI:
11606 case IX86_BUILTIN_VEC_SET_V4HI:
11607 case IX86_BUILTIN_VEC_SET_V16QI:
11608 return ix86_expand_vec_set_builtin (exp);
11609
11610 case IX86_BUILTIN_NANQ:
11611 case IX86_BUILTIN_NANSQ:
11612 return expand_call (exp, target, ignore);
11613
11614 case IX86_BUILTIN_RDPID:
11615
11616 op0 = gen_reg_rtx (word_mode);
11617
11618 if (TARGET_64BIT)
11619 {
11620 insn = gen_rdpid_rex64 (op0);
11621 op0 = convert_to_mode (SImode, op0, 1);
11622 }
11623 else
11624 insn = gen_rdpid (op0);
11625
11626 emit_insn (insn);
11627
11628 if (target == 0
11629 || !register_operand (target, SImode))
11630 target = gen_reg_rtx (SImode);
11631
11632 emit_move_insn (target, op0);
11633 return target;
11634
11635 case IX86_BUILTIN_2INTERSECTD512:
11636 case IX86_BUILTIN_2INTERSECTQ512:
11637 case IX86_BUILTIN_2INTERSECTD256:
11638 case IX86_BUILTIN_2INTERSECTQ256:
11639 case IX86_BUILTIN_2INTERSECTD128:
11640 case IX86_BUILTIN_2INTERSECTQ128:
11641 arg0 = CALL_EXPR_ARG (exp, 0);
11642 arg1 = CALL_EXPR_ARG (exp, 1);
11643 arg2 = CALL_EXPR_ARG (exp, 2);
11644 arg3 = CALL_EXPR_ARG (exp, 3);
11645 op0 = expand_normal (arg0);
11646 op1 = expand_normal (arg1);
11647 op2 = expand_normal (arg2);
11648 op3 = expand_normal (arg3);
11649
11650 if (!address_operand (op0, VOIDmode))
11651 {
11652 op0 = convert_memory_address (Pmode, op0);
11653 op0 = copy_addr_to_reg (op0);
11654 }
11655 if (!address_operand (op1, VOIDmode))
11656 {
11657 op1 = convert_memory_address (Pmode, op1);
11658 op1 = copy_addr_to_reg (op1);
11659 }
11660
11661 switch (fcode)
11662 {
11663 case IX86_BUILTIN_2INTERSECTD512:
11664 mode4 = P2HImode;
11665 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11666 break;
11667 case IX86_BUILTIN_2INTERSECTQ512:
11668 mode4 = P2QImode;
11669 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11670 break;
11671 case IX86_BUILTIN_2INTERSECTD256:
11672 mode4 = P2QImode;
11673 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11674 break;
11675 case IX86_BUILTIN_2INTERSECTQ256:
11676 mode4 = P2QImode;
11677 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11678 break;
11679 case IX86_BUILTIN_2INTERSECTD128:
11680 mode4 = P2QImode;
11681 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11682 break;
11683 case IX86_BUILTIN_2INTERSECTQ128:
11684 mode4 = P2QImode;
11685 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11686 break;
11687 default:
11688 gcc_unreachable ();
11689 }
11690
11691 mode2 = insn_data[icode].operand[1].mode;
11692 mode3 = insn_data[icode].operand[2].mode;
11693 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11694 op2 = copy_to_mode_reg (mode2, op2);
11695 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11696 op3 = copy_to_mode_reg (mode3, op3);
11697
11698 op4 = gen_reg_rtx (mode4);
11699 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11700 mode0 = mode4 == P2HImode ? HImode : QImode;
11701 emit_move_insn (gen_rtx_MEM (mode0, op0),
11702 gen_lowpart (mode0, op4));
11703 emit_move_insn (gen_rtx_MEM (mode0, op1),
11704 gen_highpart (mode0, op4));
11705
11706 return 0;
11707
11708 case IX86_BUILTIN_RDPMC:
11709 case IX86_BUILTIN_RDTSC:
11710 case IX86_BUILTIN_RDTSCP:
11711 case IX86_BUILTIN_XGETBV:
11712
11713 op0 = gen_reg_rtx (DImode);
11714 op1 = gen_reg_rtx (DImode);
11715
11716 if (fcode == IX86_BUILTIN_RDPMC)
11717 {
11718 arg0 = CALL_EXPR_ARG (exp, 0);
11719 op2 = expand_normal (arg0);
11720 if (!register_operand (op2, SImode))
11721 op2 = copy_to_mode_reg (SImode, op2);
11722
11723 insn = (TARGET_64BIT
11724 ? gen_rdpmc_rex64 (op0, op1, op2)
11725 : gen_rdpmc (op0, op2));
11726 emit_insn (insn);
11727 }
11728 else if (fcode == IX86_BUILTIN_XGETBV)
11729 {
11730 arg0 = CALL_EXPR_ARG (exp, 0);
11731 op2 = expand_normal (arg0);
11732 if (!register_operand (op2, SImode))
11733 op2 = copy_to_mode_reg (SImode, op2);
11734
11735 insn = (TARGET_64BIT
11736 ? gen_xgetbv_rex64 (op0, op1, op2)
11737 : gen_xgetbv (op0, op2));
11738 emit_insn (insn);
11739 }
11740 else if (fcode == IX86_BUILTIN_RDTSC)
11741 {
11742 insn = (TARGET_64BIT
11743 ? gen_rdtsc_rex64 (op0, op1)
11744 : gen_rdtsc (op0));
11745 emit_insn (insn);
11746 }
11747 else
11748 {
11749 op2 = gen_reg_rtx (SImode);
11750
11751 insn = (TARGET_64BIT
11752 ? gen_rdtscp_rex64 (op0, op1, op2)
11753 : gen_rdtscp (op0, op2));
11754 emit_insn (insn);
11755
11756 arg0 = CALL_EXPR_ARG (exp, 0);
11757 op4 = expand_normal (arg0);
11758 if (!address_operand (op4, VOIDmode))
11759 {
11760 op4 = convert_memory_address (Pmode, op4);
11761 op4 = copy_addr_to_reg (op4);
11762 }
11763 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11764 }
11765
11766 if (target == 0
11767 || !register_operand (target, DImode))
11768 target = gen_reg_rtx (DImode);
11769
11770 if (TARGET_64BIT)
11771 {
11772 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11773 op1, 1, OPTAB_DIRECT);
11774 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11775 op0, 1, OPTAB_DIRECT);
11776 }
11777
11778 emit_move_insn (target, op0);
11779 return target;
11780
11781 case IX86_BUILTIN_ENQCMD:
11782 case IX86_BUILTIN_ENQCMDS:
11783 case IX86_BUILTIN_MOVDIR64B:
11784
11785 arg0 = CALL_EXPR_ARG (exp, 0);
11786 arg1 = CALL_EXPR_ARG (exp, 1);
11787 op0 = expand_normal (arg0);
11788 op1 = expand_normal (arg1);
11789
11790 op0 = ix86_zero_extend_to_Pmode (op0);
11791 if (!address_operand (op1, VOIDmode))
11792 {
11793 op1 = convert_memory_address (Pmode, op1);
11794 op1 = copy_addr_to_reg (op1);
11795 }
11796 op1 = gen_rtx_MEM (XImode, op1);
11797
11798 if (fcode == IX86_BUILTIN_MOVDIR64B)
11799 {
11800 emit_insn (gen_movdir64b (Pmode, op0, op1));
11801 return 0;
11802 }
11803 else
11804 {
11805 if (target == 0
11806 || !register_operand (target, SImode))
11807 target = gen_reg_rtx (SImode);
11808
11809 emit_move_insn (target, const0_rtx);
11810 target = gen_rtx_SUBREG (QImode, target, 0);
11811
11812 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11813 ? UNSPECV_ENQCMD
11814 : UNSPECV_ENQCMDS);
11815 icode = code_for_enqcmd (unspecv, Pmode);
11816 emit_insn (GEN_FCN (icode) (op0, op1));
11817
11818 emit_insn
11819 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11820 gen_rtx_fmt_ee (EQ, QImode,
11821 gen_rtx_REG (CCZmode, FLAGS_REG),
11822 const0_rtx)));
11823 return SUBREG_REG (target);
11824 }
11825
11826 case IX86_BUILTIN_FXSAVE:
11827 case IX86_BUILTIN_FXRSTOR:
11828 case IX86_BUILTIN_FXSAVE64:
11829 case IX86_BUILTIN_FXRSTOR64:
11830 case IX86_BUILTIN_FNSTENV:
11831 case IX86_BUILTIN_FLDENV:
11832 mode0 = BLKmode;
11833 switch (fcode)
11834 {
11835 case IX86_BUILTIN_FXSAVE:
11836 icode = CODE_FOR_fxsave;
11837 break;
11838 case IX86_BUILTIN_FXRSTOR:
11839 icode = CODE_FOR_fxrstor;
11840 break;
11841 case IX86_BUILTIN_FXSAVE64:
11842 icode = CODE_FOR_fxsave64;
11843 break;
11844 case IX86_BUILTIN_FXRSTOR64:
11845 icode = CODE_FOR_fxrstor64;
11846 break;
11847 case IX86_BUILTIN_FNSTENV:
11848 icode = CODE_FOR_fnstenv;
11849 break;
11850 case IX86_BUILTIN_FLDENV:
11851 icode = CODE_FOR_fldenv;
11852 break;
11853 default:
11854 gcc_unreachable ();
11855 }
11856
11857 arg0 = CALL_EXPR_ARG (exp, 0);
11858 op0 = expand_normal (arg0);
11859
11860 if (!address_operand (op0, VOIDmode))
11861 {
11862 op0 = convert_memory_address (Pmode, op0);
11863 op0 = copy_addr_to_reg (op0);
11864 }
11865 op0 = gen_rtx_MEM (mode0, op0);
11866
11867 pat = GEN_FCN (icode) (op0);
11868 if (pat)
11869 emit_insn (pat);
11870 return 0;
11871
11872 case IX86_BUILTIN_XSETBV:
11873 arg0 = CALL_EXPR_ARG (exp, 0);
11874 arg1 = CALL_EXPR_ARG (exp, 1);
11875 op0 = expand_normal (arg0);
11876 op1 = expand_normal (arg1);
11877
11878 if (!REG_P (op0))
11879 op0 = copy_to_mode_reg (SImode, op0);
11880
11881 op1 = force_reg (DImode, op1);
11882
11883 if (TARGET_64BIT)
11884 {
11885 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11886 NULL, 1, OPTAB_DIRECT);
11887
11888 icode = CODE_FOR_xsetbv_rex64;
11889
11890 op2 = gen_lowpart (SImode, op2);
11891 op1 = gen_lowpart (SImode, op1);
11892 pat = GEN_FCN (icode) (op0, op1, op2);
11893 }
11894 else
11895 {
11896 icode = CODE_FOR_xsetbv;
11897
11898 pat = GEN_FCN (icode) (op0, op1);
11899 }
11900 if (pat)
11901 emit_insn (pat);
11902 return 0;
11903
11904 case IX86_BUILTIN_XSAVE:
11905 case IX86_BUILTIN_XRSTOR:
11906 case IX86_BUILTIN_XSAVE64:
11907 case IX86_BUILTIN_XRSTOR64:
11908 case IX86_BUILTIN_XSAVEOPT:
11909 case IX86_BUILTIN_XSAVEOPT64:
11910 case IX86_BUILTIN_XSAVES:
11911 case IX86_BUILTIN_XRSTORS:
11912 case IX86_BUILTIN_XSAVES64:
11913 case IX86_BUILTIN_XRSTORS64:
11914 case IX86_BUILTIN_XSAVEC:
11915 case IX86_BUILTIN_XSAVEC64:
11916 arg0 = CALL_EXPR_ARG (exp, 0);
11917 arg1 = CALL_EXPR_ARG (exp, 1);
11918 op0 = expand_normal (arg0);
11919 op1 = expand_normal (arg1);
11920
11921 if (!address_operand (op0, VOIDmode))
11922 {
11923 op0 = convert_memory_address (Pmode, op0);
11924 op0 = copy_addr_to_reg (op0);
11925 }
11926 op0 = gen_rtx_MEM (BLKmode, op0);
11927
11928 op1 = force_reg (DImode, op1);
11929
11930 if (TARGET_64BIT)
11931 {
11932 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11933 NULL, 1, OPTAB_DIRECT);
11934 switch (fcode)
11935 {
11936 case IX86_BUILTIN_XSAVE:
11937 icode = CODE_FOR_xsave_rex64;
11938 break;
11939 case IX86_BUILTIN_XRSTOR:
11940 icode = CODE_FOR_xrstor_rex64;
11941 break;
11942 case IX86_BUILTIN_XSAVE64:
11943 icode = CODE_FOR_xsave64;
11944 break;
11945 case IX86_BUILTIN_XRSTOR64:
11946 icode = CODE_FOR_xrstor64;
11947 break;
11948 case IX86_BUILTIN_XSAVEOPT:
11949 icode = CODE_FOR_xsaveopt_rex64;
11950 break;
11951 case IX86_BUILTIN_XSAVEOPT64:
11952 icode = CODE_FOR_xsaveopt64;
11953 break;
11954 case IX86_BUILTIN_XSAVES:
11955 icode = CODE_FOR_xsaves_rex64;
11956 break;
11957 case IX86_BUILTIN_XRSTORS:
11958 icode = CODE_FOR_xrstors_rex64;
11959 break;
11960 case IX86_BUILTIN_XSAVES64:
11961 icode = CODE_FOR_xsaves64;
11962 break;
11963 case IX86_BUILTIN_XRSTORS64:
11964 icode = CODE_FOR_xrstors64;
11965 break;
11966 case IX86_BUILTIN_XSAVEC:
11967 icode = CODE_FOR_xsavec_rex64;
11968 break;
11969 case IX86_BUILTIN_XSAVEC64:
11970 icode = CODE_FOR_xsavec64;
11971 break;
11972 default:
11973 gcc_unreachable ();
11974 }
11975
11976 op2 = gen_lowpart (SImode, op2);
11977 op1 = gen_lowpart (SImode, op1);
11978 pat = GEN_FCN (icode) (op0, op1, op2);
11979 }
11980 else
11981 {
11982 switch (fcode)
11983 {
11984 case IX86_BUILTIN_XSAVE:
11985 icode = CODE_FOR_xsave;
11986 break;
11987 case IX86_BUILTIN_XRSTOR:
11988 icode = CODE_FOR_xrstor;
11989 break;
11990 case IX86_BUILTIN_XSAVEOPT:
11991 icode = CODE_FOR_xsaveopt;
11992 break;
11993 case IX86_BUILTIN_XSAVES:
11994 icode = CODE_FOR_xsaves;
11995 break;
11996 case IX86_BUILTIN_XRSTORS:
11997 icode = CODE_FOR_xrstors;
11998 break;
11999 case IX86_BUILTIN_XSAVEC:
12000 icode = CODE_FOR_xsavec;
12001 break;
12002 default:
12003 gcc_unreachable ();
12004 }
12005 pat = GEN_FCN (icode) (op0, op1);
12006 }
12007
12008 if (pat)
12009 emit_insn (pat);
12010 return 0;
12011
12012 case IX86_BUILTIN_LLWPCB:
12013 arg0 = CALL_EXPR_ARG (exp, 0);
12014 op0 = expand_normal (arg0);
12015
12016 if (!register_operand (op0, Pmode))
12017 op0 = ix86_zero_extend_to_Pmode (op0);
12018 emit_insn (gen_lwp_llwpcb (Pmode, op0));
12019 return 0;
12020
12021 case IX86_BUILTIN_SLWPCB:
12022 if (!target
12023 || !register_operand (target, Pmode))
12024 target = gen_reg_rtx (Pmode);
12025 emit_insn (gen_lwp_slwpcb (Pmode, target));
12026 return target;
12027
12028 case IX86_BUILTIN_LWPVAL32:
12029 case IX86_BUILTIN_LWPVAL64:
12030 case IX86_BUILTIN_LWPINS32:
12031 case IX86_BUILTIN_LWPINS64:
12032 mode = ((fcode == IX86_BUILTIN_LWPVAL32
12033 || fcode == IX86_BUILTIN_LWPINS32)
12034 ? SImode : DImode);
12035
12036 if (fcode == IX86_BUILTIN_LWPVAL32
12037 || fcode == IX86_BUILTIN_LWPVAL64)
12038 icode = code_for_lwp_lwpval (mode);
12039 else
12040 icode = code_for_lwp_lwpins (mode);
12041
12042 arg0 = CALL_EXPR_ARG (exp, 0);
12043 arg1 = CALL_EXPR_ARG (exp, 1);
12044 arg2 = CALL_EXPR_ARG (exp, 2);
12045 op0 = expand_normal (arg0);
12046 op1 = expand_normal (arg1);
12047 op2 = expand_normal (arg2);
12048 mode0 = insn_data[icode].operand[0].mode;
12049
12050 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12051 op0 = copy_to_mode_reg (mode0, op0);
12052 if (!insn_data[icode].operand[1].predicate (op1, SImode))
12053 op1 = copy_to_mode_reg (SImode, op1);
12054
12055 if (!CONST_INT_P (op2))
12056 {
12057 error ("the last argument must be a 32-bit immediate");
12058 return const0_rtx;
12059 }
12060
12061 emit_insn (GEN_FCN (icode) (op0, op1, op2));
12062
12063 if (fcode == IX86_BUILTIN_LWPINS32
12064 || fcode == IX86_BUILTIN_LWPINS64)
12065 {
12066 if (target == 0
12067 || !nonimmediate_operand (target, QImode))
12068 target = gen_reg_rtx (QImode);
12069
12070 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12071 const0_rtx);
12072 emit_insn (gen_rtx_SET (target, pat));
12073
12074 return target;
12075 }
12076 else
12077 return 0;
12078
12079 case IX86_BUILTIN_BEXTRI32:
12080 case IX86_BUILTIN_BEXTRI64:
12081 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12082
12083 arg0 = CALL_EXPR_ARG (exp, 0);
12084 arg1 = CALL_EXPR_ARG (exp, 1);
12085 op0 = expand_normal (arg0);
12086 op1 = expand_normal (arg1);
12087
12088 if (!CONST_INT_P (op1))
12089 {
12090 error ("last argument must be an immediate");
12091 return const0_rtx;
12092 }
12093 else
12094 {
12095 unsigned char lsb_index = UINTVAL (op1);
12096 unsigned char length = UINTVAL (op1) >> 8;
12097
12098 unsigned char bitsize = GET_MODE_BITSIZE (mode);
12099
12100 icode = code_for_tbm_bextri (mode);
12101
12102 mode1 = insn_data[icode].operand[1].mode;
12103 if (!insn_data[icode].operand[1].predicate (op0, mode1))
12104 op0 = copy_to_mode_reg (mode1, op0);
12105
12106 mode0 = insn_data[icode].operand[0].mode;
12107 if (target == 0
12108 || !register_operand (target, mode0))
12109 target = gen_reg_rtx (mode0);
12110
12111 if (length == 0 || lsb_index >= bitsize)
12112 {
12113 emit_move_insn (target, const0_rtx);
12114 return target;
12115 }
12116
12117 if (length + lsb_index > bitsize)
12118 length = bitsize - lsb_index;
12119
12120 op1 = GEN_INT (length);
12121 op2 = GEN_INT (lsb_index);
12122
12123 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12124 return target;
12125 }
12126
12127 case IX86_BUILTIN_RDRAND16_STEP:
12128 mode = HImode;
12129 goto rdrand_step;
12130
12131 case IX86_BUILTIN_RDRAND32_STEP:
12132 mode = SImode;
12133 goto rdrand_step;
12134
12135 case IX86_BUILTIN_RDRAND64_STEP:
12136 mode = DImode;
12137
12138 rdrand_step:
12139 arg0 = CALL_EXPR_ARG (exp, 0);
12140 op1 = expand_normal (arg0);
12141 if (!address_operand (op1, VOIDmode))
12142 {
12143 op1 = convert_memory_address (Pmode, op1);
12144 op1 = copy_addr_to_reg (op1);
12145 }
12146
12147 op0 = gen_reg_rtx (mode);
12148 emit_insn (gen_rdrand (mode, op0));
12149
12150 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12151
12152 op1 = force_reg (SImode, const1_rtx);
12153
12154 /* Emit SImode conditional move. */
12155 if (mode == HImode)
12156 {
12157 if (TARGET_ZERO_EXTEND_WITH_AND
12158 && optimize_function_for_speed_p (cfun))
12159 {
12160 op2 = force_reg (SImode, const0_rtx);
12161
12162 emit_insn (gen_movstricthi
12163 (gen_lowpart (HImode, op2), op0));
12164 }
12165 else
12166 {
12167 op2 = gen_reg_rtx (SImode);
12168
12169 emit_insn (gen_zero_extendhisi2 (op2, op0));
12170 }
12171 }
12172 else if (mode == SImode)
12173 op2 = op0;
12174 else
12175 op2 = gen_rtx_SUBREG (SImode, op0, 0);
12176
12177 if (target == 0
12178 || !register_operand (target, SImode))
12179 target = gen_reg_rtx (SImode);
12180
12181 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12182 const0_rtx);
12183 emit_insn (gen_rtx_SET (target,
12184 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12185 return target;
12186
12187 case IX86_BUILTIN_RDSEED16_STEP:
12188 mode = HImode;
12189 goto rdseed_step;
12190
12191 case IX86_BUILTIN_RDSEED32_STEP:
12192 mode = SImode;
12193 goto rdseed_step;
12194
12195 case IX86_BUILTIN_RDSEED64_STEP:
12196 mode = DImode;
12197
12198 rdseed_step:
12199 arg0 = CALL_EXPR_ARG (exp, 0);
12200 op1 = expand_normal (arg0);
12201 if (!address_operand (op1, VOIDmode))
12202 {
12203 op1 = convert_memory_address (Pmode, op1);
12204 op1 = copy_addr_to_reg (op1);
12205 }
12206
12207 op0 = gen_reg_rtx (mode);
12208 emit_insn (gen_rdseed (mode, op0));
12209
12210 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12211
12212 op2 = gen_reg_rtx (QImode);
12213
12214 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12215 const0_rtx);
12216 emit_insn (gen_rtx_SET (op2, pat));
12217
12218 if (target == 0
12219 || !register_operand (target, SImode))
12220 target = gen_reg_rtx (SImode);
12221
12222 emit_insn (gen_zero_extendqisi2 (target, op2));
12223 return target;
12224
12225 case IX86_BUILTIN_SBB32:
12226 icode = CODE_FOR_subborrowsi;
12227 icode2 = CODE_FOR_subborrowsi_0;
12228 mode0 = SImode;
12229 mode1 = DImode;
12230 mode2 = CCmode;
12231 goto handlecarry;
12232
12233 case IX86_BUILTIN_SBB64:
12234 icode = CODE_FOR_subborrowdi;
12235 icode2 = CODE_FOR_subborrowdi_0;
12236 mode0 = DImode;
12237 mode1 = TImode;
12238 mode2 = CCmode;
12239 goto handlecarry;
12240
12241 case IX86_BUILTIN_ADDCARRYX32:
12242 icode = CODE_FOR_addcarrysi;
12243 icode2 = CODE_FOR_addcarrysi_0;
12244 mode0 = SImode;
12245 mode1 = DImode;
12246 mode2 = CCCmode;
12247 goto handlecarry;
12248
12249 case IX86_BUILTIN_ADDCARRYX64:
12250 icode = CODE_FOR_addcarrydi;
12251 icode2 = CODE_FOR_addcarrydi_0;
12252 mode0 = DImode;
12253 mode1 = TImode;
12254 mode2 = CCCmode;
12255
12256 handlecarry:
12257 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
12258 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
12259 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
12260 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
12261
12262 op1 = expand_normal (arg0);
12263 if (!integer_zerop (arg0))
12264 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12265
12266 op2 = expand_normal (arg1);
12267 if (!register_operand (op2, mode0))
12268 op2 = copy_to_mode_reg (mode0, op2);
12269
12270 op3 = expand_normal (arg2);
12271 if (!register_operand (op3, mode0))
12272 op3 = copy_to_mode_reg (mode0, op3);
12273
12274 op4 = expand_normal (arg3);
12275 if (!address_operand (op4, VOIDmode))
12276 {
12277 op4 = convert_memory_address (Pmode, op4);
12278 op4 = copy_addr_to_reg (op4);
12279 }
12280
12281 op0 = gen_reg_rtx (mode0);
12282 if (integer_zerop (arg0))
12283 {
12284 /* If arg0 is 0, optimize right away into add or sub
12285 instruction that sets CCCmode flags. */
12286 op1 = gen_rtx_REG (mode2, FLAGS_REG);
12287 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12288 }
12289 else
12290 {
12291 /* Generate CF from input operand. */
12292 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
12293
12294 /* Generate instruction that consumes CF. */
12295 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12296 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12297 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12298 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12299 }
12300
12301 /* Return current CF value. */
12302 if (target == 0)
12303 target = gen_reg_rtx (QImode);
12304
12305 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12306 emit_insn (gen_rtx_SET (target, pat));
12307
12308 /* Store the result. */
12309 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12310
12311 return target;
12312
12313 case IX86_BUILTIN_READ_FLAGS:
12314 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
12315
12316 if (optimize
12317 || target == NULL_RTX
12318 || !nonimmediate_operand (target, word_mode)
12319 || GET_MODE (target) != word_mode)
12320 target = gen_reg_rtx (word_mode);
12321
12322 emit_insn (gen_pop (target));
12323 return target;
12324
12325 case IX86_BUILTIN_WRITE_FLAGS:
12326
12327 arg0 = CALL_EXPR_ARG (exp, 0);
12328 op0 = expand_normal (arg0);
12329 if (!general_no_elim_operand (op0, word_mode))
12330 op0 = copy_to_mode_reg (word_mode, op0);
12331
12332 emit_insn (gen_push (op0));
12333 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12334 return 0;
12335
12336 case IX86_BUILTIN_KTESTC8:
12337 icode = CODE_FOR_ktestqi;
12338 mode3 = CCCmode;
12339 goto kortest;
12340
12341 case IX86_BUILTIN_KTESTZ8:
12342 icode = CODE_FOR_ktestqi;
12343 mode3 = CCZmode;
12344 goto kortest;
12345
12346 case IX86_BUILTIN_KTESTC16:
12347 icode = CODE_FOR_ktesthi;
12348 mode3 = CCCmode;
12349 goto kortest;
12350
12351 case IX86_BUILTIN_KTESTZ16:
12352 icode = CODE_FOR_ktesthi;
12353 mode3 = CCZmode;
12354 goto kortest;
12355
12356 case IX86_BUILTIN_KTESTC32:
12357 icode = CODE_FOR_ktestsi;
12358 mode3 = CCCmode;
12359 goto kortest;
12360
12361 case IX86_BUILTIN_KTESTZ32:
12362 icode = CODE_FOR_ktestsi;
12363 mode3 = CCZmode;
12364 goto kortest;
12365
12366 case IX86_BUILTIN_KTESTC64:
12367 icode = CODE_FOR_ktestdi;
12368 mode3 = CCCmode;
12369 goto kortest;
12370
12371 case IX86_BUILTIN_KTESTZ64:
12372 icode = CODE_FOR_ktestdi;
12373 mode3 = CCZmode;
12374 goto kortest;
12375
12376 case IX86_BUILTIN_KORTESTC8:
12377 icode = CODE_FOR_kortestqi;
12378 mode3 = CCCmode;
12379 goto kortest;
12380
12381 case IX86_BUILTIN_KORTESTZ8:
12382 icode = CODE_FOR_kortestqi;
12383 mode3 = CCZmode;
12384 goto kortest;
12385
12386 case IX86_BUILTIN_KORTESTC16:
12387 icode = CODE_FOR_kortesthi;
12388 mode3 = CCCmode;
12389 goto kortest;
12390
12391 case IX86_BUILTIN_KORTESTZ16:
12392 icode = CODE_FOR_kortesthi;
12393 mode3 = CCZmode;
12394 goto kortest;
12395
12396 case IX86_BUILTIN_KORTESTC32:
12397 icode = CODE_FOR_kortestsi;
12398 mode3 = CCCmode;
12399 goto kortest;
12400
12401 case IX86_BUILTIN_KORTESTZ32:
12402 icode = CODE_FOR_kortestsi;
12403 mode3 = CCZmode;
12404 goto kortest;
12405
12406 case IX86_BUILTIN_KORTESTC64:
12407 icode = CODE_FOR_kortestdi;
12408 mode3 = CCCmode;
12409 goto kortest;
12410
12411 case IX86_BUILTIN_KORTESTZ64:
12412 icode = CODE_FOR_kortestdi;
12413 mode3 = CCZmode;
12414
12415 kortest:
12416 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12417 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12418 op0 = expand_normal (arg0);
12419 op1 = expand_normal (arg1);
12420
12421 mode0 = insn_data[icode].operand[0].mode;
12422 mode1 = insn_data[icode].operand[1].mode;
12423
12424 if (GET_MODE (op0) != VOIDmode)
12425 op0 = force_reg (GET_MODE (op0), op0);
12426
12427 op0 = gen_lowpart (mode0, op0);
12428
12429 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12430 op0 = copy_to_mode_reg (mode0, op0);
12431
12432 if (GET_MODE (op1) != VOIDmode)
12433 op1 = force_reg (GET_MODE (op1), op1);
12434
12435 op1 = gen_lowpart (mode1, op1);
12436
12437 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12438 op1 = copy_to_mode_reg (mode1, op1);
12439
12440 target = gen_reg_rtx (QImode);
12441
12442 /* Emit kortest. */
12443 emit_insn (GEN_FCN (icode) (op0, op1));
12444 /* And use setcc to return result from flags. */
12445 ix86_expand_setcc (target, EQ,
12446 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12447 return target;
12448
12449 case IX86_BUILTIN_GATHERSIV2DF:
12450 icode = CODE_FOR_avx2_gathersiv2df;
12451 goto gather_gen;
12452 case IX86_BUILTIN_GATHERSIV4DF:
12453 icode = CODE_FOR_avx2_gathersiv4df;
12454 goto gather_gen;
12455 case IX86_BUILTIN_GATHERDIV2DF:
12456 icode = CODE_FOR_avx2_gatherdiv2df;
12457 goto gather_gen;
12458 case IX86_BUILTIN_GATHERDIV4DF:
12459 icode = CODE_FOR_avx2_gatherdiv4df;
12460 goto gather_gen;
12461 case IX86_BUILTIN_GATHERSIV4SF:
12462 icode = CODE_FOR_avx2_gathersiv4sf;
12463 goto gather_gen;
12464 case IX86_BUILTIN_GATHERSIV8SF:
12465 icode = CODE_FOR_avx2_gathersiv8sf;
12466 goto gather_gen;
12467 case IX86_BUILTIN_GATHERDIV4SF:
12468 icode = CODE_FOR_avx2_gatherdiv4sf;
12469 goto gather_gen;
12470 case IX86_BUILTIN_GATHERDIV8SF:
12471 icode = CODE_FOR_avx2_gatherdiv8sf;
12472 goto gather_gen;
12473 case IX86_BUILTIN_GATHERSIV2DI:
12474 icode = CODE_FOR_avx2_gathersiv2di;
12475 goto gather_gen;
12476 case IX86_BUILTIN_GATHERSIV4DI:
12477 icode = CODE_FOR_avx2_gathersiv4di;
12478 goto gather_gen;
12479 case IX86_BUILTIN_GATHERDIV2DI:
12480 icode = CODE_FOR_avx2_gatherdiv2di;
12481 goto gather_gen;
12482 case IX86_BUILTIN_GATHERDIV4DI:
12483 icode = CODE_FOR_avx2_gatherdiv4di;
12484 goto gather_gen;
12485 case IX86_BUILTIN_GATHERSIV4SI:
12486 icode = CODE_FOR_avx2_gathersiv4si;
12487 goto gather_gen;
12488 case IX86_BUILTIN_GATHERSIV8SI:
12489 icode = CODE_FOR_avx2_gathersiv8si;
12490 goto gather_gen;
12491 case IX86_BUILTIN_GATHERDIV4SI:
12492 icode = CODE_FOR_avx2_gatherdiv4si;
12493 goto gather_gen;
12494 case IX86_BUILTIN_GATHERDIV8SI:
12495 icode = CODE_FOR_avx2_gatherdiv8si;
12496 goto gather_gen;
12497 case IX86_BUILTIN_GATHERALTSIV4DF:
12498 icode = CODE_FOR_avx2_gathersiv4df;
12499 goto gather_gen;
12500 case IX86_BUILTIN_GATHERALTDIV8SF:
12501 icode = CODE_FOR_avx2_gatherdiv8sf;
12502 goto gather_gen;
12503 case IX86_BUILTIN_GATHERALTSIV4DI:
12504 icode = CODE_FOR_avx2_gathersiv4di;
12505 goto gather_gen;
12506 case IX86_BUILTIN_GATHERALTDIV8SI:
12507 icode = CODE_FOR_avx2_gatherdiv8si;
12508 goto gather_gen;
12509 case IX86_BUILTIN_GATHER3SIV16SF:
12510 icode = CODE_FOR_avx512f_gathersiv16sf;
12511 goto gather_gen;
12512 case IX86_BUILTIN_GATHER3SIV8DF:
12513 icode = CODE_FOR_avx512f_gathersiv8df;
12514 goto gather_gen;
12515 case IX86_BUILTIN_GATHER3DIV16SF:
12516 icode = CODE_FOR_avx512f_gatherdiv16sf;
12517 goto gather_gen;
12518 case IX86_BUILTIN_GATHER3DIV8DF:
12519 icode = CODE_FOR_avx512f_gatherdiv8df;
12520 goto gather_gen;
12521 case IX86_BUILTIN_GATHER3SIV16SI:
12522 icode = CODE_FOR_avx512f_gathersiv16si;
12523 goto gather_gen;
12524 case IX86_BUILTIN_GATHER3SIV8DI:
12525 icode = CODE_FOR_avx512f_gathersiv8di;
12526 goto gather_gen;
12527 case IX86_BUILTIN_GATHER3DIV16SI:
12528 icode = CODE_FOR_avx512f_gatherdiv16si;
12529 goto gather_gen;
12530 case IX86_BUILTIN_GATHER3DIV8DI:
12531 icode = CODE_FOR_avx512f_gatherdiv8di;
12532 goto gather_gen;
12533 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12534 icode = CODE_FOR_avx512f_gathersiv8df;
12535 goto gather_gen;
12536 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12537 icode = CODE_FOR_avx512f_gatherdiv16sf;
12538 goto gather_gen;
12539 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12540 icode = CODE_FOR_avx512f_gathersiv8di;
12541 goto gather_gen;
12542 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12543 icode = CODE_FOR_avx512f_gatherdiv16si;
12544 goto gather_gen;
12545 case IX86_BUILTIN_GATHER3SIV2DF:
12546 icode = CODE_FOR_avx512vl_gathersiv2df;
12547 goto gather_gen;
12548 case IX86_BUILTIN_GATHER3SIV4DF:
12549 icode = CODE_FOR_avx512vl_gathersiv4df;
12550 goto gather_gen;
12551 case IX86_BUILTIN_GATHER3DIV2DF:
12552 icode = CODE_FOR_avx512vl_gatherdiv2df;
12553 goto gather_gen;
12554 case IX86_BUILTIN_GATHER3DIV4DF:
12555 icode = CODE_FOR_avx512vl_gatherdiv4df;
12556 goto gather_gen;
12557 case IX86_BUILTIN_GATHER3SIV4SF:
12558 icode = CODE_FOR_avx512vl_gathersiv4sf;
12559 goto gather_gen;
12560 case IX86_BUILTIN_GATHER3SIV8SF:
12561 icode = CODE_FOR_avx512vl_gathersiv8sf;
12562 goto gather_gen;
12563 case IX86_BUILTIN_GATHER3DIV4SF:
12564 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12565 goto gather_gen;
12566 case IX86_BUILTIN_GATHER3DIV8SF:
12567 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12568 goto gather_gen;
12569 case IX86_BUILTIN_GATHER3SIV2DI:
12570 icode = CODE_FOR_avx512vl_gathersiv2di;
12571 goto gather_gen;
12572 case IX86_BUILTIN_GATHER3SIV4DI:
12573 icode = CODE_FOR_avx512vl_gathersiv4di;
12574 goto gather_gen;
12575 case IX86_BUILTIN_GATHER3DIV2DI:
12576 icode = CODE_FOR_avx512vl_gatherdiv2di;
12577 goto gather_gen;
12578 case IX86_BUILTIN_GATHER3DIV4DI:
12579 icode = CODE_FOR_avx512vl_gatherdiv4di;
12580 goto gather_gen;
12581 case IX86_BUILTIN_GATHER3SIV4SI:
12582 icode = CODE_FOR_avx512vl_gathersiv4si;
12583 goto gather_gen;
12584 case IX86_BUILTIN_GATHER3SIV8SI:
12585 icode = CODE_FOR_avx512vl_gathersiv8si;
12586 goto gather_gen;
12587 case IX86_BUILTIN_GATHER3DIV4SI:
12588 icode = CODE_FOR_avx512vl_gatherdiv4si;
12589 goto gather_gen;
12590 case IX86_BUILTIN_GATHER3DIV8SI:
12591 icode = CODE_FOR_avx512vl_gatherdiv8si;
12592 goto gather_gen;
12593 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12594 icode = CODE_FOR_avx512vl_gathersiv4df;
12595 goto gather_gen;
12596 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12597 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12598 goto gather_gen;
12599 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12600 icode = CODE_FOR_avx512vl_gathersiv4di;
12601 goto gather_gen;
12602 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12603 icode = CODE_FOR_avx512vl_gatherdiv8si;
12604 goto gather_gen;
12605 case IX86_BUILTIN_SCATTERSIV16SF:
12606 icode = CODE_FOR_avx512f_scattersiv16sf;
12607 goto scatter_gen;
12608 case IX86_BUILTIN_SCATTERSIV8DF:
12609 icode = CODE_FOR_avx512f_scattersiv8df;
12610 goto scatter_gen;
12611 case IX86_BUILTIN_SCATTERDIV16SF:
12612 icode = CODE_FOR_avx512f_scatterdiv16sf;
12613 goto scatter_gen;
12614 case IX86_BUILTIN_SCATTERDIV8DF:
12615 icode = CODE_FOR_avx512f_scatterdiv8df;
12616 goto scatter_gen;
12617 case IX86_BUILTIN_SCATTERSIV16SI:
12618 icode = CODE_FOR_avx512f_scattersiv16si;
12619 goto scatter_gen;
12620 case IX86_BUILTIN_SCATTERSIV8DI:
12621 icode = CODE_FOR_avx512f_scattersiv8di;
12622 goto scatter_gen;
12623 case IX86_BUILTIN_SCATTERDIV16SI:
12624 icode = CODE_FOR_avx512f_scatterdiv16si;
12625 goto scatter_gen;
12626 case IX86_BUILTIN_SCATTERDIV8DI:
12627 icode = CODE_FOR_avx512f_scatterdiv8di;
12628 goto scatter_gen;
12629 case IX86_BUILTIN_SCATTERSIV8SF:
12630 icode = CODE_FOR_avx512vl_scattersiv8sf;
12631 goto scatter_gen;
12632 case IX86_BUILTIN_SCATTERSIV4SF:
12633 icode = CODE_FOR_avx512vl_scattersiv4sf;
12634 goto scatter_gen;
12635 case IX86_BUILTIN_SCATTERSIV4DF:
12636 icode = CODE_FOR_avx512vl_scattersiv4df;
12637 goto scatter_gen;
12638 case IX86_BUILTIN_SCATTERSIV2DF:
12639 icode = CODE_FOR_avx512vl_scattersiv2df;
12640 goto scatter_gen;
12641 case IX86_BUILTIN_SCATTERDIV8SF:
12642 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12643 goto scatter_gen;
12644 case IX86_BUILTIN_SCATTERDIV4SF:
12645 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12646 goto scatter_gen;
12647 case IX86_BUILTIN_SCATTERDIV4DF:
12648 icode = CODE_FOR_avx512vl_scatterdiv4df;
12649 goto scatter_gen;
12650 case IX86_BUILTIN_SCATTERDIV2DF:
12651 icode = CODE_FOR_avx512vl_scatterdiv2df;
12652 goto scatter_gen;
12653 case IX86_BUILTIN_SCATTERSIV8SI:
12654 icode = CODE_FOR_avx512vl_scattersiv8si;
12655 goto scatter_gen;
12656 case IX86_BUILTIN_SCATTERSIV4SI:
12657 icode = CODE_FOR_avx512vl_scattersiv4si;
12658 goto scatter_gen;
12659 case IX86_BUILTIN_SCATTERSIV4DI:
12660 icode = CODE_FOR_avx512vl_scattersiv4di;
12661 goto scatter_gen;
12662 case IX86_BUILTIN_SCATTERSIV2DI:
12663 icode = CODE_FOR_avx512vl_scattersiv2di;
12664 goto scatter_gen;
12665 case IX86_BUILTIN_SCATTERDIV8SI:
12666 icode = CODE_FOR_avx512vl_scatterdiv8si;
12667 goto scatter_gen;
12668 case IX86_BUILTIN_SCATTERDIV4SI:
12669 icode = CODE_FOR_avx512vl_scatterdiv4si;
12670 goto scatter_gen;
12671 case IX86_BUILTIN_SCATTERDIV4DI:
12672 icode = CODE_FOR_avx512vl_scatterdiv4di;
12673 goto scatter_gen;
12674 case IX86_BUILTIN_SCATTERDIV2DI:
12675 icode = CODE_FOR_avx512vl_scatterdiv2di;
12676 goto scatter_gen;
12677 case IX86_BUILTIN_GATHERPFDPD:
12678 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12679 goto vec_prefetch_gen;
12680 case IX86_BUILTIN_SCATTERALTSIV8DF:
12681 icode = CODE_FOR_avx512f_scattersiv8df;
12682 goto scatter_gen;
12683 case IX86_BUILTIN_SCATTERALTDIV16SF:
12684 icode = CODE_FOR_avx512f_scatterdiv16sf;
12685 goto scatter_gen;
12686 case IX86_BUILTIN_SCATTERALTSIV8DI:
12687 icode = CODE_FOR_avx512f_scattersiv8di;
12688 goto scatter_gen;
12689 case IX86_BUILTIN_SCATTERALTDIV16SI:
12690 icode = CODE_FOR_avx512f_scatterdiv16si;
12691 goto scatter_gen;
12692 case IX86_BUILTIN_SCATTERALTSIV4DF:
12693 icode = CODE_FOR_avx512vl_scattersiv4df;
12694 goto scatter_gen;
12695 case IX86_BUILTIN_SCATTERALTDIV8SF:
12696 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12697 goto scatter_gen;
12698 case IX86_BUILTIN_SCATTERALTSIV4DI:
12699 icode = CODE_FOR_avx512vl_scattersiv4di;
12700 goto scatter_gen;
12701 case IX86_BUILTIN_SCATTERALTDIV8SI:
12702 icode = CODE_FOR_avx512vl_scatterdiv8si;
12703 goto scatter_gen;
12704 case IX86_BUILTIN_SCATTERALTSIV2DF:
12705 icode = CODE_FOR_avx512vl_scattersiv2df;
12706 goto scatter_gen;
12707 case IX86_BUILTIN_SCATTERALTDIV4SF:
12708 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12709 goto scatter_gen;
12710 case IX86_BUILTIN_SCATTERALTSIV2DI:
12711 icode = CODE_FOR_avx512vl_scattersiv2di;
12712 goto scatter_gen;
12713 case IX86_BUILTIN_SCATTERALTDIV4SI:
12714 icode = CODE_FOR_avx512vl_scatterdiv4si;
12715 goto scatter_gen;
12716 case IX86_BUILTIN_GATHERPFDPS:
12717 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12718 goto vec_prefetch_gen;
12719 case IX86_BUILTIN_GATHERPFQPD:
12720 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12721 goto vec_prefetch_gen;
12722 case IX86_BUILTIN_GATHERPFQPS:
12723 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12724 goto vec_prefetch_gen;
12725 case IX86_BUILTIN_SCATTERPFDPD:
12726 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12727 goto vec_prefetch_gen;
12728 case IX86_BUILTIN_SCATTERPFDPS:
12729 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12730 goto vec_prefetch_gen;
12731 case IX86_BUILTIN_SCATTERPFQPD:
12732 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12733 goto vec_prefetch_gen;
12734 case IX86_BUILTIN_SCATTERPFQPS:
12735 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12736 goto vec_prefetch_gen;
12737
12738 gather_gen:
12739 rtx half;
12740 rtx (*gen) (rtx, rtx);
12741
12742 arg0 = CALL_EXPR_ARG (exp, 0);
12743 arg1 = CALL_EXPR_ARG (exp, 1);
12744 arg2 = CALL_EXPR_ARG (exp, 2);
12745 arg3 = CALL_EXPR_ARG (exp, 3);
12746 arg4 = CALL_EXPR_ARG (exp, 4);
12747 op0 = expand_normal (arg0);
12748 op1 = expand_normal (arg1);
12749 op2 = expand_normal (arg2);
12750 op3 = expand_normal (arg3);
12751 op4 = expand_normal (arg4);
12752 /* Note the arg order is different from the operand order. */
12753 mode0 = insn_data[icode].operand[1].mode;
12754 mode2 = insn_data[icode].operand[3].mode;
12755 mode3 = insn_data[icode].operand[4].mode;
12756 mode4 = insn_data[icode].operand[5].mode;
12757
12758 if (target == NULL_RTX
12759 || GET_MODE (target) != insn_data[icode].operand[0].mode
12760 || !insn_data[icode].operand[0].predicate (target,
12761 GET_MODE (target)))
12762 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12763 else
12764 subtarget = target;
12765
12766 switch (fcode)
12767 {
12768 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12769 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12770 half = gen_reg_rtx (V8SImode);
12771 if (!nonimmediate_operand (op2, V16SImode))
12772 op2 = copy_to_mode_reg (V16SImode, op2);
12773 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12774 op2 = half;
12775 break;
12776 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12777 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12778 case IX86_BUILTIN_GATHERALTSIV4DF:
12779 case IX86_BUILTIN_GATHERALTSIV4DI:
12780 half = gen_reg_rtx (V4SImode);
12781 if (!nonimmediate_operand (op2, V8SImode))
12782 op2 = copy_to_mode_reg (V8SImode, op2);
12783 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12784 op2 = half;
12785 break;
12786 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12787 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12788 half = gen_reg_rtx (mode0);
12789 if (mode0 == V8SFmode)
12790 gen = gen_vec_extract_lo_v16sf;
12791 else
12792 gen = gen_vec_extract_lo_v16si;
12793 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12794 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12795 emit_insn (gen (half, op0));
12796 op0 = half;
12797 op3 = lowpart_subreg (QImode, op3, HImode);
12798 break;
12799 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12800 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12801 case IX86_BUILTIN_GATHERALTDIV8SF:
12802 case IX86_BUILTIN_GATHERALTDIV8SI:
12803 half = gen_reg_rtx (mode0);
12804 if (mode0 == V4SFmode)
12805 gen = gen_vec_extract_lo_v8sf;
12806 else
12807 gen = gen_vec_extract_lo_v8si;
12808 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12809 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12810 emit_insn (gen (half, op0));
12811 op0 = half;
12812 if (VECTOR_MODE_P (GET_MODE (op3)))
12813 {
12814 half = gen_reg_rtx (mode0);
12815 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12816 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12817 emit_insn (gen (half, op3));
12818 op3 = half;
12819 }
12820 break;
12821 default:
12822 break;
12823 }
12824
12825 /* Force memory operand only with base register here. But we
12826 don't want to do it on memory operand for other builtin
12827 functions. */
12828 op1 = ix86_zero_extend_to_Pmode (op1);
12829
12830 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12831 op0 = copy_to_mode_reg (mode0, op0);
12832 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12833 op1 = copy_to_mode_reg (Pmode, op1);
12834 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12835 op2 = copy_to_mode_reg (mode2, op2);
12836
12837 op3 = fixup_modeless_constant (op3, mode3);
12838
12839 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12840 {
12841 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12842 op3 = copy_to_mode_reg (mode3, op3);
12843 }
12844 else
12845 {
12846 op3 = copy_to_reg (op3);
12847 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12848 }
12849 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12850 {
12851 error ("the last argument must be scale 1, 2, 4, 8");
12852 return const0_rtx;
12853 }
12854
12855 /* Optimize. If mask is known to have all high bits set,
12856 replace op0 with pc_rtx to signal that the instruction
12857 overwrites the whole destination and doesn't use its
12858 previous contents. */
12859 if (optimize)
12860 {
12861 if (TREE_CODE (arg3) == INTEGER_CST)
12862 {
12863 if (integer_all_onesp (arg3))
12864 op0 = pc_rtx;
12865 }
12866 else if (TREE_CODE (arg3) == VECTOR_CST)
12867 {
12868 unsigned int negative = 0;
12869 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12870 {
12871 tree cst = VECTOR_CST_ELT (arg3, i);
12872 if (TREE_CODE (cst) == INTEGER_CST
12873 && tree_int_cst_sign_bit (cst))
12874 negative++;
12875 else if (TREE_CODE (cst) == REAL_CST
12876 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12877 negative++;
12878 }
12879 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12880 op0 = pc_rtx;
12881 }
12882 else if (TREE_CODE (arg3) == SSA_NAME
12883 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12884 {
12885 /* Recognize also when mask is like:
12886 __v2df src = _mm_setzero_pd ();
12887 __v2df mask = _mm_cmpeq_pd (src, src);
12888 or
12889 __v8sf src = _mm256_setzero_ps ();
12890 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12891 as that is a cheaper way to load all ones into
12892 a register than having to load a constant from
12893 memory. */
12894 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12895 if (is_gimple_call (def_stmt))
12896 {
12897 tree fndecl = gimple_call_fndecl (def_stmt);
12898 if (fndecl
12899 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12900 switch (DECL_MD_FUNCTION_CODE (fndecl))
12901 {
12902 case IX86_BUILTIN_CMPPD:
12903 case IX86_BUILTIN_CMPPS:
12904 case IX86_BUILTIN_CMPPD256:
12905 case IX86_BUILTIN_CMPPS256:
12906 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12907 break;
12908 /* FALLTHRU */
12909 case IX86_BUILTIN_CMPEQPD:
12910 case IX86_BUILTIN_CMPEQPS:
12911 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12912 && initializer_zerop (gimple_call_arg (def_stmt,
12913 1)))
12914 op0 = pc_rtx;
12915 break;
12916 default:
12917 break;
12918 }
12919 }
12920 }
12921 }
12922
12923 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12924 if (! pat)
12925 return const0_rtx;
12926 emit_insn (pat);
12927
12928 switch (fcode)
12929 {
12930 case IX86_BUILTIN_GATHER3DIV16SF:
12931 if (target == NULL_RTX)
12932 target = gen_reg_rtx (V8SFmode);
12933 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12934 break;
12935 case IX86_BUILTIN_GATHER3DIV16SI:
12936 if (target == NULL_RTX)
12937 target = gen_reg_rtx (V8SImode);
12938 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12939 break;
12940 case IX86_BUILTIN_GATHER3DIV8SF:
12941 case IX86_BUILTIN_GATHERDIV8SF:
12942 if (target == NULL_RTX)
12943 target = gen_reg_rtx (V4SFmode);
12944 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12945 break;
12946 case IX86_BUILTIN_GATHER3DIV8SI:
12947 case IX86_BUILTIN_GATHERDIV8SI:
12948 if (target == NULL_RTX)
12949 target = gen_reg_rtx (V4SImode);
12950 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12951 break;
12952 default:
12953 target = subtarget;
12954 break;
12955 }
12956 return target;
12957
12958 scatter_gen:
12959 arg0 = CALL_EXPR_ARG (exp, 0);
12960 arg1 = CALL_EXPR_ARG (exp, 1);
12961 arg2 = CALL_EXPR_ARG (exp, 2);
12962 arg3 = CALL_EXPR_ARG (exp, 3);
12963 arg4 = CALL_EXPR_ARG (exp, 4);
12964 op0 = expand_normal (arg0);
12965 op1 = expand_normal (arg1);
12966 op2 = expand_normal (arg2);
12967 op3 = expand_normal (arg3);
12968 op4 = expand_normal (arg4);
12969 mode1 = insn_data[icode].operand[1].mode;
12970 mode2 = insn_data[icode].operand[2].mode;
12971 mode3 = insn_data[icode].operand[3].mode;
12972 mode4 = insn_data[icode].operand[4].mode;
12973
12974 /* Scatter instruction stores operand op3 to memory with
12975 indices from op2 and scale from op4 under writemask op1.
12976 If index operand op2 has more elements then source operand
12977 op3 one need to use only its low half. And vice versa. */
12978 switch (fcode)
12979 {
12980 case IX86_BUILTIN_SCATTERALTSIV8DF:
12981 case IX86_BUILTIN_SCATTERALTSIV8DI:
12982 half = gen_reg_rtx (V8SImode);
12983 if (!nonimmediate_operand (op2, V16SImode))
12984 op2 = copy_to_mode_reg (V16SImode, op2);
12985 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12986 op2 = half;
12987 break;
12988 case IX86_BUILTIN_SCATTERALTDIV16SF:
12989 case IX86_BUILTIN_SCATTERALTDIV16SI:
12990 half = gen_reg_rtx (mode3);
12991 if (mode3 == V8SFmode)
12992 gen = gen_vec_extract_lo_v16sf;
12993 else
12994 gen = gen_vec_extract_lo_v16si;
12995 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12996 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12997 emit_insn (gen (half, op3));
12998 op3 = half;
12999 break;
13000 case IX86_BUILTIN_SCATTERALTSIV4DF:
13001 case IX86_BUILTIN_SCATTERALTSIV4DI:
13002 half = gen_reg_rtx (V4SImode);
13003 if (!nonimmediate_operand (op2, V8SImode))
13004 op2 = copy_to_mode_reg (V8SImode, op2);
13005 emit_insn (gen_vec_extract_lo_v8si (half, op2));
13006 op2 = half;
13007 break;
13008 case IX86_BUILTIN_SCATTERALTDIV8SF:
13009 case IX86_BUILTIN_SCATTERALTDIV8SI:
13010 half = gen_reg_rtx (mode3);
13011 if (mode3 == V4SFmode)
13012 gen = gen_vec_extract_lo_v8sf;
13013 else
13014 gen = gen_vec_extract_lo_v8si;
13015 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13016 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13017 emit_insn (gen (half, op3));
13018 op3 = half;
13019 break;
13020 case IX86_BUILTIN_SCATTERALTSIV2DF:
13021 case IX86_BUILTIN_SCATTERALTSIV2DI:
13022 if (!nonimmediate_operand (op2, V4SImode))
13023 op2 = copy_to_mode_reg (V4SImode, op2);
13024 break;
13025 case IX86_BUILTIN_SCATTERALTDIV4SF:
13026 case IX86_BUILTIN_SCATTERALTDIV4SI:
13027 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13028 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13029 break;
13030 default:
13031 break;
13032 }
13033
13034 /* Force memory operand only with base register here. But we
13035 don't want to do it on memory operand for other builtin
13036 functions. */
13037 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13038
13039 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13040 op0 = copy_to_mode_reg (Pmode, op0);
13041
13042 op1 = fixup_modeless_constant (op1, mode1);
13043
13044 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13045 {
13046 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13047 op1 = copy_to_mode_reg (mode1, op1);
13048 }
13049 else
13050 {
13051 op1 = copy_to_reg (op1);
13052 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13053 }
13054
13055 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13056 op2 = copy_to_mode_reg (mode2, op2);
13057
13058 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13059 op3 = copy_to_mode_reg (mode3, op3);
13060
13061 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13062 {
13063 error ("the last argument must be scale 1, 2, 4, 8");
13064 return const0_rtx;
13065 }
13066
13067 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13068 if (! pat)
13069 return const0_rtx;
13070
13071 emit_insn (pat);
13072 return 0;
13073
13074 vec_prefetch_gen:
13075 arg0 = CALL_EXPR_ARG (exp, 0);
13076 arg1 = CALL_EXPR_ARG (exp, 1);
13077 arg2 = CALL_EXPR_ARG (exp, 2);
13078 arg3 = CALL_EXPR_ARG (exp, 3);
13079 arg4 = CALL_EXPR_ARG (exp, 4);
13080 op0 = expand_normal (arg0);
13081 op1 = expand_normal (arg1);
13082 op2 = expand_normal (arg2);
13083 op3 = expand_normal (arg3);
13084 op4 = expand_normal (arg4);
13085 mode0 = insn_data[icode].operand[0].mode;
13086 mode1 = insn_data[icode].operand[1].mode;
13087 mode3 = insn_data[icode].operand[3].mode;
13088 mode4 = insn_data[icode].operand[4].mode;
13089
13090 op0 = fixup_modeless_constant (op0, mode0);
13091
13092 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13093 {
13094 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13095 op0 = copy_to_mode_reg (mode0, op0);
13096 }
13097 else
13098 {
13099 op0 = copy_to_reg (op0);
13100 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13101 }
13102
13103 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13104 op1 = copy_to_mode_reg (mode1, op1);
13105
13106 /* Force memory operand only with base register here. But we
13107 don't want to do it on memory operand for other builtin
13108 functions. */
13109 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13110
13111 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13112 op2 = copy_to_mode_reg (Pmode, op2);
13113
13114 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13115 {
13116 error ("the forth argument must be scale 1, 2, 4, 8");
13117 return const0_rtx;
13118 }
13119
13120 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13121 {
13122 error ("incorrect hint operand");
13123 return const0_rtx;
13124 }
13125
13126 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13127 if (! pat)
13128 return const0_rtx;
13129
13130 emit_insn (pat);
13131
13132 return 0;
13133
13134 case IX86_BUILTIN_XABORT:
13135 icode = CODE_FOR_xabort;
13136 arg0 = CALL_EXPR_ARG (exp, 0);
13137 op0 = expand_normal (arg0);
13138 mode0 = insn_data[icode].operand[0].mode;
13139 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13140 {
13141 error ("the argument to %<xabort%> intrinsic must "
13142 "be an 8-bit immediate");
13143 return const0_rtx;
13144 }
13145 emit_insn (gen_xabort (op0));
13146 return 0;
13147
13148 case IX86_BUILTIN_RDSSPD:
13149 case IX86_BUILTIN_RDSSPQ:
13150 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13151
13152 if (target == 0
13153 || !register_operand (target, mode))
13154 target = gen_reg_rtx (mode);
13155
13156 op0 = force_reg (mode, const0_rtx);
13157
13158 emit_insn (gen_rdssp (mode, target, op0));
13159 return target;
13160
13161 case IX86_BUILTIN_INCSSPD:
13162 case IX86_BUILTIN_INCSSPQ:
13163 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13164
13165 arg0 = CALL_EXPR_ARG (exp, 0);
13166 op0 = expand_normal (arg0);
13167
13168 op0 = force_reg (mode, op0);
13169
13170 emit_insn (gen_incssp (mode, op0));
13171 return 0;
13172
13173 case IX86_BUILTIN_HRESET:
13174 icode = CODE_FOR_hreset;
13175 arg0 = CALL_EXPR_ARG (exp, 0);
13176 op0 = expand_normal (arg0);
13177 op0 = force_reg (SImode, op0);
13178 emit_insn (gen_hreset (op0));
13179 return 0;
13180
13181 case IX86_BUILTIN_RSTORSSP:
13182 case IX86_BUILTIN_CLRSSBSY:
13183 arg0 = CALL_EXPR_ARG (exp, 0);
13184 op0 = expand_normal (arg0);
13185 icode = (fcode == IX86_BUILTIN_RSTORSSP
13186 ? CODE_FOR_rstorssp
13187 : CODE_FOR_clrssbsy);
13188
13189 if (!address_operand (op0, VOIDmode))
13190 {
13191 op0 = convert_memory_address (Pmode, op0);
13192 op0 = copy_addr_to_reg (op0);
13193 }
13194 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13195 return 0;
13196
13197 case IX86_BUILTIN_WRSSD:
13198 case IX86_BUILTIN_WRSSQ:
13199 case IX86_BUILTIN_WRUSSD:
13200 case IX86_BUILTIN_WRUSSQ:
13201 mode = ((fcode == IX86_BUILTIN_WRSSD
13202 || fcode == IX86_BUILTIN_WRUSSD)
13203 ? SImode : DImode);
13204
13205 arg0 = CALL_EXPR_ARG (exp, 0);
13206 op0 = expand_normal (arg0);
13207 arg1 = CALL_EXPR_ARG (exp, 1);
13208 op1 = expand_normal (arg1);
13209
13210 op0 = force_reg (mode, op0);
13211
13212 if (!address_operand (op1, VOIDmode))
13213 {
13214 op1 = convert_memory_address (Pmode, op1);
13215 op1 = copy_addr_to_reg (op1);
13216 }
13217 op1 = gen_rtx_MEM (mode, op1);
13218
13219 icode = ((fcode == IX86_BUILTIN_WRSSD
13220 || fcode == IX86_BUILTIN_WRSSQ)
13221 ? code_for_wrss (mode)
13222 : code_for_wruss (mode));
13223 emit_insn (GEN_FCN (icode) (op0, op1));
13224
13225 return 0;
13226
13227 case IX86_BUILTIN_VZEROUPPER:
13228 cfun->machine->has_explicit_vzeroupper = true;
13229 break;
13230
13231 default:
13232 break;
13233 }
13234
13235 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13236 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13237 {
13238 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13239 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13240 target);
13241 }
13242
13243 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13244 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13245 {
13246 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13247 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13248 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13249 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13250 int masked = 1;
13251 machine_mode mode, wide_mode, nar_mode;
13252
13253 nar_mode = V4SFmode;
13254 mode = V16SFmode;
13255 wide_mode = V64SFmode;
13256 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13257 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13258
13259 switch (fcode)
13260 {
13261 case IX86_BUILTIN_4FMAPS:
13262 fcn = gen_avx5124fmaddps_4fmaddps;
13263 masked = 0;
13264 goto v4fma_expand;
13265
13266 case IX86_BUILTIN_4DPWSSD:
13267 nar_mode = V4SImode;
13268 mode = V16SImode;
13269 wide_mode = V64SImode;
13270 fcn = gen_avx5124vnniw_vp4dpwssd;
13271 masked = 0;
13272 goto v4fma_expand;
13273
13274 case IX86_BUILTIN_4DPWSSDS:
13275 nar_mode = V4SImode;
13276 mode = V16SImode;
13277 wide_mode = V64SImode;
13278 fcn = gen_avx5124vnniw_vp4dpwssds;
13279 masked = 0;
13280 goto v4fma_expand;
13281
13282 case IX86_BUILTIN_4FNMAPS:
13283 fcn = gen_avx5124fmaddps_4fnmaddps;
13284 masked = 0;
13285 goto v4fma_expand;
13286
13287 case IX86_BUILTIN_4FNMAPS_MASK:
13288 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13289 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13290 goto v4fma_expand;
13291
13292 case IX86_BUILTIN_4DPWSSD_MASK:
13293 nar_mode = V4SImode;
13294 mode = V16SImode;
13295 wide_mode = V64SImode;
13296 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13297 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13298 goto v4fma_expand;
13299
13300 case IX86_BUILTIN_4DPWSSDS_MASK:
13301 nar_mode = V4SImode;
13302 mode = V16SImode;
13303 wide_mode = V64SImode;
13304 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13305 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13306 goto v4fma_expand;
13307
13308 case IX86_BUILTIN_4FMAPS_MASK:
13309 {
13310 tree args[4];
13311 rtx ops[4];
13312 rtx wide_reg;
13313 rtx accum;
13314 rtx addr;
13315 rtx mem;
13316
13317 v4fma_expand:
13318 wide_reg = gen_reg_rtx (wide_mode);
13319 for (i = 0; i < 4; i++)
13320 {
13321 args[i] = CALL_EXPR_ARG (exp, i);
13322 ops[i] = expand_normal (args[i]);
13323
13324 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13325 ops[i]);
13326 }
13327
13328 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13329 accum = force_reg (mode, accum);
13330
13331 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13332 addr = force_reg (Pmode, addr);
13333
13334 mem = gen_rtx_MEM (nar_mode, addr);
13335
13336 target = gen_reg_rtx (mode);
13337
13338 emit_move_insn (target, accum);
13339
13340 if (! masked)
13341 emit_insn (fcn (target, accum, wide_reg, mem));
13342 else
13343 {
13344 rtx merge, mask;
13345 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13346
13347 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13348
13349 if (CONST_INT_P (mask))
13350 mask = fixup_modeless_constant (mask, HImode);
13351
13352 mask = force_reg (HImode, mask);
13353
13354 if (GET_MODE (mask) != HImode)
13355 mask = gen_rtx_SUBREG (HImode, mask, 0);
13356
13357 /* If merge is 0 then we're about to emit z-masked variant. */
13358 if (const0_operand (merge, mode))
13359 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13360 /* If merge is the same as accum then emit merge-masked variant. */
13361 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13362 {
13363 merge = force_reg (mode, merge);
13364 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13365 }
13366 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13367 else
13368 {
13369 target = gen_reg_rtx (mode);
13370 emit_move_insn (target, merge);
13371 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13372 }
13373 }
13374 return target;
13375 }
13376
13377 case IX86_BUILTIN_4FNMASS:
13378 fcn = gen_avx5124fmaddps_4fnmaddss;
13379 masked = 0;
13380 goto s4fma_expand;
13381
13382 case IX86_BUILTIN_4FMASS:
13383 fcn = gen_avx5124fmaddps_4fmaddss;
13384 masked = 0;
13385 goto s4fma_expand;
13386
13387 case IX86_BUILTIN_4FNMASS_MASK:
13388 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13389 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13390 goto s4fma_expand;
13391
13392 case IX86_BUILTIN_4FMASS_MASK:
13393 {
13394 tree args[4];
13395 rtx ops[4];
13396 rtx wide_reg;
13397 rtx accum;
13398 rtx addr;
13399 rtx mem;
13400
13401 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13402 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13403
13404 s4fma_expand:
13405 mode = V4SFmode;
13406 wide_reg = gen_reg_rtx (V64SFmode);
13407 for (i = 0; i < 4; i++)
13408 {
13409 rtx tmp;
13410 args[i] = CALL_EXPR_ARG (exp, i);
13411 ops[i] = expand_normal (args[i]);
13412
13413 tmp = gen_reg_rtx (SFmode);
13414 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13415
13416 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13417 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13418 }
13419
13420 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13421 accum = force_reg (V4SFmode, accum);
13422
13423 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13424 addr = force_reg (Pmode, addr);
13425
13426 mem = gen_rtx_MEM (V4SFmode, addr);
13427
13428 target = gen_reg_rtx (V4SFmode);
13429
13430 emit_move_insn (target, accum);
13431
13432 if (! masked)
13433 emit_insn (fcn (target, accum, wide_reg, mem));
13434 else
13435 {
13436 rtx merge, mask;
13437 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13438
13439 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13440
13441 if (CONST_INT_P (mask))
13442 mask = fixup_modeless_constant (mask, QImode);
13443
13444 mask = force_reg (QImode, mask);
13445
13446 if (GET_MODE (mask) != QImode)
13447 mask = gen_rtx_SUBREG (QImode, mask, 0);
13448
13449 /* If merge is 0 then we're about to emit z-masked variant. */
13450 if (const0_operand (merge, mode))
13451 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13452 /* If merge is the same as accum then emit merge-masked
13453 variant. */
13454 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13455 {
13456 merge = force_reg (mode, merge);
13457 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13458 }
13459 /* Merge with something unknown might happen if we z-mask
13460 w/ -O0. */
13461 else
13462 {
13463 target = gen_reg_rtx (mode);
13464 emit_move_insn (target, merge);
13465 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13466 }
13467 }
13468 return target;
13469 }
13470 case IX86_BUILTIN_RDPID:
13471 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13472 target);
13473 case IX86_BUILTIN_FABSQ:
13474 case IX86_BUILTIN_COPYSIGNQ:
13475 if (!TARGET_SSE)
13476 /* Emit a normal call if SSE isn't available. */
13477 return expand_call (exp, target, ignore);
13478 /* FALLTHRU */
13479 default:
13480 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13481 }
13482 }
13483
13484 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13485 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13486 {
13487 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13488 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13489 }
13490
13491 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13492 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13493 {
13494 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13495 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13496 }
13497
13498 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13499 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13500 {
13501 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13502 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13503 }
13504
13505 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13506 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13507 {
13508 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13509 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13510 }
13511
13512 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13513 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13514 {
13515 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13516 const struct builtin_description *d = bdesc_multi_arg + i;
13517 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13518 (enum ix86_builtin_func_type)
13519 d->flag, d->comparison);
13520 }
13521
13522 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13523 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13524 {
13525 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13526 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13527 target);
13528 }
13529
13530 gcc_unreachable ();
13531 }
13532
13533 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13534 fill target with val via vec_duplicate. */
13535
13536 static bool
13537 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13538 {
13539 bool ok;
13540 rtx_insn *insn;
13541 rtx dup;
13542
13543 /* First attempt to recognize VAL as-is. */
13544 dup = gen_vec_duplicate (mode, val);
13545 insn = emit_insn (gen_rtx_SET (target, dup));
13546 if (recog_memoized (insn) < 0)
13547 {
13548 rtx_insn *seq;
13549 machine_mode innermode = GET_MODE_INNER (mode);
13550 rtx reg;
13551
13552 /* If that fails, force VAL into a register. */
13553
13554 start_sequence ();
13555 reg = force_reg (innermode, val);
13556 if (GET_MODE (reg) != innermode)
13557 reg = gen_lowpart (innermode, reg);
13558 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13559 seq = get_insns ();
13560 end_sequence ();
13561 if (seq)
13562 emit_insn_before (seq, insn);
13563
13564 ok = recog_memoized (insn) >= 0;
13565 gcc_assert (ok);
13566 }
13567 return true;
13568 }
13569
13570 /* Get a vector mode of the same size as the original but with elements
13571 twice as wide. This is only guaranteed to apply to integral vectors. */
13572
13573 static machine_mode
13574 get_mode_wider_vector (machine_mode o)
13575 {
13576 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13577 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13578 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13579 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13580 return n;
13581 }
13582
13583 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13584 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13585
13586 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13587 with all elements equal to VAR. Return true if successful. */
13588
13589 static bool
13590 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13591 rtx target, rtx val)
13592 {
13593 bool ok;
13594
13595 switch (mode)
13596 {
13597 case E_V2SImode:
13598 case E_V2SFmode:
13599 if (!mmx_ok)
13600 return false;
13601 /* FALLTHRU */
13602
13603 case E_V4DFmode:
13604 case E_V4DImode:
13605 case E_V8SFmode:
13606 case E_V8SImode:
13607 case E_V2DFmode:
13608 case E_V2DImode:
13609 case E_V4SFmode:
13610 case E_V4SImode:
13611 case E_V16SImode:
13612 case E_V8DImode:
13613 case E_V16SFmode:
13614 case E_V8DFmode:
13615 return ix86_vector_duplicate_value (mode, target, val);
13616
13617 case E_V4HImode:
13618 if (!mmx_ok)
13619 return false;
13620 if (TARGET_SSE || TARGET_3DNOW_A)
13621 {
13622 rtx x;
13623
13624 val = gen_lowpart (SImode, val);
13625 x = gen_rtx_TRUNCATE (HImode, val);
13626 x = gen_rtx_VEC_DUPLICATE (mode, x);
13627 emit_insn (gen_rtx_SET (target, x));
13628 return true;
13629 }
13630 goto widen;
13631
13632 case E_V8QImode:
13633 if (!mmx_ok)
13634 return false;
13635 goto widen;
13636
13637 case E_V8HImode:
13638 if (TARGET_AVX2)
13639 return ix86_vector_duplicate_value (mode, target, val);
13640
13641 if (TARGET_SSE2)
13642 {
13643 struct expand_vec_perm_d dperm;
13644 rtx tmp1, tmp2;
13645
13646 permute:
13647 memset (&dperm, 0, sizeof (dperm));
13648 dperm.target = target;
13649 dperm.vmode = mode;
13650 dperm.nelt = GET_MODE_NUNITS (mode);
13651 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13652 dperm.one_operand_p = true;
13653
13654 /* Extend to SImode using a paradoxical SUBREG. */
13655 tmp1 = gen_reg_rtx (SImode);
13656 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13657
13658 /* Insert the SImode value as low element of a V4SImode vector. */
13659 tmp2 = gen_reg_rtx (V4SImode);
13660 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13661 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13662
13663 ok = (expand_vec_perm_1 (&dperm)
13664 || expand_vec_perm_broadcast_1 (&dperm));
13665 gcc_assert (ok);
13666 return ok;
13667 }
13668 goto widen;
13669
13670 case E_V16QImode:
13671 if (TARGET_AVX2)
13672 return ix86_vector_duplicate_value (mode, target, val);
13673
13674 if (TARGET_SSE2)
13675 goto permute;
13676 goto widen;
13677
13678 widen:
13679 /* Replicate the value once into the next wider mode and recurse. */
13680 {
13681 machine_mode smode, wsmode, wvmode;
13682 rtx x;
13683
13684 smode = GET_MODE_INNER (mode);
13685 wvmode = get_mode_wider_vector (mode);
13686 wsmode = GET_MODE_INNER (wvmode);
13687
13688 val = convert_modes (wsmode, smode, val, true);
13689 x = expand_simple_binop (wsmode, ASHIFT, val,
13690 GEN_INT (GET_MODE_BITSIZE (smode)),
13691 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13692 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13693
13694 x = gen_reg_rtx (wvmode);
13695 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13696 gcc_assert (ok);
13697 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13698 return ok;
13699 }
13700
13701 case E_V16HImode:
13702 case E_V32QImode:
13703 if (TARGET_AVX2)
13704 return ix86_vector_duplicate_value (mode, target, val);
13705 else
13706 {
13707 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13708 rtx x = gen_reg_rtx (hvmode);
13709
13710 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13711 gcc_assert (ok);
13712
13713 x = gen_rtx_VEC_CONCAT (mode, x, x);
13714 emit_insn (gen_rtx_SET (target, x));
13715 }
13716 return true;
13717
13718 case E_V64QImode:
13719 case E_V32HImode:
13720 if (TARGET_AVX512BW)
13721 return ix86_vector_duplicate_value (mode, target, val);
13722 else
13723 {
13724 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13725 rtx x = gen_reg_rtx (hvmode);
13726
13727 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13728 gcc_assert (ok);
13729
13730 x = gen_rtx_VEC_CONCAT (mode, x, x);
13731 emit_insn (gen_rtx_SET (target, x));
13732 }
13733 return true;
13734
13735 default:
13736 return false;
13737 }
13738 }
13739
13740 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13741 whose ONE_VAR element is VAR, and other elements are zero. Return true
13742 if successful. */
13743
13744 static bool
13745 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13746 rtx target, rtx var, int one_var)
13747 {
13748 machine_mode vsimode;
13749 rtx new_target;
13750 rtx x, tmp;
13751 bool use_vector_set = false;
13752 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13753
13754 switch (mode)
13755 {
13756 case E_V2DImode:
13757 /* For SSE4.1, we normally use vector set. But if the second
13758 element is zero and inter-unit moves are OK, we use movq
13759 instead. */
13760 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13761 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13762 && one_var == 0));
13763 break;
13764 case E_V16QImode:
13765 case E_V4SImode:
13766 case E_V4SFmode:
13767 use_vector_set = TARGET_SSE4_1;
13768 break;
13769 case E_V8HImode:
13770 use_vector_set = TARGET_SSE2;
13771 break;
13772 case E_V8QImode:
13773 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13774 break;
13775 case E_V4HImode:
13776 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13777 break;
13778 case E_V32QImode:
13779 case E_V16HImode:
13780 use_vector_set = TARGET_AVX;
13781 break;
13782 case E_V8SImode:
13783 use_vector_set = TARGET_AVX;
13784 gen_vec_set_0 = gen_vec_setv8si_0;
13785 break;
13786 case E_V8SFmode:
13787 use_vector_set = TARGET_AVX;
13788 gen_vec_set_0 = gen_vec_setv8sf_0;
13789 break;
13790 case E_V4DFmode:
13791 use_vector_set = TARGET_AVX;
13792 gen_vec_set_0 = gen_vec_setv4df_0;
13793 break;
13794 case E_V4DImode:
13795 /* Use ix86_expand_vector_set in 64bit mode only. */
13796 use_vector_set = TARGET_AVX && TARGET_64BIT;
13797 gen_vec_set_0 = gen_vec_setv4di_0;
13798 break;
13799 case E_V16SImode:
13800 use_vector_set = TARGET_AVX512F && one_var == 0;
13801 gen_vec_set_0 = gen_vec_setv16si_0;
13802 break;
13803 case E_V16SFmode:
13804 use_vector_set = TARGET_AVX512F && one_var == 0;
13805 gen_vec_set_0 = gen_vec_setv16sf_0;
13806 break;
13807 case E_V8DFmode:
13808 use_vector_set = TARGET_AVX512F && one_var == 0;
13809 gen_vec_set_0 = gen_vec_setv8df_0;
13810 break;
13811 case E_V8DImode:
13812 /* Use ix86_expand_vector_set in 64bit mode only. */
13813 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13814 gen_vec_set_0 = gen_vec_setv8di_0;
13815 break;
13816 default:
13817 break;
13818 }
13819
13820 if (use_vector_set)
13821 {
13822 if (gen_vec_set_0 && one_var == 0)
13823 {
13824 var = force_reg (GET_MODE_INNER (mode), var);
13825 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13826 return true;
13827 }
13828 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13829 var = force_reg (GET_MODE_INNER (mode), var);
13830 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13831 return true;
13832 }
13833
13834 switch (mode)
13835 {
13836 case E_V2SFmode:
13837 case E_V2SImode:
13838 if (!mmx_ok)
13839 return false;
13840 /* FALLTHRU */
13841
13842 case E_V2DFmode:
13843 case E_V2DImode:
13844 if (one_var != 0)
13845 return false;
13846 var = force_reg (GET_MODE_INNER (mode), var);
13847 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13848 emit_insn (gen_rtx_SET (target, x));
13849 return true;
13850
13851 case E_V4SFmode:
13852 case E_V4SImode:
13853 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13854 new_target = gen_reg_rtx (mode);
13855 else
13856 new_target = target;
13857 var = force_reg (GET_MODE_INNER (mode), var);
13858 x = gen_rtx_VEC_DUPLICATE (mode, var);
13859 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13860 emit_insn (gen_rtx_SET (new_target, x));
13861 if (one_var != 0)
13862 {
13863 /* We need to shuffle the value to the correct position, so
13864 create a new pseudo to store the intermediate result. */
13865
13866 /* With SSE2, we can use the integer shuffle insns. */
13867 if (mode != V4SFmode && TARGET_SSE2)
13868 {
13869 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13870 const1_rtx,
13871 GEN_INT (one_var == 1 ? 0 : 1),
13872 GEN_INT (one_var == 2 ? 0 : 1),
13873 GEN_INT (one_var == 3 ? 0 : 1)));
13874 if (target != new_target)
13875 emit_move_insn (target, new_target);
13876 return true;
13877 }
13878
13879 /* Otherwise convert the intermediate result to V4SFmode and
13880 use the SSE1 shuffle instructions. */
13881 if (mode != V4SFmode)
13882 {
13883 tmp = gen_reg_rtx (V4SFmode);
13884 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13885 }
13886 else
13887 tmp = new_target;
13888
13889 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13890 const1_rtx,
13891 GEN_INT (one_var == 1 ? 0 : 1),
13892 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13893 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13894
13895 if (mode != V4SFmode)
13896 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13897 else if (tmp != target)
13898 emit_move_insn (target, tmp);
13899 }
13900 else if (target != new_target)
13901 emit_move_insn (target, new_target);
13902 return true;
13903
13904 case E_V8HImode:
13905 case E_V16QImode:
13906 vsimode = V4SImode;
13907 goto widen;
13908 case E_V4HImode:
13909 case E_V8QImode:
13910 if (!mmx_ok)
13911 return false;
13912 vsimode = V2SImode;
13913 goto widen;
13914 widen:
13915 if (one_var != 0)
13916 return false;
13917
13918 /* Zero extend the variable element to SImode and recurse. */
13919 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13920
13921 x = gen_reg_rtx (vsimode);
13922 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13923 var, one_var))
13924 gcc_unreachable ();
13925
13926 emit_move_insn (target, gen_lowpart (mode, x));
13927 return true;
13928
13929 default:
13930 return false;
13931 }
13932 }
13933
13934 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13935 consisting of the values in VALS. It is known that all elements
13936 except ONE_VAR are constants. Return true if successful. */
13937
13938 static bool
13939 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13940 rtx target, rtx vals, int one_var)
13941 {
13942 rtx var = XVECEXP (vals, 0, one_var);
13943 machine_mode wmode;
13944 rtx const_vec, x;
13945
13946 const_vec = copy_rtx (vals);
13947 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13948 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13949
13950 switch (mode)
13951 {
13952 case E_V2DFmode:
13953 case E_V2DImode:
13954 case E_V2SFmode:
13955 case E_V2SImode:
13956 /* For the two element vectors, it's just as easy to use
13957 the general case. */
13958 return false;
13959
13960 case E_V4DImode:
13961 /* Use ix86_expand_vector_set in 64bit mode only. */
13962 if (!TARGET_64BIT)
13963 return false;
13964 /* FALLTHRU */
13965 case E_V4DFmode:
13966 case E_V8SFmode:
13967 case E_V8SImode:
13968 case E_V16HImode:
13969 case E_V32QImode:
13970 case E_V4SFmode:
13971 case E_V4SImode:
13972 case E_V8HImode:
13973 case E_V4HImode:
13974 break;
13975
13976 case E_V16QImode:
13977 if (TARGET_SSE4_1)
13978 break;
13979 wmode = V8HImode;
13980 goto widen;
13981 case E_V8QImode:
13982 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13983 break;
13984 wmode = V4HImode;
13985 goto widen;
13986 widen:
13987 /* There's no way to set one QImode entry easily. Combine
13988 the variable value with its adjacent constant value, and
13989 promote to an HImode set. */
13990 x = XVECEXP (vals, 0, one_var ^ 1);
13991 if (one_var & 1)
13992 {
13993 var = convert_modes (HImode, QImode, var, true);
13994 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13995 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13996 x = GEN_INT (INTVAL (x) & 0xff);
13997 }
13998 else
13999 {
14000 var = convert_modes (HImode, QImode, var, true);
14001 x = gen_int_mode (UINTVAL (x) << 8, HImode);
14002 }
14003 if (x != const0_rtx)
14004 var = expand_simple_binop (HImode, IOR, var, x, var,
14005 1, OPTAB_LIB_WIDEN);
14006
14007 x = gen_reg_rtx (wmode);
14008 emit_move_insn (x, gen_lowpart (wmode, const_vec));
14009 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
14010
14011 emit_move_insn (target, gen_lowpart (mode, x));
14012 return true;
14013
14014 default:
14015 return false;
14016 }
14017
14018 emit_move_insn (target, const_vec);
14019 ix86_expand_vector_set (mmx_ok, target, var, one_var);
14020 return true;
14021 }
14022
14023 /* A subroutine of ix86_expand_vector_init_general. Use vector
14024 concatenate to handle the most general case: all values variable,
14025 and none identical. */
14026
14027 static void
14028 ix86_expand_vector_init_concat (machine_mode mode,
14029 rtx target, rtx *ops, int n)
14030 {
14031 machine_mode half_mode = VOIDmode;
14032 rtx half[2];
14033 rtvec v;
14034 int i, j;
14035
14036 switch (n)
14037 {
14038 case 2:
14039 switch (mode)
14040 {
14041 case E_V16SImode:
14042 half_mode = V8SImode;
14043 break;
14044 case E_V16SFmode:
14045 half_mode = V8SFmode;
14046 break;
14047 case E_V8DImode:
14048 half_mode = V4DImode;
14049 break;
14050 case E_V8DFmode:
14051 half_mode = V4DFmode;
14052 break;
14053 case E_V8SImode:
14054 half_mode = V4SImode;
14055 break;
14056 case E_V8SFmode:
14057 half_mode = V4SFmode;
14058 break;
14059 case E_V4DImode:
14060 half_mode = V2DImode;
14061 break;
14062 case E_V4DFmode:
14063 half_mode = V2DFmode;
14064 break;
14065 case E_V4SImode:
14066 half_mode = V2SImode;
14067 break;
14068 case E_V4SFmode:
14069 half_mode = V2SFmode;
14070 break;
14071 case E_V2DImode:
14072 half_mode = DImode;
14073 break;
14074 case E_V2SImode:
14075 half_mode = SImode;
14076 break;
14077 case E_V2DFmode:
14078 half_mode = DFmode;
14079 break;
14080 case E_V2SFmode:
14081 half_mode = SFmode;
14082 break;
14083 default:
14084 gcc_unreachable ();
14085 }
14086
14087 if (!register_operand (ops[1], half_mode))
14088 ops[1] = force_reg (half_mode, ops[1]);
14089 if (!register_operand (ops[0], half_mode))
14090 ops[0] = force_reg (half_mode, ops[0]);
14091 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14092 ops[1])));
14093 break;
14094
14095 case 4:
14096 switch (mode)
14097 {
14098 case E_V4DImode:
14099 half_mode = V2DImode;
14100 break;
14101 case E_V4DFmode:
14102 half_mode = V2DFmode;
14103 break;
14104 case E_V4SImode:
14105 half_mode = V2SImode;
14106 break;
14107 case E_V4SFmode:
14108 half_mode = V2SFmode;
14109 break;
14110 default:
14111 gcc_unreachable ();
14112 }
14113 goto half;
14114
14115 case 8:
14116 switch (mode)
14117 {
14118 case E_V8DImode:
14119 half_mode = V4DImode;
14120 break;
14121 case E_V8DFmode:
14122 half_mode = V4DFmode;
14123 break;
14124 case E_V8SImode:
14125 half_mode = V4SImode;
14126 break;
14127 case E_V8SFmode:
14128 half_mode = V4SFmode;
14129 break;
14130 default:
14131 gcc_unreachable ();
14132 }
14133 goto half;
14134
14135 case 16:
14136 switch (mode)
14137 {
14138 case E_V16SImode:
14139 half_mode = V8SImode;
14140 break;
14141 case E_V16SFmode:
14142 half_mode = V8SFmode;
14143 break;
14144 default:
14145 gcc_unreachable ();
14146 }
14147 goto half;
14148
14149 half:
14150 /* FIXME: We process inputs backward to help RA. PR 36222. */
14151 i = n - 1;
14152 for (j = 1; j != -1; j--)
14153 {
14154 half[j] = gen_reg_rtx (half_mode);
14155 switch (n >> 1)
14156 {
14157 case 2:
14158 v = gen_rtvec (2, ops[i-1], ops[i]);
14159 i -= 2;
14160 break;
14161 case 4:
14162 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14163 i -= 4;
14164 break;
14165 case 8:
14166 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14167 ops[i-3], ops[i-2], ops[i-1], ops[i]);
14168 i -= 8;
14169 break;
14170 default:
14171 gcc_unreachable ();
14172 }
14173 ix86_expand_vector_init (false, half[j],
14174 gen_rtx_PARALLEL (half_mode, v));
14175 }
14176
14177 ix86_expand_vector_init_concat (mode, target, half, 2);
14178 break;
14179
14180 default:
14181 gcc_unreachable ();
14182 }
14183 }
14184
14185 /* A subroutine of ix86_expand_vector_init_general. Use vector
14186 interleave to handle the most general case: all values variable,
14187 and none identical. */
14188
14189 static void
14190 ix86_expand_vector_init_interleave (machine_mode mode,
14191 rtx target, rtx *ops, int n)
14192 {
14193 machine_mode first_imode, second_imode, third_imode, inner_mode;
14194 int i, j;
14195 rtx op0, op1;
14196 rtx (*gen_load_even) (rtx, rtx, rtx);
14197 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14198 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14199
14200 switch (mode)
14201 {
14202 case E_V8HImode:
14203 gen_load_even = gen_vec_setv8hi;
14204 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14205 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14206 inner_mode = HImode;
14207 first_imode = V4SImode;
14208 second_imode = V2DImode;
14209 third_imode = VOIDmode;
14210 break;
14211 case E_V16QImode:
14212 gen_load_even = gen_vec_setv16qi;
14213 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14214 gen_interleave_second_low = gen_vec_interleave_lowv4si;
14215 inner_mode = QImode;
14216 first_imode = V8HImode;
14217 second_imode = V4SImode;
14218 third_imode = V2DImode;
14219 break;
14220 default:
14221 gcc_unreachable ();
14222 }
14223
14224 for (i = 0; i < n; i++)
14225 {
14226 /* Extend the odd elment to SImode using a paradoxical SUBREG. */
14227 op0 = gen_reg_rtx (SImode);
14228 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
14229
14230 /* Insert the SImode value as low element of V4SImode vector. */
14231 op1 = gen_reg_rtx (V4SImode);
14232 op0 = gen_rtx_VEC_MERGE (V4SImode,
14233 gen_rtx_VEC_DUPLICATE (V4SImode,
14234 op0),
14235 CONST0_RTX (V4SImode),
14236 const1_rtx);
14237 emit_insn (gen_rtx_SET (op1, op0));
14238
14239 /* Cast the V4SImode vector back to a vector in orignal mode. */
14240 op0 = gen_reg_rtx (mode);
14241 emit_move_insn (op0, gen_lowpart (mode, op1));
14242
14243 /* Load even elements into the second position. */
14244 emit_insn (gen_load_even (op0,
14245 force_reg (inner_mode,
14246 ops [i + i + 1]),
14247 const1_rtx));
14248
14249 /* Cast vector to FIRST_IMODE vector. */
14250 ops[i] = gen_reg_rtx (first_imode);
14251 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14252 }
14253
14254 /* Interleave low FIRST_IMODE vectors. */
14255 for (i = j = 0; i < n; i += 2, j++)
14256 {
14257 op0 = gen_reg_rtx (first_imode);
14258 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14259
14260 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14261 ops[j] = gen_reg_rtx (second_imode);
14262 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14263 }
14264
14265 /* Interleave low SECOND_IMODE vectors. */
14266 switch (second_imode)
14267 {
14268 case E_V4SImode:
14269 for (i = j = 0; i < n / 2; i += 2, j++)
14270 {
14271 op0 = gen_reg_rtx (second_imode);
14272 emit_insn (gen_interleave_second_low (op0, ops[i],
14273 ops[i + 1]));
14274
14275 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14276 vector. */
14277 ops[j] = gen_reg_rtx (third_imode);
14278 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14279 }
14280 second_imode = V2DImode;
14281 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14282 /* FALLTHRU */
14283
14284 case E_V2DImode:
14285 op0 = gen_reg_rtx (second_imode);
14286 emit_insn (gen_interleave_second_low (op0, ops[0],
14287 ops[1]));
14288
14289 /* Cast the SECOND_IMODE vector back to a vector on original
14290 mode. */
14291 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14292 break;
14293
14294 default:
14295 gcc_unreachable ();
14296 }
14297 }
14298
14299 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14300 all values variable, and none identical. */
14301
14302 static void
14303 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14304 rtx target, rtx vals)
14305 {
14306 rtx ops[64], op0, op1, op2, op3, op4, op5;
14307 machine_mode half_mode = VOIDmode;
14308 machine_mode quarter_mode = VOIDmode;
14309 int n, i;
14310
14311 switch (mode)
14312 {
14313 case E_V2SFmode:
14314 case E_V2SImode:
14315 if (!mmx_ok && !TARGET_SSE)
14316 break;
14317 /* FALLTHRU */
14318
14319 case E_V16SImode:
14320 case E_V16SFmode:
14321 case E_V8DFmode:
14322 case E_V8DImode:
14323 case E_V8SFmode:
14324 case E_V8SImode:
14325 case E_V4DFmode:
14326 case E_V4DImode:
14327 case E_V4SFmode:
14328 case E_V4SImode:
14329 case E_V2DFmode:
14330 case E_V2DImode:
14331 n = GET_MODE_NUNITS (mode);
14332 for (i = 0; i < n; i++)
14333 ops[i] = XVECEXP (vals, 0, i);
14334 ix86_expand_vector_init_concat (mode, target, ops, n);
14335 return;
14336
14337 case E_V2TImode:
14338 for (i = 0; i < 2; i++)
14339 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14340 op0 = gen_reg_rtx (V4DImode);
14341 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14342 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14343 return;
14344
14345 case E_V4TImode:
14346 for (i = 0; i < 4; i++)
14347 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14348 ops[4] = gen_reg_rtx (V4DImode);
14349 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14350 ops[5] = gen_reg_rtx (V4DImode);
14351 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14352 op0 = gen_reg_rtx (V8DImode);
14353 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14354 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14355 return;
14356
14357 case E_V32QImode:
14358 half_mode = V16QImode;
14359 goto half;
14360
14361 case E_V16HImode:
14362 half_mode = V8HImode;
14363 goto half;
14364
14365 half:
14366 n = GET_MODE_NUNITS (mode);
14367 for (i = 0; i < n; i++)
14368 ops[i] = XVECEXP (vals, 0, i);
14369 op0 = gen_reg_rtx (half_mode);
14370 op1 = gen_reg_rtx (half_mode);
14371 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14372 n >> 2);
14373 ix86_expand_vector_init_interleave (half_mode, op1,
14374 &ops [n >> 1], n >> 2);
14375 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14376 return;
14377
14378 case E_V64QImode:
14379 quarter_mode = V16QImode;
14380 half_mode = V32QImode;
14381 goto quarter;
14382
14383 case E_V32HImode:
14384 quarter_mode = V8HImode;
14385 half_mode = V16HImode;
14386 goto quarter;
14387
14388 quarter:
14389 n = GET_MODE_NUNITS (mode);
14390 for (i = 0; i < n; i++)
14391 ops[i] = XVECEXP (vals, 0, i);
14392 op0 = gen_reg_rtx (quarter_mode);
14393 op1 = gen_reg_rtx (quarter_mode);
14394 op2 = gen_reg_rtx (quarter_mode);
14395 op3 = gen_reg_rtx (quarter_mode);
14396 op4 = gen_reg_rtx (half_mode);
14397 op5 = gen_reg_rtx (half_mode);
14398 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14399 n >> 3);
14400 ix86_expand_vector_init_interleave (quarter_mode, op1,
14401 &ops [n >> 2], n >> 3);
14402 ix86_expand_vector_init_interleave (quarter_mode, op2,
14403 &ops [n >> 1], n >> 3);
14404 ix86_expand_vector_init_interleave (quarter_mode, op3,
14405 &ops [(n >> 1) | (n >> 2)], n >> 3);
14406 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14407 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14408 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14409 return;
14410
14411 case E_V16QImode:
14412 if (!TARGET_SSE4_1)
14413 break;
14414 /* FALLTHRU */
14415
14416 case E_V8HImode:
14417 if (!TARGET_SSE2)
14418 break;
14419
14420 /* Don't use ix86_expand_vector_init_interleave if we can't
14421 move from GPR to SSE register directly. */
14422 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14423 break;
14424
14425 n = GET_MODE_NUNITS (mode);
14426 for (i = 0; i < n; i++)
14427 ops[i] = XVECEXP (vals, 0, i);
14428 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14429 return;
14430
14431 case E_V4HImode:
14432 case E_V8QImode:
14433 break;
14434
14435 default:
14436 gcc_unreachable ();
14437 }
14438
14439 {
14440 int i, j, n_elts, n_words, n_elt_per_word;
14441 machine_mode inner_mode;
14442 rtx words[4], shift;
14443
14444 inner_mode = GET_MODE_INNER (mode);
14445 n_elts = GET_MODE_NUNITS (mode);
14446 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14447 n_elt_per_word = n_elts / n_words;
14448 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14449
14450 for (i = 0; i < n_words; ++i)
14451 {
14452 rtx word = NULL_RTX;
14453
14454 for (j = 0; j < n_elt_per_word; ++j)
14455 {
14456 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14457 elt = convert_modes (word_mode, inner_mode, elt, true);
14458
14459 if (j == 0)
14460 word = elt;
14461 else
14462 {
14463 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14464 word, 1, OPTAB_LIB_WIDEN);
14465 word = expand_simple_binop (word_mode, IOR, word, elt,
14466 word, 1, OPTAB_LIB_WIDEN);
14467 }
14468 }
14469
14470 words[i] = word;
14471 }
14472
14473 if (n_words == 1)
14474 emit_move_insn (target, gen_lowpart (mode, words[0]));
14475 else if (n_words == 2)
14476 {
14477 rtx tmp = gen_reg_rtx (mode);
14478 emit_clobber (tmp);
14479 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14480 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14481 emit_move_insn (target, tmp);
14482 }
14483 else if (n_words == 4)
14484 {
14485 rtx tmp = gen_reg_rtx (V4SImode);
14486 gcc_assert (word_mode == SImode);
14487 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14488 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14489 emit_move_insn (target, gen_lowpart (mode, tmp));
14490 }
14491 else
14492 gcc_unreachable ();
14493 }
14494 }
14495
14496 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14497 instructions unless MMX_OK is true. */
14498
14499 void
14500 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14501 {
14502 machine_mode mode = GET_MODE (target);
14503 machine_mode inner_mode = GET_MODE_INNER (mode);
14504 int n_elts = GET_MODE_NUNITS (mode);
14505 int n_var = 0, one_var = -1;
14506 bool all_same = true, all_const_zero = true;
14507 int i;
14508 rtx x;
14509
14510 /* Handle first initialization from vector elts. */
14511 if (n_elts != XVECLEN (vals, 0))
14512 {
14513 rtx subtarget = target;
14514 x = XVECEXP (vals, 0, 0);
14515 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14516 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14517 {
14518 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14519 if (inner_mode == QImode || inner_mode == HImode)
14520 {
14521 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14522 mode = mode_for_vector (SImode, n_bits / 4).require ();
14523 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14524 ops[0] = gen_lowpart (inner_mode, ops[0]);
14525 ops[1] = gen_lowpart (inner_mode, ops[1]);
14526 subtarget = gen_reg_rtx (mode);
14527 }
14528 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14529 if (subtarget != target)
14530 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14531 return;
14532 }
14533 gcc_unreachable ();
14534 }
14535
14536 for (i = 0; i < n_elts; ++i)
14537 {
14538 x = XVECEXP (vals, 0, i);
14539 if (!(CONST_SCALAR_INT_P (x)
14540 || CONST_DOUBLE_P (x)
14541 || CONST_FIXED_P (x)))
14542 n_var++, one_var = i;
14543 else if (x != CONST0_RTX (inner_mode))
14544 all_const_zero = false;
14545 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14546 all_same = false;
14547 }
14548
14549 /* Constants are best loaded from the constant pool. */
14550 if (n_var == 0)
14551 {
14552 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14553 return;
14554 }
14555
14556 /* If all values are identical, broadcast the value. */
14557 if (all_same
14558 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14559 XVECEXP (vals, 0, 0)))
14560 return;
14561
14562 /* Values where only one field is non-constant are best loaded from
14563 the pool and overwritten via move later. */
14564 if (n_var == 1)
14565 {
14566 if (all_const_zero
14567 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14568 XVECEXP (vals, 0, one_var),
14569 one_var))
14570 return;
14571
14572 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14573 return;
14574 }
14575
14576 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14577 }
14578
14579 /* Implemented as
14580 V setg (V v, int idx, T val)
14581 {
14582 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14583 V valv = (V){val, val, val, val, val, val, val, val};
14584 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14585 v = (v & ~mask) | (valv & mask);
14586 return v;
14587 }. */
14588 void
14589 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
14590 {
14591 rtx vec[64];
14592 machine_mode mode = GET_MODE (target);
14593 machine_mode cmp_mode = mode;
14594 int n_elts = GET_MODE_NUNITS (mode);
14595 rtx valv,idxv,constv,idx_tmp;
14596 bool ok = false;
14597
14598 /* 512-bits vector byte/word broadcast and comparison only available
14599 under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
14600 when without TARGET_AVX512BW. */
14601 if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
14602 {
14603 gcc_assert (TARGET_AVX512F);
14604 rtx vhi, vlo, idx_hi;
14605 machine_mode half_mode;
14606 rtx (*extract_hi)(rtx, rtx);
14607 rtx (*extract_lo)(rtx, rtx);
14608
14609 if (mode == V32HImode)
14610 {
14611 half_mode = V16HImode;
14612 extract_hi = gen_vec_extract_hi_v32hi;
14613 extract_lo = gen_vec_extract_lo_v32hi;
14614 }
14615 else
14616 {
14617 half_mode = V32QImode;
14618 extract_hi = gen_vec_extract_hi_v64qi;
14619 extract_lo = gen_vec_extract_lo_v64qi;
14620 }
14621
14622 vhi = gen_reg_rtx (half_mode);
14623 vlo = gen_reg_rtx (half_mode);
14624 idx_hi = gen_reg_rtx (GET_MODE (idx));
14625 emit_insn (extract_hi (vhi, target));
14626 emit_insn (extract_lo (vlo, target));
14627 vec[0] = idx_hi;
14628 vec[1] = idx;
14629 vec[2] = GEN_INT (n_elts/2);
14630 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
14631 ix86_expand_vector_set_var (vhi, val, idx_hi);
14632 ix86_expand_vector_set_var (vlo, val, idx);
14633 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
14634 return;
14635 }
14636
14637 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
14638 {
14639 switch (mode)
14640 {
14641 case E_V2DFmode:
14642 cmp_mode = V2DImode;
14643 break;
14644 case E_V4DFmode:
14645 cmp_mode = V4DImode;
14646 break;
14647 case E_V8DFmode:
14648 cmp_mode = V8DImode;
14649 break;
14650 case E_V4SFmode:
14651 cmp_mode = V4SImode;
14652 break;
14653 case E_V8SFmode:
14654 cmp_mode = V8SImode;
14655 break;
14656 case E_V16SFmode:
14657 cmp_mode = V16SImode;
14658 break;
14659 default:
14660 gcc_unreachable ();
14661 }
14662 }
14663
14664 for (int i = 0; i != n_elts; i++)
14665 vec[i] = GEN_INT (i);
14666 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
14667 valv = gen_reg_rtx (mode);
14668 idxv = gen_reg_rtx (cmp_mode);
14669 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
14670
14671 ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
14672 gcc_assert (ok);
14673 ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
14674 gcc_assert (ok);
14675 vec[0] = target;
14676 vec[1] = valv;
14677 vec[2] = target;
14678 vec[3] = gen_rtx_EQ (mode, idxv, constv);
14679 vec[4] = idxv;
14680 vec[5] = constv;
14681 ok = ix86_expand_int_vcond (vec);
14682 gcc_assert (ok);
14683 }
14684
14685 void
14686 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14687 {
14688 machine_mode mode = GET_MODE (target);
14689 machine_mode inner_mode = GET_MODE_INNER (mode);
14690 machine_mode half_mode;
14691 bool use_vec_merge = false;
14692 rtx tmp;
14693 static rtx (*gen_extract[6][2]) (rtx, rtx)
14694 = {
14695 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14696 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14697 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14698 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14699 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14700 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14701 };
14702 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14703 = {
14704 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14705 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14706 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14707 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14708 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14709 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14710 };
14711 int i, j, n;
14712 machine_mode mmode = VOIDmode;
14713 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14714
14715 switch (mode)
14716 {
14717 case E_V2SImode:
14718 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14719 if (use_vec_merge)
14720 break;
14721 /* FALLTHRU */
14722
14723 case E_V2SFmode:
14724 if (mmx_ok)
14725 {
14726 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14727 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14728 if (elt == 0)
14729 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14730 else
14731 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14732 emit_insn (gen_rtx_SET (target, tmp));
14733 return;
14734 }
14735 break;
14736
14737 case E_V2DImode:
14738 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14739 if (use_vec_merge)
14740 break;
14741
14742 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14743 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14744 if (elt == 0)
14745 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14746 else
14747 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14748 emit_insn (gen_rtx_SET (target, tmp));
14749 return;
14750
14751 case E_V2DFmode:
14752 /* NB: For ELT == 0, use standard scalar operation patterns which
14753 preserve the rest of the vector for combiner:
14754
14755 (vec_merge:V2DF
14756 (vec_duplicate:V2DF (reg:DF))
14757 (reg:V2DF)
14758 (const_int 1))
14759 */
14760 if (elt == 0)
14761 goto do_vec_merge;
14762
14763 {
14764 rtx op0, op1;
14765
14766 /* For the two element vectors, we implement a VEC_CONCAT with
14767 the extraction of the other element. */
14768
14769 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14770 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14771
14772 if (elt == 0)
14773 op0 = val, op1 = tmp;
14774 else
14775 op0 = tmp, op1 = val;
14776
14777 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14778 emit_insn (gen_rtx_SET (target, tmp));
14779 }
14780 return;
14781
14782 case E_V4SFmode:
14783 use_vec_merge = TARGET_SSE4_1;
14784 if (use_vec_merge)
14785 break;
14786
14787 switch (elt)
14788 {
14789 case 0:
14790 use_vec_merge = true;
14791 break;
14792
14793 case 1:
14794 /* tmp = target = A B C D */
14795 tmp = copy_to_reg (target);
14796 /* target = A A B B */
14797 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14798 /* target = X A B B */
14799 ix86_expand_vector_set (false, target, val, 0);
14800 /* target = A X C D */
14801 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14802 const1_rtx, const0_rtx,
14803 GEN_INT (2+4), GEN_INT (3+4)));
14804 return;
14805
14806 case 2:
14807 /* tmp = target = A B C D */
14808 tmp = copy_to_reg (target);
14809 /* tmp = X B C D */
14810 ix86_expand_vector_set (false, tmp, val, 0);
14811 /* target = A B X D */
14812 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14813 const0_rtx, const1_rtx,
14814 GEN_INT (0+4), GEN_INT (3+4)));
14815 return;
14816
14817 case 3:
14818 /* tmp = target = A B C D */
14819 tmp = copy_to_reg (target);
14820 /* tmp = X B C D */
14821 ix86_expand_vector_set (false, tmp, val, 0);
14822 /* target = A B X D */
14823 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14824 const0_rtx, const1_rtx,
14825 GEN_INT (2+4), GEN_INT (0+4)));
14826 return;
14827
14828 default:
14829 gcc_unreachable ();
14830 }
14831 break;
14832
14833 case E_V4SImode:
14834 use_vec_merge = TARGET_SSE4_1;
14835 if (use_vec_merge)
14836 break;
14837
14838 /* Element 0 handled by vec_merge below. */
14839 if (elt == 0)
14840 {
14841 use_vec_merge = true;
14842 break;
14843 }
14844
14845 if (TARGET_SSE2)
14846 {
14847 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14848 store into element 0, then shuffle them back. */
14849
14850 rtx order[4];
14851
14852 order[0] = GEN_INT (elt);
14853 order[1] = const1_rtx;
14854 order[2] = const2_rtx;
14855 order[3] = GEN_INT (3);
14856 order[elt] = const0_rtx;
14857
14858 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14859 order[1], order[2], order[3]));
14860
14861 ix86_expand_vector_set (false, target, val, 0);
14862
14863 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14864 order[1], order[2], order[3]));
14865 }
14866 else
14867 {
14868 /* For SSE1, we have to reuse the V4SF code. */
14869 rtx t = gen_reg_rtx (V4SFmode);
14870 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14871 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14872 emit_move_insn (target, gen_lowpart (mode, t));
14873 }
14874 return;
14875
14876 case E_V8HImode:
14877 use_vec_merge = TARGET_SSE2;
14878 break;
14879 case E_V4HImode:
14880 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14881 break;
14882
14883 case E_V16QImode:
14884 use_vec_merge = TARGET_SSE4_1;
14885 break;
14886
14887 case E_V8QImode:
14888 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14889 break;
14890
14891 case E_V32QImode:
14892 half_mode = V16QImode;
14893 j = 0;
14894 n = 16;
14895 goto half;
14896
14897 case E_V16HImode:
14898 half_mode = V8HImode;
14899 j = 1;
14900 n = 8;
14901 goto half;
14902
14903 case E_V8SImode:
14904 half_mode = V4SImode;
14905 j = 2;
14906 n = 4;
14907 goto half;
14908
14909 case E_V4DImode:
14910 half_mode = V2DImode;
14911 j = 3;
14912 n = 2;
14913 goto half;
14914
14915 case E_V8SFmode:
14916 half_mode = V4SFmode;
14917 j = 4;
14918 n = 4;
14919 goto half;
14920
14921 case E_V4DFmode:
14922 half_mode = V2DFmode;
14923 j = 5;
14924 n = 2;
14925 goto half;
14926
14927 half:
14928 /* Compute offset. */
14929 i = elt / n;
14930 elt %= n;
14931
14932 gcc_assert (i <= 1);
14933
14934 /* Extract the half. */
14935 tmp = gen_reg_rtx (half_mode);
14936 emit_insn (gen_extract[j][i] (tmp, target));
14937
14938 /* Put val in tmp at elt. */
14939 ix86_expand_vector_set (false, tmp, val, elt);
14940
14941 /* Put it back. */
14942 emit_insn (gen_insert[j][i] (target, target, tmp));
14943 return;
14944
14945 case E_V8DFmode:
14946 if (TARGET_AVX512F)
14947 {
14948 mmode = QImode;
14949 gen_blendm = gen_avx512f_blendmv8df;
14950 }
14951 break;
14952
14953 case E_V8DImode:
14954 if (TARGET_AVX512F)
14955 {
14956 mmode = QImode;
14957 gen_blendm = gen_avx512f_blendmv8di;
14958 }
14959 break;
14960
14961 case E_V16SFmode:
14962 if (TARGET_AVX512F)
14963 {
14964 mmode = HImode;
14965 gen_blendm = gen_avx512f_blendmv16sf;
14966 }
14967 break;
14968
14969 case E_V16SImode:
14970 if (TARGET_AVX512F)
14971 {
14972 mmode = HImode;
14973 gen_blendm = gen_avx512f_blendmv16si;
14974 }
14975 break;
14976
14977 case E_V32HImode:
14978 if (TARGET_AVX512BW)
14979 {
14980 mmode = SImode;
14981 gen_blendm = gen_avx512bw_blendmv32hi;
14982 }
14983 else if (TARGET_AVX512F)
14984 {
14985 half_mode = E_V8HImode;
14986 n = 8;
14987 goto quarter;
14988 }
14989 break;
14990
14991 case E_V64QImode:
14992 if (TARGET_AVX512BW)
14993 {
14994 mmode = DImode;
14995 gen_blendm = gen_avx512bw_blendmv64qi;
14996 }
14997 else if (TARGET_AVX512F)
14998 {
14999 half_mode = E_V16QImode;
15000 n = 16;
15001 goto quarter;
15002 }
15003 break;
15004
15005 quarter:
15006 /* Compute offset. */
15007 i = elt / n;
15008 elt %= n;
15009
15010 gcc_assert (i <= 3);
15011
15012 {
15013 /* Extract the quarter. */
15014 tmp = gen_reg_rtx (V4SImode);
15015 rtx tmp2 = gen_lowpart (V16SImode, target);
15016 rtx mask = gen_reg_rtx (QImode);
15017
15018 emit_move_insn (mask, constm1_rtx);
15019 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
15020 tmp, mask));
15021
15022 tmp2 = gen_reg_rtx (half_mode);
15023 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
15024 tmp = tmp2;
15025
15026 /* Put val in tmp at elt. */
15027 ix86_expand_vector_set (false, tmp, val, elt);
15028
15029 /* Put it back. */
15030 tmp2 = gen_reg_rtx (V16SImode);
15031 rtx tmp3 = gen_lowpart (V16SImode, target);
15032 mask = gen_reg_rtx (HImode);
15033 emit_move_insn (mask, constm1_rtx);
15034 tmp = gen_lowpart (V4SImode, tmp);
15035 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
15036 tmp3, mask));
15037 emit_move_insn (target, gen_lowpart (mode, tmp2));
15038 }
15039 return;
15040
15041 default:
15042 break;
15043 }
15044
15045 if (mmode != VOIDmode)
15046 {
15047 tmp = gen_reg_rtx (mode);
15048 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
15049 /* The avx512*_blendm<mode> expanders have different operand order
15050 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15051 elements where the mask is set and second input operand otherwise,
15052 in {sse,avx}*_*blend* the first input operand is used for elements
15053 where the mask is clear and second input operand otherwise. */
15054 emit_insn (gen_blendm (target, target, tmp,
15055 force_reg (mmode,
15056 gen_int_mode (HOST_WIDE_INT_1U << elt,
15057 mmode))));
15058 }
15059 else if (use_vec_merge)
15060 {
15061 do_vec_merge:
15062 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15063 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15064 GEN_INT (HOST_WIDE_INT_1U << elt));
15065 emit_insn (gen_rtx_SET (target, tmp));
15066 }
15067 else
15068 {
15069 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15070
15071 emit_move_insn (mem, target);
15072
15073 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15074 emit_move_insn (tmp, val);
15075
15076 emit_move_insn (target, mem);
15077 }
15078 }
15079
15080 void
15081 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15082 {
15083 machine_mode mode = GET_MODE (vec);
15084 machine_mode inner_mode = GET_MODE_INNER (mode);
15085 bool use_vec_extr = false;
15086 rtx tmp;
15087
15088 switch (mode)
15089 {
15090 case E_V2SImode:
15091 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15092 if (use_vec_extr)
15093 break;
15094 /* FALLTHRU */
15095
15096 case E_V2SFmode:
15097 if (!mmx_ok)
15098 break;
15099 /* FALLTHRU */
15100
15101 case E_V2DFmode:
15102 case E_V2DImode:
15103 case E_V2TImode:
15104 case E_V4TImode:
15105 use_vec_extr = true;
15106 break;
15107
15108 case E_V4SFmode:
15109 use_vec_extr = TARGET_SSE4_1;
15110 if (use_vec_extr)
15111 break;
15112
15113 switch (elt)
15114 {
15115 case 0:
15116 tmp = vec;
15117 break;
15118
15119 case 1:
15120 case 3:
15121 tmp = gen_reg_rtx (mode);
15122 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15123 GEN_INT (elt), GEN_INT (elt),
15124 GEN_INT (elt+4), GEN_INT (elt+4)));
15125 break;
15126
15127 case 2:
15128 tmp = gen_reg_rtx (mode);
15129 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15130 break;
15131
15132 default:
15133 gcc_unreachable ();
15134 }
15135 vec = tmp;
15136 use_vec_extr = true;
15137 elt = 0;
15138 break;
15139
15140 case E_V4SImode:
15141 use_vec_extr = TARGET_SSE4_1;
15142 if (use_vec_extr)
15143 break;
15144
15145 if (TARGET_SSE2)
15146 {
15147 switch (elt)
15148 {
15149 case 0:
15150 tmp = vec;
15151 break;
15152
15153 case 1:
15154 case 3:
15155 tmp = gen_reg_rtx (mode);
15156 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15157 GEN_INT (elt), GEN_INT (elt),
15158 GEN_INT (elt), GEN_INT (elt)));
15159 break;
15160
15161 case 2:
15162 tmp = gen_reg_rtx (mode);
15163 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15164 break;
15165
15166 default:
15167 gcc_unreachable ();
15168 }
15169 vec = tmp;
15170 use_vec_extr = true;
15171 elt = 0;
15172 }
15173 else
15174 {
15175 /* For SSE1, we have to reuse the V4SF code. */
15176 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15177 gen_lowpart (V4SFmode, vec), elt);
15178 return;
15179 }
15180 break;
15181
15182 case E_V8HImode:
15183 use_vec_extr = TARGET_SSE2;
15184 break;
15185 case E_V4HImode:
15186 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15187 break;
15188
15189 case E_V16QImode:
15190 use_vec_extr = TARGET_SSE4_1;
15191 if (!use_vec_extr
15192 && TARGET_SSE2
15193 && elt == 0
15194 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15195 {
15196 tmp = gen_reg_rtx (SImode);
15197 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15198 0);
15199 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15200 return;
15201 }
15202 break;
15203
15204 case E_V8SFmode:
15205 if (TARGET_AVX)
15206 {
15207 tmp = gen_reg_rtx (V4SFmode);
15208 if (elt < 4)
15209 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15210 else
15211 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15212 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15213 return;
15214 }
15215 break;
15216
15217 case E_V4DFmode:
15218 if (TARGET_AVX)
15219 {
15220 tmp = gen_reg_rtx (V2DFmode);
15221 if (elt < 2)
15222 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15223 else
15224 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15225 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15226 return;
15227 }
15228 break;
15229
15230 case E_V32QImode:
15231 if (TARGET_AVX)
15232 {
15233 tmp = gen_reg_rtx (V16QImode);
15234 if (elt < 16)
15235 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15236 else
15237 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15238 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15239 return;
15240 }
15241 break;
15242
15243 case E_V16HImode:
15244 if (TARGET_AVX)
15245 {
15246 tmp = gen_reg_rtx (V8HImode);
15247 if (elt < 8)
15248 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15249 else
15250 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15251 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15252 return;
15253 }
15254 break;
15255
15256 case E_V8SImode:
15257 if (TARGET_AVX)
15258 {
15259 tmp = gen_reg_rtx (V4SImode);
15260 if (elt < 4)
15261 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15262 else
15263 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15264 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15265 return;
15266 }
15267 break;
15268
15269 case E_V4DImode:
15270 if (TARGET_AVX)
15271 {
15272 tmp = gen_reg_rtx (V2DImode);
15273 if (elt < 2)
15274 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15275 else
15276 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15277 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15278 return;
15279 }
15280 break;
15281
15282 case E_V32HImode:
15283 if (TARGET_AVX512BW)
15284 {
15285 tmp = gen_reg_rtx (V16HImode);
15286 if (elt < 16)
15287 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15288 else
15289 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15290 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15291 return;
15292 }
15293 break;
15294
15295 case E_V64QImode:
15296 if (TARGET_AVX512BW)
15297 {
15298 tmp = gen_reg_rtx (V32QImode);
15299 if (elt < 32)
15300 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15301 else
15302 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15303 ix86_expand_vector_extract (false, target, tmp, elt & 31);
15304 return;
15305 }
15306 break;
15307
15308 case E_V16SFmode:
15309 tmp = gen_reg_rtx (V8SFmode);
15310 if (elt < 8)
15311 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15312 else
15313 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15314 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15315 return;
15316
15317 case E_V8DFmode:
15318 tmp = gen_reg_rtx (V4DFmode);
15319 if (elt < 4)
15320 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15321 else
15322 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15323 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15324 return;
15325
15326 case E_V16SImode:
15327 tmp = gen_reg_rtx (V8SImode);
15328 if (elt < 8)
15329 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15330 else
15331 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15332 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15333 return;
15334
15335 case E_V8DImode:
15336 tmp = gen_reg_rtx (V4DImode);
15337 if (elt < 4)
15338 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15339 else
15340 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15341 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15342 return;
15343
15344 case E_V8QImode:
15345 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15346 /* ??? Could extract the appropriate HImode element and shift. */
15347 break;
15348
15349 default:
15350 break;
15351 }
15352
15353 if (use_vec_extr)
15354 {
15355 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15356 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15357
15358 /* Let the rtl optimizers know about the zero extension performed. */
15359 if (inner_mode == QImode || inner_mode == HImode)
15360 {
15361 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
15362 target = gen_lowpart (SImode, target);
15363 }
15364
15365 emit_insn (gen_rtx_SET (target, tmp));
15366 }
15367 else
15368 {
15369 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15370
15371 emit_move_insn (mem, vec);
15372
15373 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
15374 emit_move_insn (target, tmp);
15375 }
15376 }
15377
15378 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15379 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15380 The upper bits of DEST are undefined, though they shouldn't cause
15381 exceptions (some bits from src or all zeros are ok). */
15382
15383 static void
15384 emit_reduc_half (rtx dest, rtx src, int i)
15385 {
15386 rtx tem, d = dest;
15387 switch (GET_MODE (src))
15388 {
15389 case E_V4SFmode:
15390 if (i == 128)
15391 tem = gen_sse_movhlps (dest, src, src);
15392 else
15393 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
15394 GEN_INT (1 + 4), GEN_INT (1 + 4));
15395 break;
15396 case E_V2DFmode:
15397 tem = gen_vec_interleave_highv2df (dest, src, src);
15398 break;
15399 case E_V16QImode:
15400 case E_V8HImode:
15401 case E_V4SImode:
15402 case E_V2DImode:
15403 d = gen_reg_rtx (V1TImode);
15404 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
15405 GEN_INT (i / 2));
15406 break;
15407 case E_V8SFmode:
15408 if (i == 256)
15409 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
15410 else
15411 tem = gen_avx_shufps256 (dest, src, src,
15412 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
15413 break;
15414 case E_V4DFmode:
15415 if (i == 256)
15416 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
15417 else
15418 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
15419 break;
15420 case E_V32QImode:
15421 case E_V16HImode:
15422 case E_V8SImode:
15423 case E_V4DImode:
15424 if (i == 256)
15425 {
15426 if (GET_MODE (dest) != V4DImode)
15427 d = gen_reg_rtx (V4DImode);
15428 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
15429 gen_lowpart (V4DImode, src),
15430 const1_rtx);
15431 }
15432 else
15433 {
15434 d = gen_reg_rtx (V2TImode);
15435 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
15436 GEN_INT (i / 2));
15437 }
15438 break;
15439 case E_V64QImode:
15440 case E_V32HImode:
15441 if (i < 64)
15442 {
15443 d = gen_reg_rtx (V4TImode);
15444 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
15445 GEN_INT (i / 2));
15446 break;
15447 }
15448 /* FALLTHRU */
15449 case E_V16SImode:
15450 case E_V16SFmode:
15451 case E_V8DImode:
15452 case E_V8DFmode:
15453 if (i > 128)
15454 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
15455 gen_lowpart (V16SImode, src),
15456 gen_lowpart (V16SImode, src),
15457 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
15458 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
15459 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
15460 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
15461 GEN_INT (0xC), GEN_INT (0xD),
15462 GEN_INT (0xE), GEN_INT (0xF),
15463 GEN_INT (0x10), GEN_INT (0x11),
15464 GEN_INT (0x12), GEN_INT (0x13),
15465 GEN_INT (0x14), GEN_INT (0x15),
15466 GEN_INT (0x16), GEN_INT (0x17));
15467 else
15468 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15469 gen_lowpart (V16SImode, src),
15470 GEN_INT (i == 128 ? 0x2 : 0x1),
15471 GEN_INT (0x3),
15472 GEN_INT (0x3),
15473 GEN_INT (0x3),
15474 GEN_INT (i == 128 ? 0x6 : 0x5),
15475 GEN_INT (0x7),
15476 GEN_INT (0x7),
15477 GEN_INT (0x7),
15478 GEN_INT (i == 128 ? 0xA : 0x9),
15479 GEN_INT (0xB),
15480 GEN_INT (0xB),
15481 GEN_INT (0xB),
15482 GEN_INT (i == 128 ? 0xE : 0xD),
15483 GEN_INT (0xF),
15484 GEN_INT (0xF),
15485 GEN_INT (0xF));
15486 break;
15487 default:
15488 gcc_unreachable ();
15489 }
15490 emit_insn (tem);
15491 if (d != dest)
15492 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15493 }
15494
15495 /* Expand a vector reduction. FN is the binary pattern to reduce;
15496 DEST is the destination; IN is the input vector. */
15497
15498 void
15499 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15500 {
15501 rtx half, dst, vec = in;
15502 machine_mode mode = GET_MODE (in);
15503 int i;
15504
15505 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15506 if (TARGET_SSE4_1
15507 && mode == V8HImode
15508 && fn == gen_uminv8hi3)
15509 {
15510 emit_insn (gen_sse4_1_phminposuw (dest, in));
15511 return;
15512 }
15513
15514 for (i = GET_MODE_BITSIZE (mode);
15515 i > GET_MODE_UNIT_BITSIZE (mode);
15516 i >>= 1)
15517 {
15518 half = gen_reg_rtx (mode);
15519 emit_reduc_half (half, vec, i);
15520 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15521 dst = dest;
15522 else
15523 dst = gen_reg_rtx (mode);
15524 emit_insn (fn (dst, half, vec));
15525 vec = dst;
15526 }
15527 }
15528
15529 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15530 FP status register is set. */
15531
15532 void
15533 ix86_emit_fp_unordered_jump (rtx label)
15534 {
15535 rtx reg = gen_reg_rtx (HImode);
15536 rtx_insn *insn;
15537 rtx temp;
15538
15539 emit_insn (gen_x86_fnstsw_1 (reg));
15540
15541 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15542 {
15543 emit_insn (gen_x86_sahf_1 (reg));
15544
15545 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15546 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15547 }
15548 else
15549 {
15550 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15551
15552 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15553 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15554 }
15555
15556 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15557 gen_rtx_LABEL_REF (VOIDmode, label),
15558 pc_rtx);
15559 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15560 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15561 JUMP_LABEL (insn) = label;
15562 }
15563
15564 /* Output code to perform an sinh XFmode calculation. */
15565
15566 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15567 {
15568 rtx e1 = gen_reg_rtx (XFmode);
15569 rtx e2 = gen_reg_rtx (XFmode);
15570 rtx scratch = gen_reg_rtx (HImode);
15571 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15572 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15573 rtx cst1, tmp;
15574 rtx_code_label *jump_label = gen_label_rtx ();
15575 rtx_insn *insn;
15576
15577 /* scratch = fxam (op1) */
15578 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15579
15580 /* e1 = expm1 (|op1|) */
15581 emit_insn (gen_absxf2 (e2, op1));
15582 emit_insn (gen_expm1xf2 (e1, e2));
15583
15584 /* e2 = e1 / (e1 + 1.0) + e1 */
15585 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15586 emit_insn (gen_addxf3 (e2, e1, cst1));
15587 emit_insn (gen_divxf3 (e2, e1, e2));
15588 emit_insn (gen_addxf3 (e2, e2, e1));
15589
15590 /* flags = signbit (op1) */
15591 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15592
15593 /* if (flags) then e2 = -e2 */
15594 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15595 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15596 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15597 pc_rtx);
15598 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15599 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15600 JUMP_LABEL (insn) = jump_label;
15601
15602 emit_insn (gen_negxf2 (e2, e2));
15603
15604 emit_label (jump_label);
15605 LABEL_NUSES (jump_label) = 1;
15606
15607 /* op0 = 0.5 * e2 */
15608 half = force_reg (XFmode, half);
15609 emit_insn (gen_mulxf3 (op0, e2, half));
15610 }
15611
15612 /* Output code to perform an cosh XFmode calculation. */
15613
15614 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15615 {
15616 rtx e1 = gen_reg_rtx (XFmode);
15617 rtx e2 = gen_reg_rtx (XFmode);
15618 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15619 rtx cst1;
15620
15621 /* e1 = exp (op1) */
15622 emit_insn (gen_expxf2 (e1, op1));
15623
15624 /* e2 = e1 + 1.0 / e1 */
15625 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15626 emit_insn (gen_divxf3 (e2, cst1, e1));
15627 emit_insn (gen_addxf3 (e2, e1, e2));
15628
15629 /* op0 = 0.5 * e2 */
15630 half = force_reg (XFmode, half);
15631 emit_insn (gen_mulxf3 (op0, e2, half));
15632 }
15633
15634 /* Output code to perform an tanh XFmode calculation. */
15635
15636 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15637 {
15638 rtx e1 = gen_reg_rtx (XFmode);
15639 rtx e2 = gen_reg_rtx (XFmode);
15640 rtx scratch = gen_reg_rtx (HImode);
15641 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15642 rtx cst2, tmp;
15643 rtx_code_label *jump_label = gen_label_rtx ();
15644 rtx_insn *insn;
15645
15646 /* scratch = fxam (op1) */
15647 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15648
15649 /* e1 = expm1 (-|2 * op1|) */
15650 emit_insn (gen_addxf3 (e2, op1, op1));
15651 emit_insn (gen_absxf2 (e2, e2));
15652 emit_insn (gen_negxf2 (e2, e2));
15653 emit_insn (gen_expm1xf2 (e1, e2));
15654
15655 /* e2 = e1 / (e1 + 2.0) */
15656 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15657 emit_insn (gen_addxf3 (e2, e1, cst2));
15658 emit_insn (gen_divxf3 (e2, e1, e2));
15659
15660 /* flags = signbit (op1) */
15661 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15662
15663 /* if (!flags) then e2 = -e2 */
15664 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15665 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15666 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15667 pc_rtx);
15668 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15669 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15670 JUMP_LABEL (insn) = jump_label;
15671
15672 emit_insn (gen_negxf2 (e2, e2));
15673
15674 emit_label (jump_label);
15675 LABEL_NUSES (jump_label) = 1;
15676
15677 emit_move_insn (op0, e2);
15678 }
15679
15680 /* Output code to perform an asinh XFmode calculation. */
15681
15682 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15683 {
15684 rtx e1 = gen_reg_rtx (XFmode);
15685 rtx e2 = gen_reg_rtx (XFmode);
15686 rtx scratch = gen_reg_rtx (HImode);
15687 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15688 rtx cst1, tmp;
15689 rtx_code_label *jump_label = gen_label_rtx ();
15690 rtx_insn *insn;
15691
15692 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15693 emit_insn (gen_mulxf3 (e1, op1, op1));
15694 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15695 emit_insn (gen_addxf3 (e2, e1, cst1));
15696 emit_insn (gen_sqrtxf2 (e2, e2));
15697 emit_insn (gen_addxf3 (e2, e2, cst1));
15698
15699 /* e1 = e1 / e2 */
15700 emit_insn (gen_divxf3 (e1, e1, e2));
15701
15702 /* scratch = fxam (op1) */
15703 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15704
15705 /* e1 = e1 + |op1| */
15706 emit_insn (gen_absxf2 (e2, op1));
15707 emit_insn (gen_addxf3 (e1, e1, e2));
15708
15709 /* e2 = log1p (e1) */
15710 ix86_emit_i387_log1p (e2, e1);
15711
15712 /* flags = signbit (op1) */
15713 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15714
15715 /* if (flags) then e2 = -e2 */
15716 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15717 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15718 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15719 pc_rtx);
15720 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15721 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15722 JUMP_LABEL (insn) = jump_label;
15723
15724 emit_insn (gen_negxf2 (e2, e2));
15725
15726 emit_label (jump_label);
15727 LABEL_NUSES (jump_label) = 1;
15728
15729 emit_move_insn (op0, e2);
15730 }
15731
15732 /* Output code to perform an acosh XFmode calculation. */
15733
15734 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15735 {
15736 rtx e1 = gen_reg_rtx (XFmode);
15737 rtx e2 = gen_reg_rtx (XFmode);
15738 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15739
15740 /* e2 = sqrt (op1 + 1.0) */
15741 emit_insn (gen_addxf3 (e2, op1, cst1));
15742 emit_insn (gen_sqrtxf2 (e2, e2));
15743
15744 /* e1 = sqrt (op1 - 1.0) */
15745 emit_insn (gen_subxf3 (e1, op1, cst1));
15746 emit_insn (gen_sqrtxf2 (e1, e1));
15747
15748 /* e1 = e1 * e2 */
15749 emit_insn (gen_mulxf3 (e1, e1, e2));
15750
15751 /* e1 = e1 + op1 */
15752 emit_insn (gen_addxf3 (e1, e1, op1));
15753
15754 /* op0 = log (e1) */
15755 emit_insn (gen_logxf2 (op0, e1));
15756 }
15757
15758 /* Output code to perform an atanh XFmode calculation. */
15759
15760 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15761 {
15762 rtx e1 = gen_reg_rtx (XFmode);
15763 rtx e2 = gen_reg_rtx (XFmode);
15764 rtx scratch = gen_reg_rtx (HImode);
15765 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15766 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15767 rtx cst1, tmp;
15768 rtx_code_label *jump_label = gen_label_rtx ();
15769 rtx_insn *insn;
15770
15771 /* scratch = fxam (op1) */
15772 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15773
15774 /* e2 = |op1| */
15775 emit_insn (gen_absxf2 (e2, op1));
15776
15777 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15778 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15779 emit_insn (gen_addxf3 (e1, e2, cst1));
15780 emit_insn (gen_addxf3 (e2, e2, e2));
15781 emit_insn (gen_negxf2 (e2, e2));
15782 emit_insn (gen_divxf3 (e1, e2, e1));
15783
15784 /* e2 = log1p (e1) */
15785 ix86_emit_i387_log1p (e2, e1);
15786
15787 /* flags = signbit (op1) */
15788 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15789
15790 /* if (!flags) then e2 = -e2 */
15791 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15792 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15793 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15794 pc_rtx);
15795 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15796 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15797 JUMP_LABEL (insn) = jump_label;
15798
15799 emit_insn (gen_negxf2 (e2, e2));
15800
15801 emit_label (jump_label);
15802 LABEL_NUSES (jump_label) = 1;
15803
15804 /* op0 = 0.5 * e2 */
15805 half = force_reg (XFmode, half);
15806 emit_insn (gen_mulxf3 (op0, e2, half));
15807 }
15808
15809 /* Output code to perform a log1p XFmode calculation. */
15810
15811 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15812 {
15813 rtx_code_label *label1 = gen_label_rtx ();
15814 rtx_code_label *label2 = gen_label_rtx ();
15815
15816 rtx tmp = gen_reg_rtx (XFmode);
15817 rtx res = gen_reg_rtx (XFmode);
15818 rtx cst, cstln2, cst1;
15819 rtx_insn *insn;
15820
15821 cst = const_double_from_real_value
15822 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15823 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15824
15825 emit_insn (gen_absxf2 (tmp, op1));
15826
15827 cst = force_reg (XFmode, cst);
15828 ix86_expand_branch (GE, tmp, cst, label1);
15829 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15830 insn = get_last_insn ();
15831 JUMP_LABEL (insn) = label1;
15832
15833 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15834 emit_jump (label2);
15835
15836 emit_label (label1);
15837 LABEL_NUSES (label1) = 1;
15838
15839 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15840 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15841 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15842
15843 emit_label (label2);
15844 LABEL_NUSES (label2) = 1;
15845
15846 emit_move_insn (op0, res);
15847 }
15848
15849 /* Emit code for round calculation. */
15850 void ix86_emit_i387_round (rtx op0, rtx op1)
15851 {
15852 machine_mode inmode = GET_MODE (op1);
15853 machine_mode outmode = GET_MODE (op0);
15854 rtx e1 = gen_reg_rtx (XFmode);
15855 rtx e2 = gen_reg_rtx (XFmode);
15856 rtx scratch = gen_reg_rtx (HImode);
15857 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15858 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15859 rtx res = gen_reg_rtx (outmode);
15860 rtx_code_label *jump_label = gen_label_rtx ();
15861 rtx (*floor_insn) (rtx, rtx);
15862 rtx (*neg_insn) (rtx, rtx);
15863 rtx_insn *insn;
15864 rtx tmp;
15865
15866 switch (inmode)
15867 {
15868 case E_SFmode:
15869 case E_DFmode:
15870 tmp = gen_reg_rtx (XFmode);
15871
15872 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15873 op1 = tmp;
15874 break;
15875 case E_XFmode:
15876 break;
15877 default:
15878 gcc_unreachable ();
15879 }
15880
15881 switch (outmode)
15882 {
15883 case E_SFmode:
15884 floor_insn = gen_frndintxf2_floor;
15885 neg_insn = gen_negsf2;
15886 break;
15887 case E_DFmode:
15888 floor_insn = gen_frndintxf2_floor;
15889 neg_insn = gen_negdf2;
15890 break;
15891 case E_XFmode:
15892 floor_insn = gen_frndintxf2_floor;
15893 neg_insn = gen_negxf2;
15894 break;
15895 case E_HImode:
15896 floor_insn = gen_lfloorxfhi2;
15897 neg_insn = gen_neghi2;
15898 break;
15899 case E_SImode:
15900 floor_insn = gen_lfloorxfsi2;
15901 neg_insn = gen_negsi2;
15902 break;
15903 case E_DImode:
15904 floor_insn = gen_lfloorxfdi2;
15905 neg_insn = gen_negdi2;
15906 break;
15907 default:
15908 gcc_unreachable ();
15909 }
15910
15911 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15912
15913 /* scratch = fxam(op1) */
15914 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15915
15916 /* e1 = fabs(op1) */
15917 emit_insn (gen_absxf2 (e1, op1));
15918
15919 /* e2 = e1 + 0.5 */
15920 half = force_reg (XFmode, half);
15921 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15922
15923 /* res = floor(e2) */
15924 switch (outmode)
15925 {
15926 case E_SFmode:
15927 case E_DFmode:
15928 {
15929 tmp = gen_reg_rtx (XFmode);
15930
15931 emit_insn (floor_insn (tmp, e2));
15932 emit_insn (gen_rtx_SET (res,
15933 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15934 UNSPEC_TRUNC_NOOP)));
15935 }
15936 break;
15937 default:
15938 emit_insn (floor_insn (res, e2));
15939 }
15940
15941 /* flags = signbit(a) */
15942 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15943
15944 /* if (flags) then res = -res */
15945 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15946 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15947 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15948 pc_rtx);
15949 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15950 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15951 JUMP_LABEL (insn) = jump_label;
15952
15953 emit_insn (neg_insn (res, res));
15954
15955 emit_label (jump_label);
15956 LABEL_NUSES (jump_label) = 1;
15957
15958 emit_move_insn (op0, res);
15959 }
15960
15961 /* Output code to perform a Newton-Rhapson approximation of a single precision
15962 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15963
15964 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15965 {
15966 rtx x0, x1, e0, e1;
15967
15968 x0 = gen_reg_rtx (mode);
15969 e0 = gen_reg_rtx (mode);
15970 e1 = gen_reg_rtx (mode);
15971 x1 = gen_reg_rtx (mode);
15972
15973 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15974
15975 b = force_reg (mode, b);
15976
15977 /* x0 = rcp(b) estimate */
15978 if (mode == V16SFmode || mode == V8DFmode)
15979 {
15980 if (TARGET_AVX512ER)
15981 {
15982 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15983 UNSPEC_RCP28)));
15984 /* res = a * x0 */
15985 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15986 return;
15987 }
15988 else
15989 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15990 UNSPEC_RCP14)));
15991 }
15992 else
15993 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15994 UNSPEC_RCP)));
15995
15996 /* e0 = x0 * b */
15997 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15998
15999 /* e0 = x0 * e0 */
16000 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
16001
16002 /* e1 = x0 + x0 */
16003 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
16004
16005 /* x1 = e1 - e0 */
16006 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
16007
16008 /* res = a * x1 */
16009 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
16010 }
16011
16012 /* Output code to perform a Newton-Rhapson approximation of a
16013 single precision floating point [reciprocal] square root. */
16014
16015 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
16016 {
16017 rtx x0, e0, e1, e2, e3, mthree, mhalf;
16018 REAL_VALUE_TYPE r;
16019 int unspec;
16020
16021 x0 = gen_reg_rtx (mode);
16022 e0 = gen_reg_rtx (mode);
16023 e1 = gen_reg_rtx (mode);
16024 e2 = gen_reg_rtx (mode);
16025 e3 = gen_reg_rtx (mode);
16026
16027 if (TARGET_AVX512ER && mode == V16SFmode)
16028 {
16029 if (recip)
16030 /* res = rsqrt28(a) estimate */
16031 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16032 UNSPEC_RSQRT28)));
16033 else
16034 {
16035 /* x0 = rsqrt28(a) estimate */
16036 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16037 UNSPEC_RSQRT28)));
16038 /* res = rcp28(x0) estimate */
16039 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16040 UNSPEC_RCP28)));
16041 }
16042 return;
16043 }
16044
16045 real_from_integer (&r, VOIDmode, -3, SIGNED);
16046 mthree = const_double_from_real_value (r, SFmode);
16047
16048 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16049 mhalf = const_double_from_real_value (r, SFmode);
16050 unspec = UNSPEC_RSQRT;
16051
16052 if (VECTOR_MODE_P (mode))
16053 {
16054 mthree = ix86_build_const_vector (mode, true, mthree);
16055 mhalf = ix86_build_const_vector (mode, true, mhalf);
16056 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16057 if (GET_MODE_SIZE (mode) == 64)
16058 unspec = UNSPEC_RSQRT14;
16059 }
16060
16061 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16062 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16063
16064 a = force_reg (mode, a);
16065
16066 /* x0 = rsqrt(a) estimate */
16067 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16068 unspec)));
16069
16070 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
16071 if (!recip)
16072 {
16073 rtx zero = force_reg (mode, CONST0_RTX(mode));
16074 rtx mask;
16075
16076 /* Handle masked compare. */
16077 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16078 {
16079 mask = gen_reg_rtx (HImode);
16080 /* Imm value 0x4 corresponds to not-equal comparison. */
16081 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16082 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16083 }
16084 else
16085 {
16086 mask = gen_reg_rtx (mode);
16087 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16088 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16089 }
16090 }
16091
16092 mthree = force_reg (mode, mthree);
16093
16094 /* e0 = x0 * a */
16095 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
16096
16097 unsigned vector_size = GET_MODE_SIZE (mode);
16098 if (TARGET_FMA
16099 || (TARGET_AVX512F && vector_size == 64)
16100 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16101 emit_insn (gen_rtx_SET (e2,
16102 gen_rtx_FMA (mode, e0, x0, mthree)));
16103 else
16104 {
16105 /* e1 = e0 * x0 */
16106 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16107
16108 /* e2 = e1 - 3. */
16109 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16110 }
16111
16112 mhalf = force_reg (mode, mhalf);
16113 if (recip)
16114 /* e3 = -.5 * x0 */
16115 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16116 else
16117 /* e3 = -.5 * e0 */
16118 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16119 /* ret = e2 * e3 */
16120 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16121 }
16122
16123 /* Expand fabs (OP0) and return a new rtx that holds the result. The
16124 mask for masking out the sign-bit is stored in *SMASK, if that is
16125 non-null. */
16126
16127 static rtx
16128 ix86_expand_sse_fabs (rtx op0, rtx *smask)
16129 {
16130 machine_mode vmode, mode = GET_MODE (op0);
16131 rtx xa, mask;
16132
16133 xa = gen_reg_rtx (mode);
16134 if (mode == SFmode)
16135 vmode = V4SFmode;
16136 else if (mode == DFmode)
16137 vmode = V2DFmode;
16138 else
16139 vmode = mode;
16140 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16141 if (!VECTOR_MODE_P (mode))
16142 {
16143 /* We need to generate a scalar mode mask in this case. */
16144 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16145 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16146 mask = gen_reg_rtx (mode);
16147 emit_insn (gen_rtx_SET (mask, tmp));
16148 }
16149 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16150
16151 if (smask)
16152 *smask = mask;
16153
16154 return xa;
16155 }
16156
16157 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
16158 swapping the operands if SWAP_OPERANDS is true. The expanded
16159 code is a forward jump to a newly created label in case the
16160 comparison is true. The generated label rtx is returned. */
16161 static rtx_code_label *
16162 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16163 bool swap_operands)
16164 {
16165 bool unordered_compare = ix86_unordered_fp_compare (code);
16166 rtx_code_label *label;
16167 rtx tmp, reg;
16168
16169 if (swap_operands)
16170 std::swap (op0, op1);
16171
16172 label = gen_label_rtx ();
16173 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16174 if (unordered_compare)
16175 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16176 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16177 emit_insn (gen_rtx_SET (reg, tmp));
16178 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16179 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16180 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16181 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16182 JUMP_LABEL (tmp) = label;
16183
16184 return label;
16185 }
16186
16187 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16188 using comparison code CODE. Operands are swapped for the comparison if
16189 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16190 static rtx
16191 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16192 bool swap_operands)
16193 {
16194 rtx (*insn)(rtx, rtx, rtx, rtx);
16195 machine_mode mode = GET_MODE (op0);
16196 rtx mask = gen_reg_rtx (mode);
16197
16198 if (swap_operands)
16199 std::swap (op0, op1);
16200
16201 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16202
16203 emit_insn (insn (mask, op0, op1,
16204 gen_rtx_fmt_ee (code, mode, op0, op1)));
16205 return mask;
16206 }
16207
16208 /* Expand copysign from SIGN to the positive value ABS_VALUE
16209 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16210 the sign-bit. */
16211
16212 static void
16213 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16214 {
16215 machine_mode mode = GET_MODE (sign);
16216 rtx sgn = gen_reg_rtx (mode);
16217 if (mask == NULL_RTX)
16218 {
16219 machine_mode vmode;
16220
16221 if (mode == SFmode)
16222 vmode = V4SFmode;
16223 else if (mode == DFmode)
16224 vmode = V2DFmode;
16225 else
16226 vmode = mode;
16227
16228 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16229 if (!VECTOR_MODE_P (mode))
16230 {
16231 /* We need to generate a scalar mode mask in this case. */
16232 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16233 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16234 mask = gen_reg_rtx (mode);
16235 emit_insn (gen_rtx_SET (mask, tmp));
16236 }
16237 }
16238 else
16239 mask = gen_rtx_NOT (mode, mask);
16240 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16241 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16242 }
16243
16244 /* Expand SSE sequence for computing lround from OP1 storing
16245 into OP0. */
16246
16247 void
16248 ix86_expand_lround (rtx op0, rtx op1)
16249 {
16250 /* C code for the stuff we're doing below:
16251 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16252 return (long)tmp;
16253 */
16254 machine_mode mode = GET_MODE (op1);
16255 const struct real_format *fmt;
16256 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16257 rtx adj;
16258
16259 /* load nextafter (0.5, 0.0) */
16260 fmt = REAL_MODE_FORMAT (mode);
16261 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16262 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16263
16264 /* adj = copysign (0.5, op1) */
16265 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16266 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16267
16268 /* adj = op1 + adj */
16269 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16270
16271 /* op0 = (imode)adj */
16272 expand_fix (op0, adj, 0);
16273 }
16274
16275 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
16276 into OPERAND0. */
16277
16278 void
16279 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16280 {
16281 /* C code for the stuff we're doing below (for do_floor):
16282 xi = (long)op1;
16283 xi -= (double)xi > op1 ? 1 : 0;
16284 return xi;
16285 */
16286 machine_mode fmode = GET_MODE (op1);
16287 machine_mode imode = GET_MODE (op0);
16288 rtx ireg, freg, tmp;
16289 rtx_code_label *label;
16290
16291 /* reg = (long)op1 */
16292 ireg = gen_reg_rtx (imode);
16293 expand_fix (ireg, op1, 0);
16294
16295 /* freg = (double)reg */
16296 freg = gen_reg_rtx (fmode);
16297 expand_float (freg, ireg, 0);
16298
16299 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16300 label = ix86_expand_sse_compare_and_jump (UNLE,
16301 freg, op1, !do_floor);
16302 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16303 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16304 emit_move_insn (ireg, tmp);
16305
16306 emit_label (label);
16307 LABEL_NUSES (label) = 1;
16308
16309 emit_move_insn (op0, ireg);
16310 }
16311
16312 /* Generate and return a rtx of mode MODE for 2**n where n is the number
16313 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16314
16315 static rtx
16316 ix86_gen_TWO52 (machine_mode mode)
16317 {
16318 const struct real_format *fmt;
16319 REAL_VALUE_TYPE TWO52r;
16320 rtx TWO52;
16321
16322 fmt = REAL_MODE_FORMAT (mode);
16323 real_2expN (&TWO52r, fmt->p - 1, mode);
16324 TWO52 = const_double_from_real_value (TWO52r, mode);
16325 TWO52 = force_reg (mode, TWO52);
16326
16327 return TWO52;
16328 }
16329
16330 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16331
16332 void
16333 ix86_expand_rint (rtx operand0, rtx operand1)
16334 {
16335 /* C code for the stuff we're doing below:
16336 xa = fabs (operand1);
16337 if (!isless (xa, 2**52))
16338 return operand1;
16339 two52 = 2**52;
16340 if (flag_rounding_math)
16341 {
16342 two52 = copysign (two52, operand1);
16343 xa = operand1;
16344 }
16345 xa = xa + two52 - two52;
16346 return copysign (xa, operand1);
16347 */
16348 machine_mode mode = GET_MODE (operand0);
16349 rtx res, xa, TWO52, mask;
16350 rtx_code_label *label;
16351
16352 TWO52 = ix86_gen_TWO52 (mode);
16353
16354 /* Temporary for holding the result, initialized to the input
16355 operand to ease control flow. */
16356 res = copy_to_reg (operand1);
16357
16358 /* xa = abs (operand1) */
16359 xa = ix86_expand_sse_fabs (res, &mask);
16360
16361 /* if (!isless (xa, TWO52)) goto label; */
16362 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16363
16364 if (flag_rounding_math)
16365 {
16366 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
16367 xa = res;
16368 }
16369
16370 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16371 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16372
16373 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16374 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16375 xa = ix86_expand_sse_fabs (xa, NULL);
16376
16377 ix86_sse_copysign_to_positive (res, xa, res, mask);
16378
16379 emit_label (label);
16380 LABEL_NUSES (label) = 1;
16381
16382 emit_move_insn (operand0, res);
16383 }
16384
16385 /* Expand SSE2 sequence for computing floor or ceil
16386 from OPERAND1 storing into OPERAND0. */
16387 void
16388 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
16389 {
16390 /* C code for the stuff we expand below.
16391 double xa = fabs (x), x2;
16392 if (!isless (xa, TWO52))
16393 return x;
16394 x2 = (double)(long)x;
16395
16396 Compensate. Floor:
16397 if (x2 > x)
16398 x2 -= 1;
16399 Compensate. Ceil:
16400 if (x2 < x)
16401 x2 += 1;
16402
16403 if (HONOR_SIGNED_ZEROS (mode))
16404 return copysign (x2, x);
16405 return x2;
16406 */
16407 machine_mode mode = GET_MODE (operand0);
16408 rtx xa, xi, TWO52, tmp, one, res, mask;
16409 rtx_code_label *label;
16410
16411 TWO52 = ix86_gen_TWO52 (mode);
16412
16413 /* Temporary for holding the result, initialized to the input
16414 operand to ease control flow. */
16415 res = copy_to_reg (operand1);
16416
16417 /* xa = abs (operand1) */
16418 xa = ix86_expand_sse_fabs (res, &mask);
16419
16420 /* if (!isless (xa, TWO52)) goto label; */
16421 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16422
16423 /* xa = (double)(long)x */
16424 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16425 expand_fix (xi, res, 0);
16426 expand_float (xa, xi, 0);
16427
16428 /* generate 1.0 */
16429 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16430
16431 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16432 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16433 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16434 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16435 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16436 if (HONOR_SIGNED_ZEROS (mode))
16437 {
16438 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16439 if (do_floor && flag_rounding_math)
16440 tmp = ix86_expand_sse_fabs (tmp, NULL);
16441
16442 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16443 }
16444 emit_move_insn (res, tmp);
16445
16446 emit_label (label);
16447 LABEL_NUSES (label) = 1;
16448
16449 emit_move_insn (operand0, res);
16450 }
16451
16452 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16453 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16454 that is only available on 64bit targets. */
16455 void
16456 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
16457 {
16458 /* C code for the stuff we expand below.
16459 double xa = fabs (x), x2;
16460 if (!isless (xa, TWO52))
16461 return x;
16462 xa = xa + TWO52 - TWO52;
16463 x2 = copysign (xa, x);
16464
16465 Compensate. Floor:
16466 if (x2 > x)
16467 x2 -= 1;
16468 Compensate. Ceil:
16469 if (x2 < x)
16470 x2 += 1;
16471
16472 if (HONOR_SIGNED_ZEROS (mode))
16473 x2 = copysign (x2, x);
16474 return x2;
16475 */
16476 machine_mode mode = GET_MODE (operand0);
16477 rtx xa, TWO52, tmp, one, res, mask;
16478 rtx_code_label *label;
16479
16480 TWO52 = ix86_gen_TWO52 (mode);
16481
16482 /* Temporary for holding the result, initialized to the input
16483 operand to ease control flow. */
16484 res = copy_to_reg (operand1);
16485
16486 /* xa = abs (operand1) */
16487 xa = ix86_expand_sse_fabs (res, &mask);
16488
16489 /* if (!isless (xa, TWO52)) goto label; */
16490 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16491
16492 /* xa = xa + TWO52 - TWO52; */
16493 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16494 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16495
16496 /* xa = copysign (xa, operand1) */
16497 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16498
16499 /* generate 1.0 */
16500 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16501
16502 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16503 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16504 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16505 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16506 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16507 if (HONOR_SIGNED_ZEROS (mode))
16508 {
16509 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16510 if (do_floor && flag_rounding_math)
16511 tmp = ix86_expand_sse_fabs (tmp, NULL);
16512
16513 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16514 }
16515 emit_move_insn (res, tmp);
16516
16517 emit_label (label);
16518 LABEL_NUSES (label) = 1;
16519
16520 emit_move_insn (operand0, res);
16521 }
16522
16523 /* Expand SSE sequence for computing trunc
16524 from OPERAND1 storing into OPERAND0. */
16525 void
16526 ix86_expand_trunc (rtx operand0, rtx operand1)
16527 {
16528 /* C code for SSE variant we expand below.
16529 double xa = fabs (x), x2;
16530 if (!isless (xa, TWO52))
16531 return x;
16532 x2 = (double)(long)x;
16533 if (HONOR_SIGNED_ZEROS (mode))
16534 return copysign (x2, x);
16535 return x2;
16536 */
16537 machine_mode mode = GET_MODE (operand0);
16538 rtx xa, xi, TWO52, res, mask;
16539 rtx_code_label *label;
16540
16541 TWO52 = ix86_gen_TWO52 (mode);
16542
16543 /* Temporary for holding the result, initialized to the input
16544 operand to ease control flow. */
16545 res = copy_to_reg (operand1);
16546
16547 /* xa = abs (operand1) */
16548 xa = ix86_expand_sse_fabs (res, &mask);
16549
16550 /* if (!isless (xa, TWO52)) goto label; */
16551 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16552
16553 /* xa = (double)(long)x */
16554 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16555 expand_fix (xi, res, 0);
16556 expand_float (xa, xi, 0);
16557
16558 if (HONOR_SIGNED_ZEROS (mode))
16559 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16560
16561 emit_move_insn (res, xa);
16562
16563 emit_label (label);
16564 LABEL_NUSES (label) = 1;
16565
16566 emit_move_insn (operand0, res);
16567 }
16568
16569 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16570 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16571 that is only available on 64bit targets. */
16572 void
16573 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16574 {
16575 machine_mode mode = GET_MODE (operand0);
16576 rtx xa, xa2, TWO52, tmp, one, res, mask;
16577 rtx_code_label *label;
16578
16579 /* C code for SSE variant we expand below.
16580 double xa = fabs (x), x2;
16581 if (!isless (xa, TWO52))
16582 return x;
16583 xa2 = xa + TWO52 - TWO52;
16584 Compensate:
16585 if (xa2 > xa)
16586 xa2 -= 1.0;
16587 x2 = copysign (xa2, x);
16588 return x2;
16589 */
16590
16591 TWO52 = ix86_gen_TWO52 (mode);
16592
16593 /* Temporary for holding the result, initialized to the input
16594 operand to ease control flow. */
16595 res =copy_to_reg (operand1);
16596
16597 /* xa = abs (operand1) */
16598 xa = ix86_expand_sse_fabs (res, &mask);
16599
16600 /* if (!isless (xa, TWO52)) goto label; */
16601 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16602
16603 /* xa2 = xa + TWO52 - TWO52; */
16604 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16605 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16606
16607 /* generate 1.0 */
16608 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16609
16610 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16611 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16612 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16613 tmp = expand_simple_binop (mode, MINUS,
16614 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16615 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16616 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16617 tmp = ix86_expand_sse_fabs (tmp, NULL);
16618
16619 /* res = copysign (xa2, operand1) */
16620 ix86_sse_copysign_to_positive (res, tmp, res, mask);
16621
16622 emit_label (label);
16623 LABEL_NUSES (label) = 1;
16624
16625 emit_move_insn (operand0, res);
16626 }
16627
16628 /* Expand SSE sequence for computing round
16629 from OPERAND1 storing into OPERAND0. */
16630 void
16631 ix86_expand_round (rtx operand0, rtx operand1)
16632 {
16633 /* C code for the stuff we're doing below:
16634 double xa = fabs (x);
16635 if (!isless (xa, TWO52))
16636 return x;
16637 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16638 return copysign (xa, x);
16639 */
16640 machine_mode mode = GET_MODE (operand0);
16641 rtx res, TWO52, xa, xi, half, mask;
16642 rtx_code_label *label;
16643 const struct real_format *fmt;
16644 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16645
16646 /* Temporary for holding the result, initialized to the input
16647 operand to ease control flow. */
16648 res = copy_to_reg (operand1);
16649
16650 TWO52 = ix86_gen_TWO52 (mode);
16651 xa = ix86_expand_sse_fabs (res, &mask);
16652 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16653
16654 /* load nextafter (0.5, 0.0) */
16655 fmt = REAL_MODE_FORMAT (mode);
16656 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16657 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16658
16659 /* xa = xa + 0.5 */
16660 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16661 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16662
16663 /* xa = (double)(int64_t)xa */
16664 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16665 expand_fix (xi, xa, 0);
16666 expand_float (xa, xi, 0);
16667
16668 /* res = copysign (xa, operand1) */
16669 ix86_sse_copysign_to_positive (res, xa, res, mask);
16670
16671 emit_label (label);
16672 LABEL_NUSES (label) = 1;
16673
16674 emit_move_insn (operand0, res);
16675 }
16676
16677 /* Expand SSE sequence for computing round from OPERAND1 storing
16678 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16679 that is only available on 64bit targets. */
16680 void
16681 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16682 {
16683 /* C code for the stuff we expand below.
16684 double xa = fabs (x), xa2, x2;
16685 if (!isless (xa, TWO52))
16686 return x;
16687 Using the absolute value and copying back sign makes
16688 -0.0 -> -0.0 correct.
16689 xa2 = xa + TWO52 - TWO52;
16690 Compensate.
16691 dxa = xa2 - xa;
16692 if (dxa <= -0.5)
16693 xa2 += 1;
16694 else if (dxa > 0.5)
16695 xa2 -= 1;
16696 x2 = copysign (xa2, x);
16697 return x2;
16698 */
16699 machine_mode mode = GET_MODE (operand0);
16700 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16701 rtx_code_label *label;
16702
16703 TWO52 = ix86_gen_TWO52 (mode);
16704
16705 /* Temporary for holding the result, initialized to the input
16706 operand to ease control flow. */
16707 res = copy_to_reg (operand1);
16708
16709 /* xa = abs (operand1) */
16710 xa = ix86_expand_sse_fabs (res, &mask);
16711
16712 /* if (!isless (xa, TWO52)) goto label; */
16713 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16714
16715 /* xa2 = xa + TWO52 - TWO52; */
16716 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16717 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16718
16719 /* dxa = xa2 - xa; */
16720 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16721
16722 /* generate 0.5, 1.0 and -0.5 */
16723 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16724 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16725 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16726 0, OPTAB_DIRECT);
16727
16728 /* Compensate. */
16729 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16730 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16731 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16732 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16733 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16734 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16735 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16736 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16737
16738 /* res = copysign (xa2, operand1) */
16739 ix86_sse_copysign_to_positive (res, xa2, res, mask);
16740
16741 emit_label (label);
16742 LABEL_NUSES (label) = 1;
16743
16744 emit_move_insn (operand0, res);
16745 }
16746
16747 /* Expand SSE sequence for computing round
16748 from OP1 storing into OP0 using sse4 round insn. */
16749 void
16750 ix86_expand_round_sse4 (rtx op0, rtx op1)
16751 {
16752 machine_mode mode = GET_MODE (op0);
16753 rtx e1, e2, res, half;
16754 const struct real_format *fmt;
16755 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16756 rtx (*gen_copysign) (rtx, rtx, rtx);
16757 rtx (*gen_round) (rtx, rtx, rtx);
16758
16759 switch (mode)
16760 {
16761 case E_SFmode:
16762 gen_copysign = gen_copysignsf3;
16763 gen_round = gen_sse4_1_roundsf2;
16764 break;
16765 case E_DFmode:
16766 gen_copysign = gen_copysigndf3;
16767 gen_round = gen_sse4_1_rounddf2;
16768 break;
16769 default:
16770 gcc_unreachable ();
16771 }
16772
16773 /* round (a) = trunc (a + copysign (0.5, a)) */
16774
16775 /* load nextafter (0.5, 0.0) */
16776 fmt = REAL_MODE_FORMAT (mode);
16777 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16778 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16779 half = const_double_from_real_value (pred_half, mode);
16780
16781 /* e1 = copysign (0.5, op1) */
16782 e1 = gen_reg_rtx (mode);
16783 emit_insn (gen_copysign (e1, half, op1));
16784
16785 /* e2 = op1 + e1 */
16786 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16787
16788 /* res = trunc (e2) */
16789 res = gen_reg_rtx (mode);
16790 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16791
16792 emit_move_insn (op0, res);
16793 }
16794
16795 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16796 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16797 insn every time. */
16798
16799 static GTY(()) rtx_insn *vselect_insn;
16800
16801 /* Initialize vselect_insn. */
16802
16803 static void
16804 init_vselect_insn (void)
16805 {
16806 unsigned i;
16807 rtx x;
16808
16809 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16810 for (i = 0; i < MAX_VECT_LEN; ++i)
16811 XVECEXP (x, 0, i) = const0_rtx;
16812 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16813 const0_rtx), x);
16814 x = gen_rtx_SET (const0_rtx, x);
16815 start_sequence ();
16816 vselect_insn = emit_insn (x);
16817 end_sequence ();
16818 }
16819
16820 /* Construct (set target (vec_select op0 (parallel perm))) and
16821 return true if that's a valid instruction in the active ISA. */
16822
16823 static bool
16824 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16825 unsigned nelt, bool testing_p)
16826 {
16827 unsigned int i;
16828 rtx x, save_vconcat;
16829 int icode;
16830
16831 if (vselect_insn == NULL_RTX)
16832 init_vselect_insn ();
16833
16834 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16835 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16836 for (i = 0; i < nelt; ++i)
16837 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16838 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16839 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16840 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16841 SET_DEST (PATTERN (vselect_insn)) = target;
16842 icode = recog_memoized (vselect_insn);
16843
16844 if (icode >= 0 && !testing_p)
16845 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16846
16847 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16848 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16849 INSN_CODE (vselect_insn) = -1;
16850
16851 return icode >= 0;
16852 }
16853
16854 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16855
16856 static bool
16857 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16858 const unsigned char *perm, unsigned nelt,
16859 bool testing_p)
16860 {
16861 machine_mode v2mode;
16862 rtx x;
16863 bool ok;
16864
16865 if (vselect_insn == NULL_RTX)
16866 init_vselect_insn ();
16867
16868 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16869 return false;
16870 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16871 PUT_MODE (x, v2mode);
16872 XEXP (x, 0) = op0;
16873 XEXP (x, 1) = op1;
16874 ok = expand_vselect (target, x, perm, nelt, testing_p);
16875 XEXP (x, 0) = const0_rtx;
16876 XEXP (x, 1) = const0_rtx;
16877 return ok;
16878 }
16879
16880 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16881 using movss or movsd. */
16882 static bool
16883 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16884 {
16885 machine_mode vmode = d->vmode;
16886 unsigned i, nelt = d->nelt;
16887 rtx x;
16888
16889 if (d->one_operand_p)
16890 return false;
16891
16892 if (!(TARGET_SSE && vmode == V4SFmode)
16893 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16894 && !(TARGET_SSE2 && vmode == V2DFmode))
16895 return false;
16896
16897 /* Only the first element is changed. */
16898 if (d->perm[0] != nelt && d->perm[0] != 0)
16899 return false;
16900 for (i = 1; i < nelt; ++i)
16901 if (d->perm[i] != i + nelt - d->perm[0])
16902 return false;
16903
16904 if (d->testing_p)
16905 return true;
16906
16907 if (d->perm[0] == nelt)
16908 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16909 else
16910 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16911
16912 emit_insn (gen_rtx_SET (d->target, x));
16913
16914 return true;
16915 }
16916
16917 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16918 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16919
16920 static bool
16921 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16922 {
16923 machine_mode mmode, vmode = d->vmode;
16924 unsigned i, nelt = d->nelt;
16925 unsigned HOST_WIDE_INT mask;
16926 rtx target, op0, op1, maskop, x;
16927 rtx rperm[32], vperm;
16928
16929 if (d->one_operand_p)
16930 return false;
16931 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16932 && (TARGET_AVX512BW
16933 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16934 ;
16935 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16936 ;
16937 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16938 ;
16939 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16940 ;
16941 else
16942 return false;
16943
16944 /* This is a blend, not a permute. Elements must stay in their
16945 respective lanes. */
16946 for (i = 0; i < nelt; ++i)
16947 {
16948 unsigned e = d->perm[i];
16949 if (!(e == i || e == i + nelt))
16950 return false;
16951 }
16952
16953 if (d->testing_p)
16954 return true;
16955
16956 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16957 decision should be extracted elsewhere, so that we only try that
16958 sequence once all budget==3 options have been tried. */
16959 target = d->target;
16960 op0 = d->op0;
16961 op1 = d->op1;
16962 mask = 0;
16963
16964 switch (vmode)
16965 {
16966 case E_V8DFmode:
16967 case E_V16SFmode:
16968 case E_V4DFmode:
16969 case E_V8SFmode:
16970 case E_V2DFmode:
16971 case E_V4SFmode:
16972 case E_V8HImode:
16973 case E_V8SImode:
16974 case E_V32HImode:
16975 case E_V64QImode:
16976 case E_V16SImode:
16977 case E_V8DImode:
16978 for (i = 0; i < nelt; ++i)
16979 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16980 break;
16981
16982 case E_V2DImode:
16983 for (i = 0; i < 2; ++i)
16984 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16985 vmode = V8HImode;
16986 goto do_subreg;
16987
16988 case E_V4SImode:
16989 for (i = 0; i < 4; ++i)
16990 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16991 vmode = V8HImode;
16992 goto do_subreg;
16993
16994 case E_V16QImode:
16995 /* See if bytes move in pairs so we can use pblendw with
16996 an immediate argument, rather than pblendvb with a vector
16997 argument. */
16998 for (i = 0; i < 16; i += 2)
16999 if (d->perm[i] + 1 != d->perm[i + 1])
17000 {
17001 use_pblendvb:
17002 for (i = 0; i < nelt; ++i)
17003 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
17004
17005 finish_pblendvb:
17006 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17007 vperm = force_reg (vmode, vperm);
17008
17009 if (GET_MODE_SIZE (vmode) == 16)
17010 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
17011 else
17012 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
17013 if (target != d->target)
17014 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17015 return true;
17016 }
17017
17018 for (i = 0; i < 8; ++i)
17019 mask |= (d->perm[i * 2] >= 16) << i;
17020 vmode = V8HImode;
17021 /* FALLTHRU */
17022
17023 do_subreg:
17024 target = gen_reg_rtx (vmode);
17025 op0 = gen_lowpart (vmode, op0);
17026 op1 = gen_lowpart (vmode, op1);
17027 break;
17028
17029 case E_V32QImode:
17030 /* See if bytes move in pairs. If not, vpblendvb must be used. */
17031 for (i = 0; i < 32; i += 2)
17032 if (d->perm[i] + 1 != d->perm[i + 1])
17033 goto use_pblendvb;
17034 /* See if bytes move in quadruplets. If yes, vpblendd
17035 with immediate can be used. */
17036 for (i = 0; i < 32; i += 4)
17037 if (d->perm[i] + 2 != d->perm[i + 2])
17038 break;
17039 if (i < 32)
17040 {
17041 /* See if bytes move the same in both lanes. If yes,
17042 vpblendw with immediate can be used. */
17043 for (i = 0; i < 16; i += 2)
17044 if (d->perm[i] + 16 != d->perm[i + 16])
17045 goto use_pblendvb;
17046
17047 /* Use vpblendw. */
17048 for (i = 0; i < 16; ++i)
17049 mask |= (d->perm[i * 2] >= 32) << i;
17050 vmode = V16HImode;
17051 goto do_subreg;
17052 }
17053
17054 /* Use vpblendd. */
17055 for (i = 0; i < 8; ++i)
17056 mask |= (d->perm[i * 4] >= 32) << i;
17057 vmode = V8SImode;
17058 goto do_subreg;
17059
17060 case E_V16HImode:
17061 /* See if words move in pairs. If yes, vpblendd can be used. */
17062 for (i = 0; i < 16; i += 2)
17063 if (d->perm[i] + 1 != d->perm[i + 1])
17064 break;
17065 if (i < 16)
17066 {
17067 /* See if words move the same in both lanes. If not,
17068 vpblendvb must be used. */
17069 for (i = 0; i < 8; i++)
17070 if (d->perm[i] + 8 != d->perm[i + 8])
17071 {
17072 /* Use vpblendvb. */
17073 for (i = 0; i < 32; ++i)
17074 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17075
17076 vmode = V32QImode;
17077 nelt = 32;
17078 target = gen_reg_rtx (vmode);
17079 op0 = gen_lowpart (vmode, op0);
17080 op1 = gen_lowpart (vmode, op1);
17081 goto finish_pblendvb;
17082 }
17083
17084 /* Use vpblendw. */
17085 for (i = 0; i < 16; ++i)
17086 mask |= (d->perm[i] >= 16) << i;
17087 break;
17088 }
17089
17090 /* Use vpblendd. */
17091 for (i = 0; i < 8; ++i)
17092 mask |= (d->perm[i * 2] >= 16) << i;
17093 vmode = V8SImode;
17094 goto do_subreg;
17095
17096 case E_V4DImode:
17097 /* Use vpblendd. */
17098 for (i = 0; i < 4; ++i)
17099 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17100 vmode = V8SImode;
17101 goto do_subreg;
17102
17103 default:
17104 gcc_unreachable ();
17105 }
17106
17107 switch (vmode)
17108 {
17109 case E_V8DFmode:
17110 case E_V8DImode:
17111 mmode = QImode;
17112 break;
17113 case E_V16SFmode:
17114 case E_V16SImode:
17115 mmode = HImode;
17116 break;
17117 case E_V32HImode:
17118 mmode = SImode;
17119 break;
17120 case E_V64QImode:
17121 mmode = DImode;
17122 break;
17123 default:
17124 mmode = VOIDmode;
17125 }
17126
17127 if (mmode != VOIDmode)
17128 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17129 else
17130 maskop = GEN_INT (mask);
17131
17132 /* This matches five different patterns with the different modes. */
17133 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17134 x = gen_rtx_SET (target, x);
17135 emit_insn (x);
17136 if (target != d->target)
17137 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17138
17139 return true;
17140 }
17141
17142 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17143 in terms of the variable form of vpermilps.
17144
17145 Note that we will have already failed the immediate input vpermilps,
17146 which requires that the high and low part shuffle be identical; the
17147 variable form doesn't require that. */
17148
17149 static bool
17150 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17151 {
17152 rtx rperm[8], vperm;
17153 unsigned i;
17154
17155 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17156 return false;
17157
17158 /* We can only permute within the 128-bit lane. */
17159 for (i = 0; i < 8; ++i)
17160 {
17161 unsigned e = d->perm[i];
17162 if (i < 4 ? e >= 4 : e < 4)
17163 return false;
17164 }
17165
17166 if (d->testing_p)
17167 return true;
17168
17169 for (i = 0; i < 8; ++i)
17170 {
17171 unsigned e = d->perm[i];
17172
17173 /* Within each 128-bit lane, the elements of op0 are numbered
17174 from 0 and the elements of op1 are numbered from 4. */
17175 if (e >= 8 + 4)
17176 e -= 8;
17177 else if (e >= 4)
17178 e -= 4;
17179
17180 rperm[i] = GEN_INT (e);
17181 }
17182
17183 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17184 vperm = force_reg (V8SImode, vperm);
17185 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17186
17187 return true;
17188 }
17189
17190 /* Return true if permutation D can be performed as VMODE permutation
17191 instead. */
17192
17193 static bool
17194 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17195 {
17196 unsigned int i, j, chunk;
17197
17198 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17199 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17200 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17201 return false;
17202
17203 if (GET_MODE_NUNITS (vmode) >= d->nelt)
17204 return true;
17205
17206 chunk = d->nelt / GET_MODE_NUNITS (vmode);
17207 for (i = 0; i < d->nelt; i += chunk)
17208 if (d->perm[i] & (chunk - 1))
17209 return false;
17210 else
17211 for (j = 1; j < chunk; ++j)
17212 if (d->perm[i] + j != d->perm[i + j])
17213 return false;
17214
17215 return true;
17216 }
17217
17218 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17219 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17220
17221 static bool
17222 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17223 {
17224 unsigned i, nelt, eltsz, mask;
17225 unsigned char perm[64];
17226 machine_mode vmode = V16QImode;
17227 rtx rperm[64], vperm, target, op0, op1;
17228
17229 nelt = d->nelt;
17230
17231 if (!d->one_operand_p)
17232 {
17233 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
17234 {
17235 if (TARGET_AVX2
17236 && valid_perm_using_mode_p (V2TImode, d))
17237 {
17238 if (d->testing_p)
17239 return true;
17240
17241 /* Use vperm2i128 insn. The pattern uses
17242 V4DImode instead of V2TImode. */
17243 target = d->target;
17244 if (d->vmode != V4DImode)
17245 target = gen_reg_rtx (V4DImode);
17246 op0 = gen_lowpart (V4DImode, d->op0);
17247 op1 = gen_lowpart (V4DImode, d->op1);
17248 rperm[0]
17249 = GEN_INT ((d->perm[0] / (nelt / 2))
17250 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17251 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17252 if (target != d->target)
17253 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17254 return true;
17255 }
17256 return false;
17257 }
17258 }
17259 else
17260 {
17261 if (GET_MODE_SIZE (d->vmode) == 16)
17262 {
17263 if (!TARGET_SSSE3)
17264 return false;
17265 }
17266 else if (GET_MODE_SIZE (d->vmode) == 32)
17267 {
17268 if (!TARGET_AVX2)
17269 return false;
17270
17271 /* V4DImode should be already handled through
17272 expand_vselect by vpermq instruction. */
17273 gcc_assert (d->vmode != V4DImode);
17274
17275 vmode = V32QImode;
17276 if (d->vmode == V8SImode
17277 || d->vmode == V16HImode
17278 || d->vmode == V32QImode)
17279 {
17280 /* First see if vpermq can be used for
17281 V8SImode/V16HImode/V32QImode. */
17282 if (valid_perm_using_mode_p (V4DImode, d))
17283 {
17284 for (i = 0; i < 4; i++)
17285 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
17286 if (d->testing_p)
17287 return true;
17288 target = gen_reg_rtx (V4DImode);
17289 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
17290 perm, 4, false))
17291 {
17292 emit_move_insn (d->target,
17293 gen_lowpart (d->vmode, target));
17294 return true;
17295 }
17296 return false;
17297 }
17298
17299 /* Next see if vpermd can be used. */
17300 if (valid_perm_using_mode_p (V8SImode, d))
17301 vmode = V8SImode;
17302 }
17303 /* Or if vpermps can be used. */
17304 else if (d->vmode == V8SFmode)
17305 vmode = V8SImode;
17306
17307 if (vmode == V32QImode)
17308 {
17309 /* vpshufb only works intra lanes, it is not
17310 possible to shuffle bytes in between the lanes. */
17311 for (i = 0; i < nelt; ++i)
17312 if ((d->perm[i] ^ i) & (nelt / 2))
17313 return false;
17314 }
17315 }
17316 else if (GET_MODE_SIZE (d->vmode) == 64)
17317 {
17318 if (!TARGET_AVX512BW)
17319 return false;
17320
17321 /* If vpermq didn't work, vpshufb won't work either. */
17322 if (d->vmode == V8DFmode || d->vmode == V8DImode)
17323 return false;
17324
17325 vmode = V64QImode;
17326 if (d->vmode == V16SImode
17327 || d->vmode == V32HImode
17328 || d->vmode == V64QImode)
17329 {
17330 /* First see if vpermq can be used for
17331 V16SImode/V32HImode/V64QImode. */
17332 if (valid_perm_using_mode_p (V8DImode, d))
17333 {
17334 for (i = 0; i < 8; i++)
17335 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
17336 if (d->testing_p)
17337 return true;
17338 target = gen_reg_rtx (V8DImode);
17339 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
17340 perm, 8, false))
17341 {
17342 emit_move_insn (d->target,
17343 gen_lowpart (d->vmode, target));
17344 return true;
17345 }
17346 return false;
17347 }
17348
17349 /* Next see if vpermd can be used. */
17350 if (valid_perm_using_mode_p (V16SImode, d))
17351 vmode = V16SImode;
17352 }
17353 /* Or if vpermps can be used. */
17354 else if (d->vmode == V16SFmode)
17355 vmode = V16SImode;
17356 if (vmode == V64QImode)
17357 {
17358 /* vpshufb only works intra lanes, it is not
17359 possible to shuffle bytes in between the lanes. */
17360 for (i = 0; i < nelt; ++i)
17361 if ((d->perm[i] ^ i) & (3 * nelt / 4))
17362 return false;
17363 }
17364 }
17365 else
17366 return false;
17367 }
17368
17369 if (d->testing_p)
17370 return true;
17371
17372 if (vmode == V8SImode)
17373 for (i = 0; i < 8; ++i)
17374 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
17375 else if (vmode == V16SImode)
17376 for (i = 0; i < 16; ++i)
17377 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
17378 else
17379 {
17380 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17381 if (!d->one_operand_p)
17382 mask = 2 * nelt - 1;
17383 else if (vmode == V16QImode)
17384 mask = nelt - 1;
17385 else if (vmode == V64QImode)
17386 mask = nelt / 4 - 1;
17387 else
17388 mask = nelt / 2 - 1;
17389
17390 for (i = 0; i < nelt; ++i)
17391 {
17392 unsigned j, e = d->perm[i] & mask;
17393 for (j = 0; j < eltsz; ++j)
17394 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
17395 }
17396 }
17397
17398 vperm = gen_rtx_CONST_VECTOR (vmode,
17399 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
17400 vperm = force_reg (vmode, vperm);
17401
17402 target = d->target;
17403 if (d->vmode != vmode)
17404 target = gen_reg_rtx (vmode);
17405 op0 = gen_lowpart (vmode, d->op0);
17406 if (d->one_operand_p)
17407 {
17408 if (vmode == V16QImode)
17409 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
17410 else if (vmode == V32QImode)
17411 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
17412 else if (vmode == V64QImode)
17413 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
17414 else if (vmode == V8SFmode)
17415 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
17416 else if (vmode == V8SImode)
17417 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
17418 else if (vmode == V16SFmode)
17419 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
17420 else if (vmode == V16SImode)
17421 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
17422 else
17423 gcc_unreachable ();
17424 }
17425 else
17426 {
17427 op1 = gen_lowpart (vmode, d->op1);
17428 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
17429 }
17430 if (target != d->target)
17431 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17432
17433 return true;
17434 }
17435
17436 /* For V*[QHS]Imode permutations, check if the same permutation
17437 can't be performed in a 2x, 4x or 8x wider inner mode. */
17438
17439 static bool
17440 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17441 struct expand_vec_perm_d *nd)
17442 {
17443 int i;
17444 machine_mode mode = VOIDmode;
17445
17446 switch (d->vmode)
17447 {
17448 case E_V16QImode: mode = V8HImode; break;
17449 case E_V32QImode: mode = V16HImode; break;
17450 case E_V64QImode: mode = V32HImode; break;
17451 case E_V8HImode: mode = V4SImode; break;
17452 case E_V16HImode: mode = V8SImode; break;
17453 case E_V32HImode: mode = V16SImode; break;
17454 case E_V4SImode: mode = V2DImode; break;
17455 case E_V8SImode: mode = V4DImode; break;
17456 case E_V16SImode: mode = V8DImode; break;
17457 default: return false;
17458 }
17459 for (i = 0; i < d->nelt; i += 2)
17460 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17461 return false;
17462 nd->vmode = mode;
17463 nd->nelt = d->nelt / 2;
17464 for (i = 0; i < nd->nelt; i++)
17465 nd->perm[i] = d->perm[2 * i] / 2;
17466 if (GET_MODE_INNER (mode) != DImode)
17467 canonicalize_vector_int_perm (nd, nd);
17468 if (nd != d)
17469 {
17470 nd->one_operand_p = d->one_operand_p;
17471 nd->testing_p = d->testing_p;
17472 if (d->op0 == d->op1)
17473 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17474 else
17475 {
17476 nd->op0 = gen_lowpart (nd->vmode, d->op0);
17477 nd->op1 = gen_lowpart (nd->vmode, d->op1);
17478 }
17479 if (d->testing_p)
17480 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17481 else
17482 nd->target = gen_reg_rtx (nd->vmode);
17483 }
17484 return true;
17485 }
17486
17487 /* Try to expand one-operand permutation with constant mask. */
17488
17489 static bool
17490 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17491 {
17492 machine_mode mode = GET_MODE (d->op0);
17493 machine_mode maskmode = mode;
17494 rtx (*gen) (rtx, rtx, rtx) = NULL;
17495 rtx target, op0, mask;
17496 rtx vec[64];
17497
17498 if (!rtx_equal_p (d->op0, d->op1))
17499 return false;
17500
17501 if (!TARGET_AVX512F)
17502 return false;
17503
17504 switch (mode)
17505 {
17506 case E_V16SImode:
17507 gen = gen_avx512f_permvarv16si;
17508 break;
17509 case E_V16SFmode:
17510 gen = gen_avx512f_permvarv16sf;
17511 maskmode = V16SImode;
17512 break;
17513 case E_V8DImode:
17514 gen = gen_avx512f_permvarv8di;
17515 break;
17516 case E_V8DFmode:
17517 gen = gen_avx512f_permvarv8df;
17518 maskmode = V8DImode;
17519 break;
17520 default:
17521 return false;
17522 }
17523
17524 target = d->target;
17525 op0 = d->op0;
17526 for (int i = 0; i < d->nelt; ++i)
17527 vec[i] = GEN_INT (d->perm[i]);
17528 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17529 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17530 return true;
17531 }
17532
17533 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17534
17535 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17536 in a single instruction. */
17537
17538 static bool
17539 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17540 {
17541 unsigned i, nelt = d->nelt;
17542 struct expand_vec_perm_d nd;
17543
17544 /* Check plain VEC_SELECT first, because AVX has instructions that could
17545 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17546 input where SEL+CONCAT may not. */
17547 if (d->one_operand_p)
17548 {
17549 int mask = nelt - 1;
17550 bool identity_perm = true;
17551 bool broadcast_perm = true;
17552
17553 for (i = 0; i < nelt; i++)
17554 {
17555 nd.perm[i] = d->perm[i] & mask;
17556 if (nd.perm[i] != i)
17557 identity_perm = false;
17558 if (nd.perm[i])
17559 broadcast_perm = false;
17560 }
17561
17562 if (identity_perm)
17563 {
17564 if (!d->testing_p)
17565 emit_move_insn (d->target, d->op0);
17566 return true;
17567 }
17568 else if (broadcast_perm && TARGET_AVX2)
17569 {
17570 /* Use vpbroadcast{b,w,d}. */
17571 rtx (*gen) (rtx, rtx) = NULL;
17572 switch (d->vmode)
17573 {
17574 case E_V64QImode:
17575 if (TARGET_AVX512BW)
17576 gen = gen_avx512bw_vec_dupv64qi_1;
17577 break;
17578 case E_V32QImode:
17579 gen = gen_avx2_pbroadcastv32qi_1;
17580 break;
17581 case E_V32HImode:
17582 if (TARGET_AVX512BW)
17583 gen = gen_avx512bw_vec_dupv32hi_1;
17584 break;
17585 case E_V16HImode:
17586 gen = gen_avx2_pbroadcastv16hi_1;
17587 break;
17588 case E_V16SImode:
17589 if (TARGET_AVX512F)
17590 gen = gen_avx512f_vec_dupv16si_1;
17591 break;
17592 case E_V8SImode:
17593 gen = gen_avx2_pbroadcastv8si_1;
17594 break;
17595 case E_V16QImode:
17596 gen = gen_avx2_pbroadcastv16qi;
17597 break;
17598 case E_V8HImode:
17599 gen = gen_avx2_pbroadcastv8hi;
17600 break;
17601 case E_V16SFmode:
17602 if (TARGET_AVX512F)
17603 gen = gen_avx512f_vec_dupv16sf_1;
17604 break;
17605 case E_V8SFmode:
17606 gen = gen_avx2_vec_dupv8sf_1;
17607 break;
17608 case E_V8DFmode:
17609 if (TARGET_AVX512F)
17610 gen = gen_avx512f_vec_dupv8df_1;
17611 break;
17612 case E_V8DImode:
17613 if (TARGET_AVX512F)
17614 gen = gen_avx512f_vec_dupv8di_1;
17615 break;
17616 /* For other modes prefer other shuffles this function creates. */
17617 default: break;
17618 }
17619 if (gen != NULL)
17620 {
17621 if (!d->testing_p)
17622 emit_insn (gen (d->target, d->op0));
17623 return true;
17624 }
17625 }
17626
17627 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17628 return true;
17629
17630 /* There are plenty of patterns in sse.md that are written for
17631 SEL+CONCAT and are not replicated for a single op. Perhaps
17632 that should be changed, to avoid the nastiness here. */
17633
17634 /* Recognize interleave style patterns, which means incrementing
17635 every other permutation operand. */
17636 for (i = 0; i < nelt; i += 2)
17637 {
17638 nd.perm[i] = d->perm[i] & mask;
17639 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17640 }
17641 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17642 d->testing_p))
17643 return true;
17644
17645 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17646 if (nelt >= 4)
17647 {
17648 for (i = 0; i < nelt; i += 4)
17649 {
17650 nd.perm[i + 0] = d->perm[i + 0] & mask;
17651 nd.perm[i + 1] = d->perm[i + 1] & mask;
17652 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17653 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17654 }
17655
17656 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17657 d->testing_p))
17658 return true;
17659 }
17660 }
17661
17662 /* Try movss/movsd instructions. */
17663 if (expand_vec_perm_movs (d))
17664 return true;
17665
17666 /* Finally, try the fully general two operand permute. */
17667 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17668 d->testing_p))
17669 return true;
17670
17671 /* Recognize interleave style patterns with reversed operands. */
17672 if (!d->one_operand_p)
17673 {
17674 for (i = 0; i < nelt; ++i)
17675 {
17676 unsigned e = d->perm[i];
17677 if (e >= nelt)
17678 e -= nelt;
17679 else
17680 e += nelt;
17681 nd.perm[i] = e;
17682 }
17683
17684 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17685 d->testing_p))
17686 return true;
17687 }
17688
17689 /* Try the SSE4.1 blend variable merge instructions. */
17690 if (expand_vec_perm_blend (d))
17691 return true;
17692
17693 /* Try one of the AVX vpermil variable permutations. */
17694 if (expand_vec_perm_vpermil (d))
17695 return true;
17696
17697 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17698 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17699 if (expand_vec_perm_pshufb (d))
17700 return true;
17701
17702 /* Try the AVX2 vpalignr instruction. */
17703 if (expand_vec_perm_palignr (d, true))
17704 return true;
17705
17706 /* Try the AVX512F vperm{s,d} instructions. */
17707 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17708 return true;
17709
17710 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17711 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17712 return true;
17713
17714 /* See if we can get the same permutation in different vector integer
17715 mode. */
17716 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17717 {
17718 if (!d->testing_p)
17719 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17720 return true;
17721 }
17722 return false;
17723 }
17724
17725 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17726 in terms of a pair of pshuflw + pshufhw instructions. */
17727
17728 static bool
17729 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17730 {
17731 unsigned char perm2[MAX_VECT_LEN];
17732 unsigned i;
17733 bool ok;
17734
17735 if (d->vmode != V8HImode || !d->one_operand_p)
17736 return false;
17737
17738 /* The two permutations only operate in 64-bit lanes. */
17739 for (i = 0; i < 4; ++i)
17740 if (d->perm[i] >= 4)
17741 return false;
17742 for (i = 4; i < 8; ++i)
17743 if (d->perm[i] < 4)
17744 return false;
17745
17746 if (d->testing_p)
17747 return true;
17748
17749 /* Emit the pshuflw. */
17750 memcpy (perm2, d->perm, 4);
17751 for (i = 4; i < 8; ++i)
17752 perm2[i] = i;
17753 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17754 gcc_assert (ok);
17755
17756 /* Emit the pshufhw. */
17757 memcpy (perm2 + 4, d->perm + 4, 4);
17758 for (i = 0; i < 4; ++i)
17759 perm2[i] = i;
17760 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17761 gcc_assert (ok);
17762
17763 return true;
17764 }
17765
17766 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17767 the permutation using the SSSE3 palignr instruction. This succeeds
17768 when all of the elements in PERM fit within one vector and we merely
17769 need to shift them down so that a single vector permutation has a
17770 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17771 the vpalignr instruction itself can perform the requested permutation. */
17772
17773 static bool
17774 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17775 {
17776 unsigned i, nelt = d->nelt;
17777 unsigned min, max, minswap, maxswap;
17778 bool in_order, ok, swap = false;
17779 rtx shift, target;
17780 struct expand_vec_perm_d dcopy;
17781
17782 /* Even with AVX, palignr only operates on 128-bit vectors,
17783 in AVX2 palignr operates on both 128-bit lanes. */
17784 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17785 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17786 return false;
17787
17788 min = 2 * nelt;
17789 max = 0;
17790 minswap = 2 * nelt;
17791 maxswap = 0;
17792 for (i = 0; i < nelt; ++i)
17793 {
17794 unsigned e = d->perm[i];
17795 unsigned eswap = d->perm[i] ^ nelt;
17796 if (GET_MODE_SIZE (d->vmode) == 32)
17797 {
17798 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17799 eswap = e ^ (nelt / 2);
17800 }
17801 if (e < min)
17802 min = e;
17803 if (e > max)
17804 max = e;
17805 if (eswap < minswap)
17806 minswap = eswap;
17807 if (eswap > maxswap)
17808 maxswap = eswap;
17809 }
17810 if (min == 0
17811 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17812 {
17813 if (d->one_operand_p
17814 || minswap == 0
17815 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17816 ? nelt / 2 : nelt))
17817 return false;
17818 swap = true;
17819 min = minswap;
17820 max = maxswap;
17821 }
17822
17823 /* Given that we have SSSE3, we know we'll be able to implement the
17824 single operand permutation after the palignr with pshufb for
17825 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17826 first. */
17827 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17828 return true;
17829
17830 dcopy = *d;
17831 if (swap)
17832 {
17833 dcopy.op0 = d->op1;
17834 dcopy.op1 = d->op0;
17835 for (i = 0; i < nelt; ++i)
17836 dcopy.perm[i] ^= nelt;
17837 }
17838
17839 in_order = true;
17840 for (i = 0; i < nelt; ++i)
17841 {
17842 unsigned e = dcopy.perm[i];
17843 if (GET_MODE_SIZE (d->vmode) == 32
17844 && e >= nelt
17845 && (e & (nelt / 2 - 1)) < min)
17846 e = e - min - (nelt / 2);
17847 else
17848 e = e - min;
17849 if (e != i)
17850 in_order = false;
17851 dcopy.perm[i] = e;
17852 }
17853 dcopy.one_operand_p = true;
17854
17855 if (single_insn_only_p && !in_order)
17856 return false;
17857
17858 /* For AVX2, test whether we can permute the result in one instruction. */
17859 if (d->testing_p)
17860 {
17861 if (in_order)
17862 return true;
17863 dcopy.op1 = dcopy.op0;
17864 return expand_vec_perm_1 (&dcopy);
17865 }
17866
17867 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17868 if (GET_MODE_SIZE (d->vmode) == 16)
17869 {
17870 target = gen_reg_rtx (TImode);
17871 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17872 gen_lowpart (TImode, dcopy.op0), shift));
17873 }
17874 else
17875 {
17876 target = gen_reg_rtx (V2TImode);
17877 emit_insn (gen_avx2_palignrv2ti (target,
17878 gen_lowpart (V2TImode, dcopy.op1),
17879 gen_lowpart (V2TImode, dcopy.op0),
17880 shift));
17881 }
17882
17883 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17884
17885 /* Test for the degenerate case where the alignment by itself
17886 produces the desired permutation. */
17887 if (in_order)
17888 {
17889 emit_move_insn (d->target, dcopy.op0);
17890 return true;
17891 }
17892
17893 ok = expand_vec_perm_1 (&dcopy);
17894 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17895
17896 return ok;
17897 }
17898
17899 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17900 the permutation using the SSE4_1 pblendv instruction. Potentially
17901 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17902
17903 static bool
17904 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17905 {
17906 unsigned i, which, nelt = d->nelt;
17907 struct expand_vec_perm_d dcopy, dcopy1;
17908 machine_mode vmode = d->vmode;
17909 bool ok;
17910
17911 /* Use the same checks as in expand_vec_perm_blend. */
17912 if (d->one_operand_p)
17913 return false;
17914 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17915 ;
17916 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17917 ;
17918 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17919 ;
17920 else
17921 return false;
17922
17923 /* Figure out where permutation elements stay not in their
17924 respective lanes. */
17925 for (i = 0, which = 0; i < nelt; ++i)
17926 {
17927 unsigned e = d->perm[i];
17928 if (e != i)
17929 which |= (e < nelt ? 1 : 2);
17930 }
17931 /* We can pblend the part where elements stay not in their
17932 respective lanes only when these elements are all in one
17933 half of a permutation.
17934 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17935 lanes, but both 8 and 9 >= 8
17936 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17937 respective lanes and 8 >= 8, but 2 not. */
17938 if (which != 1 && which != 2)
17939 return false;
17940 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17941 return true;
17942
17943 /* First we apply one operand permutation to the part where
17944 elements stay not in their respective lanes. */
17945 dcopy = *d;
17946 if (which == 2)
17947 dcopy.op0 = dcopy.op1 = d->op1;
17948 else
17949 dcopy.op0 = dcopy.op1 = d->op0;
17950 if (!d->testing_p)
17951 dcopy.target = gen_reg_rtx (vmode);
17952 dcopy.one_operand_p = true;
17953
17954 for (i = 0; i < nelt; ++i)
17955 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17956
17957 ok = expand_vec_perm_1 (&dcopy);
17958 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17959 return false;
17960 else
17961 gcc_assert (ok);
17962 if (d->testing_p)
17963 return true;
17964
17965 /* Next we put permuted elements into their positions. */
17966 dcopy1 = *d;
17967 if (which == 2)
17968 dcopy1.op1 = dcopy.target;
17969 else
17970 dcopy1.op0 = dcopy.target;
17971
17972 for (i = 0; i < nelt; ++i)
17973 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17974
17975 ok = expand_vec_perm_blend (&dcopy1);
17976 gcc_assert (ok);
17977
17978 return true;
17979 }
17980
17981 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17982
17983 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17984 a two vector permutation into a single vector permutation by using
17985 an interleave operation to merge the vectors. */
17986
17987 static bool
17988 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17989 {
17990 struct expand_vec_perm_d dremap, dfinal;
17991 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17992 unsigned HOST_WIDE_INT contents;
17993 unsigned char remap[2 * MAX_VECT_LEN];
17994 rtx_insn *seq;
17995 bool ok, same_halves = false;
17996
17997 if (GET_MODE_SIZE (d->vmode) == 16)
17998 {
17999 if (d->one_operand_p)
18000 return false;
18001 }
18002 else if (GET_MODE_SIZE (d->vmode) == 32)
18003 {
18004 if (!TARGET_AVX)
18005 return false;
18006 /* For 32-byte modes allow even d->one_operand_p.
18007 The lack of cross-lane shuffling in some instructions
18008 might prevent a single insn shuffle. */
18009 dfinal = *d;
18010 dfinal.testing_p = true;
18011 /* If expand_vec_perm_interleave3 can expand this into
18012 a 3 insn sequence, give up and let it be expanded as
18013 3 insn sequence. While that is one insn longer,
18014 it doesn't need a memory operand and in the common
18015 case that both interleave low and high permutations
18016 with the same operands are adjacent needs 4 insns
18017 for both after CSE. */
18018 if (expand_vec_perm_interleave3 (&dfinal))
18019 return false;
18020 }
18021 else
18022 return false;
18023
18024 /* Examine from whence the elements come. */
18025 contents = 0;
18026 for (i = 0; i < nelt; ++i)
18027 contents |= HOST_WIDE_INT_1U << d->perm[i];
18028
18029 memset (remap, 0xff, sizeof (remap));
18030 dremap = *d;
18031
18032 if (GET_MODE_SIZE (d->vmode) == 16)
18033 {
18034 unsigned HOST_WIDE_INT h1, h2, h3, h4;
18035
18036 /* Split the two input vectors into 4 halves. */
18037 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
18038 h2 = h1 << nelt2;
18039 h3 = h2 << nelt2;
18040 h4 = h3 << nelt2;
18041
18042 /* If the elements from the low halves use interleave low, and similarly
18043 for interleave high. If the elements are from mis-matched halves, we
18044 can use shufps for V4SF/V4SI or do a DImode shuffle. */
18045 if ((contents & (h1 | h3)) == contents)
18046 {
18047 /* punpckl* */
18048 for (i = 0; i < nelt2; ++i)
18049 {
18050 remap[i] = i * 2;
18051 remap[i + nelt] = i * 2 + 1;
18052 dremap.perm[i * 2] = i;
18053 dremap.perm[i * 2 + 1] = i + nelt;
18054 }
18055 if (!TARGET_SSE2 && d->vmode == V4SImode)
18056 dremap.vmode = V4SFmode;
18057 }
18058 else if ((contents & (h2 | h4)) == contents)
18059 {
18060 /* punpckh* */
18061 for (i = 0; i < nelt2; ++i)
18062 {
18063 remap[i + nelt2] = i * 2;
18064 remap[i + nelt + nelt2] = i * 2 + 1;
18065 dremap.perm[i * 2] = i + nelt2;
18066 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18067 }
18068 if (!TARGET_SSE2 && d->vmode == V4SImode)
18069 dremap.vmode = V4SFmode;
18070 }
18071 else if ((contents & (h1 | h4)) == contents)
18072 {
18073 /* shufps */
18074 for (i = 0; i < nelt2; ++i)
18075 {
18076 remap[i] = i;
18077 remap[i + nelt + nelt2] = i + nelt2;
18078 dremap.perm[i] = i;
18079 dremap.perm[i + nelt2] = i + nelt + nelt2;
18080 }
18081 if (nelt != 4)
18082 {
18083 /* shufpd */
18084 dremap.vmode = V2DImode;
18085 dremap.nelt = 2;
18086 dremap.perm[0] = 0;
18087 dremap.perm[1] = 3;
18088 }
18089 }
18090 else if ((contents & (h2 | h3)) == contents)
18091 {
18092 /* shufps */
18093 for (i = 0; i < nelt2; ++i)
18094 {
18095 remap[i + nelt2] = i;
18096 remap[i + nelt] = i + nelt2;
18097 dremap.perm[i] = i + nelt2;
18098 dremap.perm[i + nelt2] = i + nelt;
18099 }
18100 if (nelt != 4)
18101 {
18102 /* shufpd */
18103 dremap.vmode = V2DImode;
18104 dremap.nelt = 2;
18105 dremap.perm[0] = 1;
18106 dremap.perm[1] = 2;
18107 }
18108 }
18109 else
18110 return false;
18111 }
18112 else
18113 {
18114 unsigned int nelt4 = nelt / 4, nzcnt = 0;
18115 unsigned HOST_WIDE_INT q[8];
18116 unsigned int nonzero_halves[4];
18117
18118 /* Split the two input vectors into 8 quarters. */
18119 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18120 for (i = 1; i < 8; ++i)
18121 q[i] = q[0] << (nelt4 * i);
18122 for (i = 0; i < 4; ++i)
18123 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18124 {
18125 nonzero_halves[nzcnt] = i;
18126 ++nzcnt;
18127 }
18128
18129 if (nzcnt == 1)
18130 {
18131 gcc_assert (d->one_operand_p);
18132 nonzero_halves[1] = nonzero_halves[0];
18133 same_halves = true;
18134 }
18135 else if (d->one_operand_p)
18136 {
18137 gcc_assert (nonzero_halves[0] == 0);
18138 gcc_assert (nonzero_halves[1] == 1);
18139 }
18140
18141 if (nzcnt <= 2)
18142 {
18143 if (d->perm[0] / nelt2 == nonzero_halves[1])
18144 {
18145 /* Attempt to increase the likelihood that dfinal
18146 shuffle will be intra-lane. */
18147 std::swap (nonzero_halves[0], nonzero_halves[1]);
18148 }
18149
18150 /* vperm2f128 or vperm2i128. */
18151 for (i = 0; i < nelt2; ++i)
18152 {
18153 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
18154 remap[i + nonzero_halves[0] * nelt2] = i;
18155 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
18156 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
18157 }
18158
18159 if (d->vmode != V8SFmode
18160 && d->vmode != V4DFmode
18161 && d->vmode != V8SImode)
18162 {
18163 dremap.vmode = V8SImode;
18164 dremap.nelt = 8;
18165 for (i = 0; i < 4; ++i)
18166 {
18167 dremap.perm[i] = i + nonzero_halves[0] * 4;
18168 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
18169 }
18170 }
18171 }
18172 else if (d->one_operand_p)
18173 return false;
18174 else if (TARGET_AVX2
18175 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
18176 {
18177 /* vpunpckl* */
18178 for (i = 0; i < nelt4; ++i)
18179 {
18180 remap[i] = i * 2;
18181 remap[i + nelt] = i * 2 + 1;
18182 remap[i + nelt2] = i * 2 + nelt2;
18183 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
18184 dremap.perm[i * 2] = i;
18185 dremap.perm[i * 2 + 1] = i + nelt;
18186 dremap.perm[i * 2 + nelt2] = i + nelt2;
18187 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
18188 }
18189 }
18190 else if (TARGET_AVX2
18191 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
18192 {
18193 /* vpunpckh* */
18194 for (i = 0; i < nelt4; ++i)
18195 {
18196 remap[i + nelt4] = i * 2;
18197 remap[i + nelt + nelt4] = i * 2 + 1;
18198 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
18199 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
18200 dremap.perm[i * 2] = i + nelt4;
18201 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
18202 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
18203 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
18204 }
18205 }
18206 else
18207 return false;
18208 }
18209
18210 /* Use the remapping array set up above to move the elements from their
18211 swizzled locations into their final destinations. */
18212 dfinal = *d;
18213 for (i = 0; i < nelt; ++i)
18214 {
18215 unsigned e = remap[d->perm[i]];
18216 gcc_assert (e < nelt);
18217 /* If same_halves is true, both halves of the remapped vector are the
18218 same. Avoid cross-lane accesses if possible. */
18219 if (same_halves && i >= nelt2)
18220 {
18221 gcc_assert (e < nelt2);
18222 dfinal.perm[i] = e + nelt2;
18223 }
18224 else
18225 dfinal.perm[i] = e;
18226 }
18227 if (!d->testing_p)
18228 {
18229 dremap.target = gen_reg_rtx (dremap.vmode);
18230 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18231 }
18232 dfinal.op1 = dfinal.op0;
18233 dfinal.one_operand_p = true;
18234
18235 /* Test if the final remap can be done with a single insn. For V4SFmode or
18236 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18237 start_sequence ();
18238 ok = expand_vec_perm_1 (&dfinal);
18239 seq = get_insns ();
18240 end_sequence ();
18241
18242 if (!ok)
18243 return false;
18244
18245 if (d->testing_p)
18246 return true;
18247
18248 if (dremap.vmode != dfinal.vmode)
18249 {
18250 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
18251 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
18252 }
18253
18254 ok = expand_vec_perm_1 (&dremap);
18255 gcc_assert (ok);
18256
18257 emit_insn (seq);
18258 return true;
18259 }
18260
18261 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18262 a single vector cross-lane permutation into vpermq followed
18263 by any of the single insn permutations. */
18264
18265 static bool
18266 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
18267 {
18268 struct expand_vec_perm_d dremap, dfinal;
18269 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
18270 unsigned contents[2];
18271 bool ok;
18272
18273 if (!(TARGET_AVX2
18274 && (d->vmode == V32QImode || d->vmode == V16HImode)
18275 && d->one_operand_p))
18276 return false;
18277
18278 contents[0] = 0;
18279 contents[1] = 0;
18280 for (i = 0; i < nelt2; ++i)
18281 {
18282 contents[0] |= 1u << (d->perm[i] / nelt4);
18283 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
18284 }
18285
18286 for (i = 0; i < 2; ++i)
18287 {
18288 unsigned int cnt = 0;
18289 for (j = 0; j < 4; ++j)
18290 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
18291 return false;
18292 }
18293
18294 if (d->testing_p)
18295 return true;
18296
18297 dremap = *d;
18298 dremap.vmode = V4DImode;
18299 dremap.nelt = 4;
18300 dremap.target = gen_reg_rtx (V4DImode);
18301 dremap.op0 = gen_lowpart (V4DImode, d->op0);
18302 dremap.op1 = dremap.op0;
18303 dremap.one_operand_p = true;
18304 for (i = 0; i < 2; ++i)
18305 {
18306 unsigned int cnt = 0;
18307 for (j = 0; j < 4; ++j)
18308 if ((contents[i] & (1u << j)) != 0)
18309 dremap.perm[2 * i + cnt++] = j;
18310 for (; cnt < 2; ++cnt)
18311 dremap.perm[2 * i + cnt] = 0;
18312 }
18313
18314 dfinal = *d;
18315 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18316 dfinal.op1 = dfinal.op0;
18317 dfinal.one_operand_p = true;
18318 for (i = 0, j = 0; i < nelt; ++i)
18319 {
18320 if (i == nelt2)
18321 j = 2;
18322 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
18323 if ((d->perm[i] / nelt4) == dremap.perm[j])
18324 ;
18325 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
18326 dfinal.perm[i] |= nelt4;
18327 else
18328 gcc_unreachable ();
18329 }
18330
18331 ok = expand_vec_perm_1 (&dremap);
18332 gcc_assert (ok);
18333
18334 ok = expand_vec_perm_1 (&dfinal);
18335 gcc_assert (ok);
18336
18337 return true;
18338 }
18339
18340 static bool canonicalize_perm (struct expand_vec_perm_d *d);
18341
18342 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18343 a vector permutation using two instructions, vperm2f128 resp.
18344 vperm2i128 followed by any single in-lane permutation. */
18345
18346 static bool
18347 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
18348 {
18349 struct expand_vec_perm_d dfirst, dsecond;
18350 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
18351 bool ok;
18352
18353 if (!TARGET_AVX
18354 || GET_MODE_SIZE (d->vmode) != 32
18355 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
18356 return false;
18357
18358 dsecond = *d;
18359 dsecond.one_operand_p = false;
18360 dsecond.testing_p = true;
18361
18362 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18363 immediate. For perm < 16 the second permutation uses
18364 d->op0 as first operand, for perm >= 16 it uses d->op1
18365 as first operand. The second operand is the result of
18366 vperm2[fi]128. */
18367 for (perm = 0; perm < 32; perm++)
18368 {
18369 /* Ignore permutations which do not move anything cross-lane. */
18370 if (perm < 16)
18371 {
18372 /* The second shuffle for e.g. V4DFmode has
18373 0123 and ABCD operands.
18374 Ignore AB23, as 23 is already in the second lane
18375 of the first operand. */
18376 if ((perm & 0xc) == (1 << 2)) continue;
18377 /* And 01CD, as 01 is in the first lane of the first
18378 operand. */
18379 if ((perm & 3) == 0) continue;
18380 /* And 4567, as then the vperm2[fi]128 doesn't change
18381 anything on the original 4567 second operand. */
18382 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
18383 }
18384 else
18385 {
18386 /* The second shuffle for e.g. V4DFmode has
18387 4567 and ABCD operands.
18388 Ignore AB67, as 67 is already in the second lane
18389 of the first operand. */
18390 if ((perm & 0xc) == (3 << 2)) continue;
18391 /* And 45CD, as 45 is in the first lane of the first
18392 operand. */
18393 if ((perm & 3) == 2) continue;
18394 /* And 0123, as then the vperm2[fi]128 doesn't change
18395 anything on the original 0123 first operand. */
18396 if ((perm & 0xf) == (1 << 2)) continue;
18397 }
18398
18399 for (i = 0; i < nelt; i++)
18400 {
18401 j = d->perm[i] / nelt2;
18402 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
18403 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
18404 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
18405 dsecond.perm[i] = d->perm[i] & (nelt - 1);
18406 else
18407 break;
18408 }
18409
18410 if (i == nelt)
18411 {
18412 start_sequence ();
18413 ok = expand_vec_perm_1 (&dsecond);
18414 end_sequence ();
18415 }
18416 else
18417 ok = false;
18418
18419 if (ok)
18420 {
18421 if (d->testing_p)
18422 return true;
18423
18424 /* Found a usable second shuffle. dfirst will be
18425 vperm2f128 on d->op0 and d->op1. */
18426 dsecond.testing_p = false;
18427 dfirst = *d;
18428 dfirst.target = gen_reg_rtx (d->vmode);
18429 for (i = 0; i < nelt; i++)
18430 dfirst.perm[i] = (i & (nelt2 - 1))
18431 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
18432
18433 canonicalize_perm (&dfirst);
18434 ok = expand_vec_perm_1 (&dfirst);
18435 gcc_assert (ok);
18436
18437 /* And dsecond is some single insn shuffle, taking
18438 d->op0 and result of vperm2f128 (if perm < 16) or
18439 d->op1 and result of vperm2f128 (otherwise). */
18440 if (perm >= 16)
18441 dsecond.op0 = dsecond.op1;
18442 dsecond.op1 = dfirst.target;
18443
18444 ok = expand_vec_perm_1 (&dsecond);
18445 gcc_assert (ok);
18446
18447 return true;
18448 }
18449
18450 /* For one operand, the only useful vperm2f128 permutation is 0x01
18451 aka lanes swap. */
18452 if (d->one_operand_p)
18453 return false;
18454 }
18455
18456 return false;
18457 }
18458
18459 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18460 a two vector permutation using 2 intra-lane interleave insns
18461 and cross-lane shuffle for 32-byte vectors. */
18462
18463 static bool
18464 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
18465 {
18466 unsigned i, nelt;
18467 rtx (*gen) (rtx, rtx, rtx);
18468
18469 if (d->one_operand_p)
18470 return false;
18471 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
18472 ;
18473 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
18474 ;
18475 else
18476 return false;
18477
18478 nelt = d->nelt;
18479 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
18480 return false;
18481 for (i = 0; i < nelt; i += 2)
18482 if (d->perm[i] != d->perm[0] + i / 2
18483 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
18484 return false;
18485
18486 if (d->testing_p)
18487 return true;
18488
18489 switch (d->vmode)
18490 {
18491 case E_V32QImode:
18492 if (d->perm[0])
18493 gen = gen_vec_interleave_highv32qi;
18494 else
18495 gen = gen_vec_interleave_lowv32qi;
18496 break;
18497 case E_V16HImode:
18498 if (d->perm[0])
18499 gen = gen_vec_interleave_highv16hi;
18500 else
18501 gen = gen_vec_interleave_lowv16hi;
18502 break;
18503 case E_V8SImode:
18504 if (d->perm[0])
18505 gen = gen_vec_interleave_highv8si;
18506 else
18507 gen = gen_vec_interleave_lowv8si;
18508 break;
18509 case E_V4DImode:
18510 if (d->perm[0])
18511 gen = gen_vec_interleave_highv4di;
18512 else
18513 gen = gen_vec_interleave_lowv4di;
18514 break;
18515 case E_V8SFmode:
18516 if (d->perm[0])
18517 gen = gen_vec_interleave_highv8sf;
18518 else
18519 gen = gen_vec_interleave_lowv8sf;
18520 break;
18521 case E_V4DFmode:
18522 if (d->perm[0])
18523 gen = gen_vec_interleave_highv4df;
18524 else
18525 gen = gen_vec_interleave_lowv4df;
18526 break;
18527 default:
18528 gcc_unreachable ();
18529 }
18530
18531 emit_insn (gen (d->target, d->op0, d->op1));
18532 return true;
18533 }
18534
18535 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18536 a single vector permutation using a single intra-lane vector
18537 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18538 the non-swapped and swapped vectors together. */
18539
18540 static bool
18541 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18542 {
18543 struct expand_vec_perm_d dfirst, dsecond;
18544 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18545 rtx_insn *seq;
18546 bool ok;
18547 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18548
18549 if (!TARGET_AVX
18550 || TARGET_AVX2
18551 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18552 || !d->one_operand_p)
18553 return false;
18554
18555 dfirst = *d;
18556 for (i = 0; i < nelt; i++)
18557 dfirst.perm[i] = 0xff;
18558 for (i = 0, msk = 0; i < nelt; i++)
18559 {
18560 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18561 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18562 return false;
18563 dfirst.perm[j] = d->perm[i];
18564 if (j != i)
18565 msk |= (1 << i);
18566 }
18567 for (i = 0; i < nelt; i++)
18568 if (dfirst.perm[i] == 0xff)
18569 dfirst.perm[i] = i;
18570
18571 if (!d->testing_p)
18572 dfirst.target = gen_reg_rtx (dfirst.vmode);
18573
18574 start_sequence ();
18575 ok = expand_vec_perm_1 (&dfirst);
18576 seq = get_insns ();
18577 end_sequence ();
18578
18579 if (!ok)
18580 return false;
18581
18582 if (d->testing_p)
18583 return true;
18584
18585 emit_insn (seq);
18586
18587 dsecond = *d;
18588 dsecond.op0 = dfirst.target;
18589 dsecond.op1 = dfirst.target;
18590 dsecond.one_operand_p = true;
18591 dsecond.target = gen_reg_rtx (dsecond.vmode);
18592 for (i = 0; i < nelt; i++)
18593 dsecond.perm[i] = i ^ nelt2;
18594
18595 ok = expand_vec_perm_1 (&dsecond);
18596 gcc_assert (ok);
18597
18598 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18599 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18600 return true;
18601 }
18602
18603 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18604 permutation using two vperm2f128, followed by a vshufpd insn blending
18605 the two vectors together. */
18606
18607 static bool
18608 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18609 {
18610 struct expand_vec_perm_d dfirst, dsecond, dthird;
18611 bool ok;
18612
18613 if (!TARGET_AVX || (d->vmode != V4DFmode))
18614 return false;
18615
18616 if (d->testing_p)
18617 return true;
18618
18619 dfirst = *d;
18620 dsecond = *d;
18621 dthird = *d;
18622
18623 dfirst.perm[0] = (d->perm[0] & ~1);
18624 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18625 dfirst.perm[2] = (d->perm[2] & ~1);
18626 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18627 dsecond.perm[0] = (d->perm[1] & ~1);
18628 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18629 dsecond.perm[2] = (d->perm[3] & ~1);
18630 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18631 dthird.perm[0] = (d->perm[0] % 2);
18632 dthird.perm[1] = (d->perm[1] % 2) + 4;
18633 dthird.perm[2] = (d->perm[2] % 2) + 2;
18634 dthird.perm[3] = (d->perm[3] % 2) + 6;
18635
18636 dfirst.target = gen_reg_rtx (dfirst.vmode);
18637 dsecond.target = gen_reg_rtx (dsecond.vmode);
18638 dthird.op0 = dfirst.target;
18639 dthird.op1 = dsecond.target;
18640 dthird.one_operand_p = false;
18641
18642 canonicalize_perm (&dfirst);
18643 canonicalize_perm (&dsecond);
18644
18645 ok = expand_vec_perm_1 (&dfirst)
18646 && expand_vec_perm_1 (&dsecond)
18647 && expand_vec_perm_1 (&dthird);
18648
18649 gcc_assert (ok);
18650
18651 return true;
18652 }
18653
18654 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18655
18656 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18657 a two vector permutation using two intra-lane vector
18658 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18659 the non-swapped and swapped vectors together. */
18660
18661 static bool
18662 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18663 {
18664 struct expand_vec_perm_d dfirst, dsecond, dthird;
18665 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18666 rtx_insn *seq1, *seq2;
18667 bool ok;
18668 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18669
18670 if (!TARGET_AVX
18671 || TARGET_AVX2
18672 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18673 || d->one_operand_p)
18674 return false;
18675
18676 dfirst = *d;
18677 dsecond = *d;
18678 for (i = 0; i < nelt; i++)
18679 {
18680 dfirst.perm[i] = 0xff;
18681 dsecond.perm[i] = 0xff;
18682 }
18683 for (i = 0, msk = 0; i < nelt; i++)
18684 {
18685 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18686 if (j == i)
18687 {
18688 dfirst.perm[j] = d->perm[i];
18689 which1 |= (d->perm[i] < nelt ? 1 : 2);
18690 }
18691 else
18692 {
18693 dsecond.perm[j] = d->perm[i];
18694 which2 |= (d->perm[i] < nelt ? 1 : 2);
18695 msk |= (1U << i);
18696 }
18697 }
18698 if (msk == 0 || msk == (1U << nelt) - 1)
18699 return false;
18700
18701 if (!d->testing_p)
18702 {
18703 dfirst.target = gen_reg_rtx (dfirst.vmode);
18704 dsecond.target = gen_reg_rtx (dsecond.vmode);
18705 }
18706
18707 for (i = 0; i < nelt; i++)
18708 {
18709 if (dfirst.perm[i] == 0xff)
18710 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18711 if (dsecond.perm[i] == 0xff)
18712 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18713 }
18714 canonicalize_perm (&dfirst);
18715 start_sequence ();
18716 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18717 seq1 = get_insns ();
18718 end_sequence ();
18719
18720 if (!ok)
18721 return false;
18722
18723 canonicalize_perm (&dsecond);
18724 start_sequence ();
18725 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18726 seq2 = get_insns ();
18727 end_sequence ();
18728
18729 if (!ok)
18730 return false;
18731
18732 if (d->testing_p)
18733 return true;
18734
18735 emit_insn (seq1);
18736 emit_insn (seq2);
18737
18738 dthird = *d;
18739 dthird.op0 = dsecond.target;
18740 dthird.op1 = dsecond.target;
18741 dthird.one_operand_p = true;
18742 dthird.target = gen_reg_rtx (dthird.vmode);
18743 for (i = 0; i < nelt; i++)
18744 dthird.perm[i] = i ^ nelt2;
18745
18746 ok = expand_vec_perm_1 (&dthird);
18747 gcc_assert (ok);
18748
18749 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18750 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18751 return true;
18752 }
18753
18754 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18755 permutation with two pshufb insns and an ior. We should have already
18756 failed all two instruction sequences. */
18757
18758 static bool
18759 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18760 {
18761 rtx rperm[2][16], vperm, l, h, op, m128;
18762 unsigned int i, nelt, eltsz;
18763
18764 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18765 return false;
18766 gcc_assert (!d->one_operand_p);
18767
18768 if (d->testing_p)
18769 return true;
18770
18771 nelt = d->nelt;
18772 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18773
18774 /* Generate two permutation masks. If the required element is within
18775 the given vector it is shuffled into the proper lane. If the required
18776 element is in the other vector, force a zero into the lane by setting
18777 bit 7 in the permutation mask. */
18778 m128 = GEN_INT (-128);
18779 for (i = 0; i < nelt; ++i)
18780 {
18781 unsigned j, e = d->perm[i];
18782 unsigned which = (e >= nelt);
18783 if (e >= nelt)
18784 e -= nelt;
18785
18786 for (j = 0; j < eltsz; ++j)
18787 {
18788 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18789 rperm[1-which][i*eltsz + j] = m128;
18790 }
18791 }
18792
18793 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18794 vperm = force_reg (V16QImode, vperm);
18795
18796 l = gen_reg_rtx (V16QImode);
18797 op = gen_lowpart (V16QImode, d->op0);
18798 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18799
18800 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18801 vperm = force_reg (V16QImode, vperm);
18802
18803 h = gen_reg_rtx (V16QImode);
18804 op = gen_lowpart (V16QImode, d->op1);
18805 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18806
18807 op = d->target;
18808 if (d->vmode != V16QImode)
18809 op = gen_reg_rtx (V16QImode);
18810 emit_insn (gen_iorv16qi3 (op, l, h));
18811 if (op != d->target)
18812 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18813
18814 return true;
18815 }
18816
18817 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
18818 with two vpshufb insns, vpermq and vpor. We should have already failed
18819 all two or three instruction sequences. */
18820
18821 static bool
18822 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18823 {
18824 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18825 unsigned int i, nelt, eltsz;
18826
18827 if (!TARGET_AVX2
18828 || !d->one_operand_p
18829 || (d->vmode != V32QImode && d->vmode != V16HImode))
18830 return false;
18831
18832 if (d->testing_p)
18833 return true;
18834
18835 nelt = d->nelt;
18836 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18837
18838 /* Generate two permutation masks. If the required element is within
18839 the same lane, it is shuffled in. If the required element from the
18840 other lane, force a zero by setting bit 7 in the permutation mask.
18841 In the other mask the mask has non-negative elements if element
18842 is requested from the other lane, but also moved to the other lane,
18843 so that the result of vpshufb can have the two V2TImode halves
18844 swapped. */
18845 m128 = GEN_INT (-128);
18846 for (i = 0; i < nelt; ++i)
18847 {
18848 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18849 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18850
18851 for (j = 0; j < eltsz; ++j)
18852 {
18853 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18854 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18855 }
18856 }
18857
18858 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18859 vperm = force_reg (V32QImode, vperm);
18860
18861 h = gen_reg_rtx (V32QImode);
18862 op = gen_lowpart (V32QImode, d->op0);
18863 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18864
18865 /* Swap the 128-byte lanes of h into hp. */
18866 hp = gen_reg_rtx (V4DImode);
18867 op = gen_lowpart (V4DImode, h);
18868 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18869 const1_rtx));
18870
18871 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18872 vperm = force_reg (V32QImode, vperm);
18873
18874 l = gen_reg_rtx (V32QImode);
18875 op = gen_lowpart (V32QImode, d->op0);
18876 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18877
18878 op = d->target;
18879 if (d->vmode != V32QImode)
18880 op = gen_reg_rtx (V32QImode);
18881 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18882 if (op != d->target)
18883 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18884
18885 return true;
18886 }
18887
18888 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18889 and extract-odd permutations of two V32QImode and V16QImode operand
18890 with two vpshufb insns, vpor and vpermq. We should have already
18891 failed all two or three instruction sequences. */
18892
18893 static bool
18894 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18895 {
18896 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18897 unsigned int i, nelt, eltsz;
18898
18899 if (!TARGET_AVX2
18900 || d->one_operand_p
18901 || (d->vmode != V32QImode && d->vmode != V16HImode))
18902 return false;
18903
18904 for (i = 0; i < d->nelt; ++i)
18905 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18906 return false;
18907
18908 if (d->testing_p)
18909 return true;
18910
18911 nelt = d->nelt;
18912 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18913
18914 /* Generate two permutation masks. In the first permutation mask
18915 the first quarter will contain indexes for the first half
18916 of the op0, the second quarter will contain bit 7 set, third quarter
18917 will contain indexes for the second half of the op0 and the
18918 last quarter bit 7 set. In the second permutation mask
18919 the first quarter will contain bit 7 set, the second quarter
18920 indexes for the first half of the op1, the third quarter bit 7 set
18921 and last quarter indexes for the second half of the op1.
18922 I.e. the first mask e.g. for V32QImode extract even will be:
18923 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18924 (all values masked with 0xf except for -128) and second mask
18925 for extract even will be
18926 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18927 m128 = GEN_INT (-128);
18928 for (i = 0; i < nelt; ++i)
18929 {
18930 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18931 unsigned which = d->perm[i] >= nelt;
18932 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18933
18934 for (j = 0; j < eltsz; ++j)
18935 {
18936 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18937 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18938 }
18939 }
18940
18941 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18942 vperm = force_reg (V32QImode, vperm);
18943
18944 l = gen_reg_rtx (V32QImode);
18945 op = gen_lowpart (V32QImode, d->op0);
18946 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18947
18948 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18949 vperm = force_reg (V32QImode, vperm);
18950
18951 h = gen_reg_rtx (V32QImode);
18952 op = gen_lowpart (V32QImode, d->op1);
18953 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18954
18955 ior = gen_reg_rtx (V32QImode);
18956 emit_insn (gen_iorv32qi3 (ior, l, h));
18957
18958 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18959 op = gen_reg_rtx (V4DImode);
18960 ior = gen_lowpart (V4DImode, ior);
18961 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18962 const1_rtx, GEN_INT (3)));
18963 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18964
18965 return true;
18966 }
18967
18968 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18969 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18970 with two "and" and "pack" or two "shift" and "pack" insns. We should
18971 have already failed all two instruction sequences. */
18972
18973 static bool
18974 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18975 {
18976 rtx op, dop0, dop1, t;
18977 unsigned i, odd, c, s, nelt = d->nelt;
18978 bool end_perm = false;
18979 machine_mode half_mode;
18980 rtx (*gen_and) (rtx, rtx, rtx);
18981 rtx (*gen_pack) (rtx, rtx, rtx);
18982 rtx (*gen_shift) (rtx, rtx, rtx);
18983
18984 if (d->one_operand_p)
18985 return false;
18986
18987 switch (d->vmode)
18988 {
18989 case E_V8HImode:
18990 /* Required for "pack". */
18991 if (!TARGET_SSE4_1)
18992 return false;
18993 c = 0xffff;
18994 s = 16;
18995 half_mode = V4SImode;
18996 gen_and = gen_andv4si3;
18997 gen_pack = gen_sse4_1_packusdw;
18998 gen_shift = gen_lshrv4si3;
18999 break;
19000 case E_V16QImode:
19001 /* No check as all instructions are SSE2. */
19002 c = 0xff;
19003 s = 8;
19004 half_mode = V8HImode;
19005 gen_and = gen_andv8hi3;
19006 gen_pack = gen_sse2_packuswb;
19007 gen_shift = gen_lshrv8hi3;
19008 break;
19009 case E_V16HImode:
19010 if (!TARGET_AVX2)
19011 return false;
19012 c = 0xffff;
19013 s = 16;
19014 half_mode = V8SImode;
19015 gen_and = gen_andv8si3;
19016 gen_pack = gen_avx2_packusdw;
19017 gen_shift = gen_lshrv8si3;
19018 end_perm = true;
19019 break;
19020 case E_V32QImode:
19021 if (!TARGET_AVX2)
19022 return false;
19023 c = 0xff;
19024 s = 8;
19025 half_mode = V16HImode;
19026 gen_and = gen_andv16hi3;
19027 gen_pack = gen_avx2_packuswb;
19028 gen_shift = gen_lshrv16hi3;
19029 end_perm = true;
19030 break;
19031 default:
19032 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
19033 general shuffles. */
19034 return false;
19035 }
19036
19037 /* Check that permutation is even or odd. */
19038 odd = d->perm[0];
19039 if (odd > 1)
19040 return false;
19041
19042 for (i = 1; i < nelt; ++i)
19043 if (d->perm[i] != 2 * i + odd)
19044 return false;
19045
19046 if (d->testing_p)
19047 return true;
19048
19049 dop0 = gen_reg_rtx (half_mode);
19050 dop1 = gen_reg_rtx (half_mode);
19051 if (odd == 0)
19052 {
19053 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
19054 t = force_reg (half_mode, t);
19055 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
19056 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
19057 }
19058 else
19059 {
19060 emit_insn (gen_shift (dop0,
19061 gen_lowpart (half_mode, d->op0),
19062 GEN_INT (s)));
19063 emit_insn (gen_shift (dop1,
19064 gen_lowpart (half_mode, d->op1),
19065 GEN_INT (s)));
19066 }
19067 /* In AVX2 for 256 bit case we need to permute pack result. */
19068 if (TARGET_AVX2 && end_perm)
19069 {
19070 op = gen_reg_rtx (d->vmode);
19071 t = gen_reg_rtx (V4DImode);
19072 emit_insn (gen_pack (op, dop0, dop1));
19073 emit_insn (gen_avx2_permv4di_1 (t,
19074 gen_lowpart (V4DImode, op),
19075 const0_rtx,
19076 const2_rtx,
19077 const1_rtx,
19078 GEN_INT (3)));
19079 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
19080 }
19081 else
19082 emit_insn (gen_pack (d->target, dop0, dop1));
19083
19084 return true;
19085 }
19086
19087 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19088 and extract-odd permutations of two V64QI operands
19089 with two "shifts", two "truncs" and one "concat" insns for "odd"
19090 and two "truncs" and one concat insn for "even."
19091 Have already failed all two instruction sequences. */
19092
19093 static bool
19094 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
19095 {
19096 rtx t1, t2, t3, t4;
19097 unsigned i, odd, nelt = d->nelt;
19098
19099 if (!TARGET_AVX512BW
19100 || d->one_operand_p
19101 || d->vmode != V64QImode)
19102 return false;
19103
19104 /* Check that permutation is even or odd. */
19105 odd = d->perm[0];
19106 if (odd > 1)
19107 return false;
19108
19109 for (i = 1; i < nelt; ++i)
19110 if (d->perm[i] != 2 * i + odd)
19111 return false;
19112
19113 if (d->testing_p)
19114 return true;
19115
19116
19117 if (odd)
19118 {
19119 t1 = gen_reg_rtx (V32HImode);
19120 t2 = gen_reg_rtx (V32HImode);
19121 emit_insn (gen_lshrv32hi3 (t1,
19122 gen_lowpart (V32HImode, d->op0),
19123 GEN_INT (8)));
19124 emit_insn (gen_lshrv32hi3 (t2,
19125 gen_lowpart (V32HImode, d->op1),
19126 GEN_INT (8)));
19127 }
19128 else
19129 {
19130 t1 = gen_lowpart (V32HImode, d->op0);
19131 t2 = gen_lowpart (V32HImode, d->op1);
19132 }
19133
19134 t3 = gen_reg_rtx (V32QImode);
19135 t4 = gen_reg_rtx (V32QImode);
19136 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
19137 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
19138 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
19139
19140 return true;
19141 }
19142
19143 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19144 and extract-odd permutations. */
19145
19146 static bool
19147 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
19148 {
19149 rtx t1, t2, t3, t4, t5;
19150
19151 switch (d->vmode)
19152 {
19153 case E_V4DFmode:
19154 if (d->testing_p)
19155 break;
19156 t1 = gen_reg_rtx (V4DFmode);
19157 t2 = gen_reg_rtx (V4DFmode);
19158
19159 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19160 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
19161 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
19162
19163 /* Now an unpck[lh]pd will produce the result required. */
19164 if (odd)
19165 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
19166 else
19167 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
19168 emit_insn (t3);
19169 break;
19170
19171 case E_V8SFmode:
19172 {
19173 int mask = odd ? 0xdd : 0x88;
19174
19175 if (d->testing_p)
19176 break;
19177 t1 = gen_reg_rtx (V8SFmode);
19178 t2 = gen_reg_rtx (V8SFmode);
19179 t3 = gen_reg_rtx (V8SFmode);
19180
19181 /* Shuffle within the 128-bit lanes to produce:
19182 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19183 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
19184 GEN_INT (mask)));
19185
19186 /* Shuffle the lanes around to produce:
19187 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19188 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
19189 GEN_INT (0x3)));
19190
19191 /* Shuffle within the 128-bit lanes to produce:
19192 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19193 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
19194
19195 /* Shuffle within the 128-bit lanes to produce:
19196 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19197 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
19198
19199 /* Shuffle the lanes around to produce:
19200 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19201 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
19202 GEN_INT (0x20)));
19203 }
19204 break;
19205
19206 case E_V2DFmode:
19207 case E_V4SFmode:
19208 case E_V2DImode:
19209 case E_V2SImode:
19210 case E_V4SImode:
19211 /* These are always directly implementable by expand_vec_perm_1. */
19212 gcc_unreachable ();
19213
19214 case E_V2SFmode:
19215 gcc_assert (TARGET_MMX_WITH_SSE);
19216 /* We have no suitable instructions. */
19217 if (d->testing_p)
19218 return false;
19219 break;
19220
19221 case E_V4HImode:
19222 if (d->testing_p)
19223 break;
19224 /* We need 2*log2(N)-1 operations to achieve odd/even
19225 with interleave. */
19226 t1 = gen_reg_rtx (V4HImode);
19227 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
19228 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
19229 if (odd)
19230 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
19231 else
19232 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
19233 emit_insn (t2);
19234 break;
19235
19236 case E_V8HImode:
19237 if (TARGET_SSE4_1)
19238 return expand_vec_perm_even_odd_pack (d);
19239 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
19240 return expand_vec_perm_pshufb2 (d);
19241 else
19242 {
19243 if (d->testing_p)
19244 break;
19245 /* We need 2*log2(N)-1 operations to achieve odd/even
19246 with interleave. */
19247 t1 = gen_reg_rtx (V8HImode);
19248 t2 = gen_reg_rtx (V8HImode);
19249 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
19250 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
19251 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
19252 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
19253 if (odd)
19254 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
19255 else
19256 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
19257 emit_insn (t3);
19258 }
19259 break;
19260
19261 case E_V16QImode:
19262 return expand_vec_perm_even_odd_pack (d);
19263
19264 case E_V16HImode:
19265 case E_V32QImode:
19266 return expand_vec_perm_even_odd_pack (d);
19267
19268 case E_V64QImode:
19269 return expand_vec_perm_even_odd_trunc (d);
19270
19271 case E_V4DImode:
19272 if (!TARGET_AVX2)
19273 {
19274 struct expand_vec_perm_d d_copy = *d;
19275 d_copy.vmode = V4DFmode;
19276 if (d->testing_p)
19277 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
19278 else
19279 d_copy.target = gen_reg_rtx (V4DFmode);
19280 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
19281 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
19282 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19283 {
19284 if (!d->testing_p)
19285 emit_move_insn (d->target,
19286 gen_lowpart (V4DImode, d_copy.target));
19287 return true;
19288 }
19289 return false;
19290 }
19291
19292 if (d->testing_p)
19293 break;
19294
19295 t1 = gen_reg_rtx (V4DImode);
19296 t2 = gen_reg_rtx (V4DImode);
19297
19298 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19299 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
19300 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
19301
19302 /* Now an vpunpck[lh]qdq will produce the result required. */
19303 if (odd)
19304 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
19305 else
19306 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
19307 emit_insn (t3);
19308 break;
19309
19310 case E_V8SImode:
19311 if (!TARGET_AVX2)
19312 {
19313 struct expand_vec_perm_d d_copy = *d;
19314 d_copy.vmode = V8SFmode;
19315 if (d->testing_p)
19316 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
19317 else
19318 d_copy.target = gen_reg_rtx (V8SFmode);
19319 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
19320 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
19321 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19322 {
19323 if (!d->testing_p)
19324 emit_move_insn (d->target,
19325 gen_lowpart (V8SImode, d_copy.target));
19326 return true;
19327 }
19328 return false;
19329 }
19330
19331 if (d->testing_p)
19332 break;
19333
19334 t1 = gen_reg_rtx (V8SImode);
19335 t2 = gen_reg_rtx (V8SImode);
19336 t3 = gen_reg_rtx (V4DImode);
19337 t4 = gen_reg_rtx (V4DImode);
19338 t5 = gen_reg_rtx (V4DImode);
19339
19340 /* Shuffle the lanes around into
19341 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19342 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
19343 gen_lowpart (V4DImode, d->op1),
19344 GEN_INT (0x20)));
19345 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
19346 gen_lowpart (V4DImode, d->op1),
19347 GEN_INT (0x31)));
19348
19349 /* Swap the 2nd and 3rd position in each lane into
19350 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19351 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
19352 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19353 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
19354 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19355
19356 /* Now an vpunpck[lh]qdq will produce
19357 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19358 if (odd)
19359 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
19360 gen_lowpart (V4DImode, t2));
19361 else
19362 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
19363 gen_lowpart (V4DImode, t2));
19364 emit_insn (t3);
19365 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
19366 break;
19367
19368 default:
19369 gcc_unreachable ();
19370 }
19371
19372 return true;
19373 }
19374
19375 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19376 extract-even and extract-odd permutations. */
19377
19378 static bool
19379 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
19380 {
19381 unsigned i, odd, nelt = d->nelt;
19382
19383 odd = d->perm[0];
19384 if (odd != 0 && odd != 1)
19385 return false;
19386
19387 for (i = 1; i < nelt; ++i)
19388 if (d->perm[i] != 2 * i + odd)
19389 return false;
19390
19391 return expand_vec_perm_even_odd_1 (d, odd);
19392 }
19393
19394 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19395 permutations. We assume that expand_vec_perm_1 has already failed. */
19396
19397 static bool
19398 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
19399 {
19400 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
19401 machine_mode vmode = d->vmode;
19402 unsigned char perm2[4];
19403 rtx op0 = d->op0, dest;
19404 bool ok;
19405
19406 switch (vmode)
19407 {
19408 case E_V4DFmode:
19409 case E_V8SFmode:
19410 /* These are special-cased in sse.md so that we can optionally
19411 use the vbroadcast instruction. They expand to two insns
19412 if the input happens to be in a register. */
19413 gcc_unreachable ();
19414
19415 case E_V2DFmode:
19416 case E_V2SFmode:
19417 case E_V4SFmode:
19418 case E_V2DImode:
19419 case E_V2SImode:
19420 case E_V4SImode:
19421 /* These are always implementable using standard shuffle patterns. */
19422 gcc_unreachable ();
19423
19424 case E_V8HImode:
19425 case E_V16QImode:
19426 /* These can be implemented via interleave. We save one insn by
19427 stopping once we have promoted to V4SImode and then use pshufd. */
19428 if (d->testing_p)
19429 return true;
19430 do
19431 {
19432 rtx dest;
19433 rtx (*gen) (rtx, rtx, rtx)
19434 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
19435 : gen_vec_interleave_lowv8hi;
19436
19437 if (elt >= nelt2)
19438 {
19439 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
19440 : gen_vec_interleave_highv8hi;
19441 elt -= nelt2;
19442 }
19443 nelt2 /= 2;
19444
19445 dest = gen_reg_rtx (vmode);
19446 emit_insn (gen (dest, op0, op0));
19447 vmode = get_mode_wider_vector (vmode);
19448 op0 = gen_lowpart (vmode, dest);
19449 }
19450 while (vmode != V4SImode);
19451
19452 memset (perm2, elt, 4);
19453 dest = gen_reg_rtx (V4SImode);
19454 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
19455 gcc_assert (ok);
19456 if (!d->testing_p)
19457 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
19458 return true;
19459
19460 case E_V64QImode:
19461 case E_V32QImode:
19462 case E_V16HImode:
19463 case E_V8SImode:
19464 case E_V4DImode:
19465 /* For AVX2 broadcasts of the first element vpbroadcast* or
19466 vpermq should be used by expand_vec_perm_1. */
19467 gcc_assert (!TARGET_AVX2 || d->perm[0]);
19468 return false;
19469
19470 default:
19471 gcc_unreachable ();
19472 }
19473 }
19474
19475 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19476 broadcast permutations. */
19477
19478 static bool
19479 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
19480 {
19481 unsigned i, elt, nelt = d->nelt;
19482
19483 if (!d->one_operand_p)
19484 return false;
19485
19486 elt = d->perm[0];
19487 for (i = 1; i < nelt; ++i)
19488 if (d->perm[i] != elt)
19489 return false;
19490
19491 return expand_vec_perm_broadcast_1 (d);
19492 }
19493
19494 /* Implement arbitrary permutations of two V64QImode operands
19495 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19496 static bool
19497 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
19498 {
19499 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
19500 return false;
19501
19502 if (d->testing_p)
19503 return true;
19504
19505 struct expand_vec_perm_d ds[2];
19506 rtx rperm[128], vperm, target0, target1;
19507 unsigned int i, nelt;
19508 machine_mode vmode;
19509
19510 nelt = d->nelt;
19511 vmode = V64QImode;
19512
19513 for (i = 0; i < 2; i++)
19514 {
19515 ds[i] = *d;
19516 ds[i].vmode = V32HImode;
19517 ds[i].nelt = 32;
19518 ds[i].target = gen_reg_rtx (V32HImode);
19519 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19520 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19521 }
19522
19523 /* Prepare permutations such that the first one takes care of
19524 putting the even bytes into the right positions or one higher
19525 positions (ds[0]) and the second one takes care of
19526 putting the odd bytes into the right positions or one below
19527 (ds[1]). */
19528
19529 for (i = 0; i < nelt; i++)
19530 {
19531 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19532 if (i & 1)
19533 {
19534 rperm[i] = constm1_rtx;
19535 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19536 }
19537 else
19538 {
19539 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19540 rperm[i + 64] = constm1_rtx;
19541 }
19542 }
19543
19544 bool ok = expand_vec_perm_1 (&ds[0]);
19545 gcc_assert (ok);
19546 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19547
19548 ok = expand_vec_perm_1 (&ds[1]);
19549 gcc_assert (ok);
19550 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19551
19552 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19553 vperm = force_reg (vmode, vperm);
19554 target0 = gen_reg_rtx (V64QImode);
19555 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19556
19557 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19558 vperm = force_reg (vmode, vperm);
19559 target1 = gen_reg_rtx (V64QImode);
19560 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19561
19562 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19563 return true;
19564 }
19565
19566 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
19567 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19568 all the shorter instruction sequences. */
19569
19570 static bool
19571 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19572 {
19573 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19574 unsigned int i, nelt, eltsz;
19575 bool used[4];
19576
19577 if (!TARGET_AVX2
19578 || d->one_operand_p
19579 || (d->vmode != V32QImode && d->vmode != V16HImode))
19580 return false;
19581
19582 if (d->testing_p)
19583 return true;
19584
19585 nelt = d->nelt;
19586 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19587
19588 /* Generate 4 permutation masks. If the required element is within
19589 the same lane, it is shuffled in. If the required element from the
19590 other lane, force a zero by setting bit 7 in the permutation mask.
19591 In the other mask the mask has non-negative elements if element
19592 is requested from the other lane, but also moved to the other lane,
19593 so that the result of vpshufb can have the two V2TImode halves
19594 swapped. */
19595 m128 = GEN_INT (-128);
19596 for (i = 0; i < 32; ++i)
19597 {
19598 rperm[0][i] = m128;
19599 rperm[1][i] = m128;
19600 rperm[2][i] = m128;
19601 rperm[3][i] = m128;
19602 }
19603 used[0] = false;
19604 used[1] = false;
19605 used[2] = false;
19606 used[3] = false;
19607 for (i = 0; i < nelt; ++i)
19608 {
19609 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19610 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19611 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19612
19613 for (j = 0; j < eltsz; ++j)
19614 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19615 used[which] = true;
19616 }
19617
19618 for (i = 0; i < 2; ++i)
19619 {
19620 if (!used[2 * i + 1])
19621 {
19622 h[i] = NULL_RTX;
19623 continue;
19624 }
19625 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19626 gen_rtvec_v (32, rperm[2 * i + 1]));
19627 vperm = force_reg (V32QImode, vperm);
19628 h[i] = gen_reg_rtx (V32QImode);
19629 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19630 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19631 }
19632
19633 /* Swap the 128-byte lanes of h[X]. */
19634 for (i = 0; i < 2; ++i)
19635 {
19636 if (h[i] == NULL_RTX)
19637 continue;
19638 op = gen_reg_rtx (V4DImode);
19639 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19640 const2_rtx, GEN_INT (3), const0_rtx,
19641 const1_rtx));
19642 h[i] = gen_lowpart (V32QImode, op);
19643 }
19644
19645 for (i = 0; i < 2; ++i)
19646 {
19647 if (!used[2 * i])
19648 {
19649 l[i] = NULL_RTX;
19650 continue;
19651 }
19652 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19653 vperm = force_reg (V32QImode, vperm);
19654 l[i] = gen_reg_rtx (V32QImode);
19655 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19656 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19657 }
19658
19659 for (i = 0; i < 2; ++i)
19660 {
19661 if (h[i] && l[i])
19662 {
19663 op = gen_reg_rtx (V32QImode);
19664 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19665 l[i] = op;
19666 }
19667 else if (h[i])
19668 l[i] = h[i];
19669 }
19670
19671 gcc_assert (l[0] && l[1]);
19672 op = d->target;
19673 if (d->vmode != V32QImode)
19674 op = gen_reg_rtx (V32QImode);
19675 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19676 if (op != d->target)
19677 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19678 return true;
19679 }
19680
19681 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19682 taken care of, perform the expansion in D and return true on success. */
19683
19684 static bool
19685 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19686 {
19687 /* Try a single instruction expansion. */
19688 if (expand_vec_perm_1 (d))
19689 return true;
19690
19691 /* Try sequences of two instructions. */
19692
19693 if (expand_vec_perm_pshuflw_pshufhw (d))
19694 return true;
19695
19696 if (expand_vec_perm_palignr (d, false))
19697 return true;
19698
19699 if (expand_vec_perm_interleave2 (d))
19700 return true;
19701
19702 if (expand_vec_perm_broadcast (d))
19703 return true;
19704
19705 if (expand_vec_perm_vpermq_perm_1 (d))
19706 return true;
19707
19708 if (expand_vec_perm_vperm2f128 (d))
19709 return true;
19710
19711 if (expand_vec_perm_pblendv (d))
19712 return true;
19713
19714 /* Try sequences of three instructions. */
19715
19716 if (expand_vec_perm_even_odd_pack (d))
19717 return true;
19718
19719 if (expand_vec_perm_2vperm2f128_vshuf (d))
19720 return true;
19721
19722 if (expand_vec_perm_pshufb2 (d))
19723 return true;
19724
19725 if (expand_vec_perm_interleave3 (d))
19726 return true;
19727
19728 if (expand_vec_perm_vperm2f128_vblend (d))
19729 return true;
19730
19731 /* Try sequences of four instructions. */
19732
19733 if (expand_vec_perm_even_odd_trunc (d))
19734 return true;
19735 if (expand_vec_perm_vpshufb2_vpermq (d))
19736 return true;
19737
19738 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19739 return true;
19740
19741 if (expand_vec_perm_vpermt2_vpshub2 (d))
19742 return true;
19743
19744 /* ??? Look for narrow permutations whose element orderings would
19745 allow the promotion to a wider mode. */
19746
19747 /* ??? Look for sequences of interleave or a wider permute that place
19748 the data into the correct lanes for a half-vector shuffle like
19749 pshuf[lh]w or vpermilps. */
19750
19751 /* ??? Look for sequences of interleave that produce the desired results.
19752 The combinatorics of punpck[lh] get pretty ugly... */
19753
19754 if (expand_vec_perm_even_odd (d))
19755 return true;
19756
19757 /* Even longer sequences. */
19758 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19759 return true;
19760
19761 /* See if we can get the same permutation in different vector integer
19762 mode. */
19763 struct expand_vec_perm_d nd;
19764 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19765 {
19766 if (!d->testing_p)
19767 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19768 return true;
19769 }
19770
19771 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19772 if (expand_vec_perm2_vperm2f128_vblend (d))
19773 return true;
19774
19775 return false;
19776 }
19777
19778 /* If a permutation only uses one operand, make it clear. Returns true
19779 if the permutation references both operands. */
19780
19781 static bool
19782 canonicalize_perm (struct expand_vec_perm_d *d)
19783 {
19784 int i, which, nelt = d->nelt;
19785
19786 for (i = which = 0; i < nelt; ++i)
19787 which |= (d->perm[i] < nelt ? 1 : 2);
19788
19789 d->one_operand_p = true;
19790 switch (which)
19791 {
19792 default:
19793 gcc_unreachable();
19794
19795 case 3:
19796 if (!rtx_equal_p (d->op0, d->op1))
19797 {
19798 d->one_operand_p = false;
19799 break;
19800 }
19801 /* The elements of PERM do not suggest that only the first operand
19802 is used, but both operands are identical. Allow easier matching
19803 of the permutation by folding the permutation into the single
19804 input vector. */
19805 /* FALLTHRU */
19806
19807 case 2:
19808 for (i = 0; i < nelt; ++i)
19809 d->perm[i] &= nelt - 1;
19810 d->op0 = d->op1;
19811 break;
19812
19813 case 1:
19814 d->op1 = d->op0;
19815 break;
19816 }
19817
19818 return (which == 3);
19819 }
19820
19821 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19822
19823 bool
19824 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19825 rtx op1, const vec_perm_indices &sel)
19826 {
19827 struct expand_vec_perm_d d;
19828 unsigned char perm[MAX_VECT_LEN];
19829 unsigned int i, nelt, which;
19830 bool two_args;
19831
19832 d.target = target;
19833 d.op0 = op0;
19834 d.op1 = op1;
19835
19836 d.vmode = vmode;
19837 gcc_assert (VECTOR_MODE_P (d.vmode));
19838 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19839 d.testing_p = !target;
19840
19841 gcc_assert (sel.length () == nelt);
19842 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19843
19844 /* Given sufficient ISA support we can just return true here
19845 for selected vector modes. */
19846 switch (d.vmode)
19847 {
19848 case E_V16SFmode:
19849 case E_V16SImode:
19850 case E_V8DImode:
19851 case E_V8DFmode:
19852 if (!TARGET_AVX512F)
19853 return false;
19854 /* All implementable with a single vperm[it]2 insn. */
19855 if (d.testing_p)
19856 return true;
19857 break;
19858 case E_V32HImode:
19859 if (!TARGET_AVX512BW)
19860 return false;
19861 if (d.testing_p)
19862 /* All implementable with a single vperm[it]2 insn. */
19863 return true;
19864 break;
19865 case E_V64QImode:
19866 if (!TARGET_AVX512BW)
19867 return false;
19868 if (d.testing_p)
19869 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19870 return true;
19871 break;
19872 case E_V8SImode:
19873 case E_V8SFmode:
19874 case E_V4DFmode:
19875 case E_V4DImode:
19876 if (!TARGET_AVX)
19877 return false;
19878 if (d.testing_p && TARGET_AVX512VL)
19879 /* All implementable with a single vperm[it]2 insn. */
19880 return true;
19881 break;
19882 case E_V16HImode:
19883 if (!TARGET_SSE2)
19884 return false;
19885 if (d.testing_p && TARGET_AVX2)
19886 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19887 return true;
19888 break;
19889 case E_V32QImode:
19890 if (!TARGET_SSE2)
19891 return false;
19892 if (d.testing_p && TARGET_AVX2)
19893 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19894 return true;
19895 break;
19896 case E_V8HImode:
19897 case E_V16QImode:
19898 if (!TARGET_SSE2)
19899 return false;
19900 /* Fall through. */
19901 case E_V4SImode:
19902 case E_V4SFmode:
19903 if (!TARGET_SSE)
19904 return false;
19905 /* All implementable with a single vpperm insn. */
19906 if (d.testing_p && TARGET_XOP)
19907 return true;
19908 /* All implementable with 2 pshufb + 1 ior. */
19909 if (d.testing_p && TARGET_SSSE3)
19910 return true;
19911 break;
19912 case E_V2SFmode:
19913 case E_V2SImode:
19914 case E_V4HImode:
19915 if (!TARGET_MMX_WITH_SSE)
19916 return false;
19917 break;
19918 case E_V2DImode:
19919 case E_V2DFmode:
19920 if (!TARGET_SSE)
19921 return false;
19922 /* All implementable with shufpd or unpck[lh]pd. */
19923 if (d.testing_p)
19924 return true;
19925 break;
19926 default:
19927 return false;
19928 }
19929
19930 for (i = which = 0; i < nelt; ++i)
19931 {
19932 unsigned char e = sel[i];
19933 gcc_assert (e < 2 * nelt);
19934 d.perm[i] = e;
19935 perm[i] = e;
19936 which |= (e < nelt ? 1 : 2);
19937 }
19938
19939 if (d.testing_p)
19940 {
19941 /* For all elements from second vector, fold the elements to first. */
19942 if (which == 2)
19943 for (i = 0; i < nelt; ++i)
19944 d.perm[i] -= nelt;
19945
19946 /* Check whether the mask can be applied to the vector type. */
19947 d.one_operand_p = (which != 3);
19948
19949 /* Implementable with shufps or pshufd. */
19950 if (d.one_operand_p
19951 && (d.vmode == V4SFmode || d.vmode == V2SFmode
19952 || d.vmode == V4SImode || d.vmode == V2SImode))
19953 return true;
19954
19955 /* Otherwise we have to go through the motions and see if we can
19956 figure out how to generate the requested permutation. */
19957 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19958 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19959 if (!d.one_operand_p)
19960 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19961
19962 start_sequence ();
19963 bool ret = ix86_expand_vec_perm_const_1 (&d);
19964 end_sequence ();
19965
19966 return ret;
19967 }
19968
19969 two_args = canonicalize_perm (&d);
19970
19971 /* If one of the operands is a zero vector, try to match pmovzx. */
19972 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
19973 {
19974 struct expand_vec_perm_d dzero = d;
19975 if (d.op0 == CONST0_RTX (vmode))
19976 {
19977 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
19978 std::swap (dzero.op0, dzero.op1);
19979 for (i = 0; i < nelt; ++i)
19980 dzero.perm[i] ^= nelt;
19981 }
19982 else
19983 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
19984
19985 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
19986 dzero.perm, nelt, dzero.testing_p))
19987 return true;
19988 }
19989
19990 /* Force operands into registers. */
19991 rtx nop0 = force_reg (vmode, d.op0);
19992 if (d.op0 == d.op1)
19993 d.op1 = nop0;
19994 d.op0 = nop0;
19995 d.op1 = force_reg (vmode, d.op1);
19996
19997 if (ix86_expand_vec_perm_const_1 (&d))
19998 return true;
19999
20000 /* If the selector says both arguments are needed, but the operands are the
20001 same, the above tried to expand with one_operand_p and flattened selector.
20002 If that didn't work, retry without one_operand_p; we succeeded with that
20003 during testing. */
20004 if (two_args && d.one_operand_p)
20005 {
20006 d.one_operand_p = false;
20007 memcpy (d.perm, perm, sizeof (perm));
20008 return ix86_expand_vec_perm_const_1 (&d);
20009 }
20010
20011 return false;
20012 }
20013
20014 void
20015 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
20016 {
20017 struct expand_vec_perm_d d;
20018 unsigned i, nelt;
20019
20020 d.target = targ;
20021 d.op0 = op0;
20022 d.op1 = op1;
20023 d.vmode = GET_MODE (targ);
20024 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20025 d.one_operand_p = false;
20026 d.testing_p = false;
20027
20028 for (i = 0; i < nelt; ++i)
20029 d.perm[i] = i * 2 + odd;
20030
20031 /* We'll either be able to implement the permutation directly... */
20032 if (expand_vec_perm_1 (&d))
20033 return;
20034
20035 /* ... or we use the special-case patterns. */
20036 expand_vec_perm_even_odd_1 (&d, odd);
20037 }
20038
20039 static void
20040 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
20041 {
20042 struct expand_vec_perm_d d;
20043 unsigned i, nelt, base;
20044 bool ok;
20045
20046 d.target = targ;
20047 d.op0 = op0;
20048 d.op1 = op1;
20049 d.vmode = GET_MODE (targ);
20050 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20051 d.one_operand_p = false;
20052 d.testing_p = false;
20053
20054 base = high_p ? nelt / 2 : 0;
20055 for (i = 0; i < nelt / 2; ++i)
20056 {
20057 d.perm[i * 2] = i + base;
20058 d.perm[i * 2 + 1] = i + base + nelt;
20059 }
20060
20061 /* Note that for AVX this isn't one instruction. */
20062 ok = ix86_expand_vec_perm_const_1 (&d);
20063 gcc_assert (ok);
20064 }
20065
20066 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20067 under TARGET_AVX512BW. i.e. for v16qi a * b, it has
20068
20069 vpmovzxbw ymm2, xmm0
20070 vpmovzxbw ymm3, xmm1
20071 vpmullw ymm4, ymm2, ymm3
20072 vpmovwb xmm0, ymm4
20073
20074 it would take less instructions than ix86_expand_vecop_qihi.
20075 Return true if success. */
20076
20077 bool
20078 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
20079 {
20080 machine_mode himode, qimode = GET_MODE (dest);
20081 rtx hop1, hop2, hdest;
20082 rtx (*gen_extend)(rtx, rtx);
20083 rtx (*gen_truncate)(rtx, rtx);
20084
20085 /* There's no V64HImode multiplication instruction. */
20086 if (qimode == E_V64QImode)
20087 return false;
20088
20089 /* vpmovwb only available under AVX512BW. */
20090 if (!TARGET_AVX512BW)
20091 return false;
20092 if ((qimode == V8QImode || qimode == V16QImode)
20093 && !TARGET_AVX512VL)
20094 return false;
20095 /* Not generate zmm instruction when prefer 128/256 bit vector width. */
20096 if (qimode == V32QImode
20097 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
20098 return false;
20099
20100 switch (qimode)
20101 {
20102 case E_V8QImode:
20103 himode = V8HImode;
20104 gen_extend = gen_zero_extendv8qiv8hi2;
20105 gen_truncate = gen_truncv8hiv8qi2;
20106 break;
20107 case E_V16QImode:
20108 himode = V16HImode;
20109 gen_extend = gen_zero_extendv16qiv16hi2;
20110 gen_truncate = gen_truncv16hiv16qi2;
20111 break;
20112 case E_V32QImode:
20113 himode = V32HImode;
20114 gen_extend = gen_zero_extendv32qiv32hi2;
20115 gen_truncate = gen_truncv32hiv32qi2;
20116 break;
20117 default:
20118 gcc_unreachable ();
20119 }
20120
20121 hop1 = gen_reg_rtx (himode);
20122 hop2 = gen_reg_rtx (himode);
20123 hdest = gen_reg_rtx (himode);
20124 emit_insn (gen_extend (hop1, op1));
20125 emit_insn (gen_extend (hop2, op2));
20126 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
20127 hop1, hop2)));
20128 emit_insn (gen_truncate (dest, hdest));
20129 return true;
20130 }
20131
20132 /* Expand a vector operation shift by constant for a V*QImode in terms of the
20133 same operation on V*HImode. Return true if success. */
20134 bool
20135 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20136 {
20137 machine_mode qimode, himode;
20138 HOST_WIDE_INT and_constant, xor_constant;
20139 HOST_WIDE_INT shift_amount;
20140 rtx vec_const_and, vec_const_xor;
20141 rtx tmp, op1_subreg;
20142 rtx (*gen_shift) (rtx, rtx, rtx);
20143 rtx (*gen_and) (rtx, rtx, rtx);
20144 rtx (*gen_xor) (rtx, rtx, rtx);
20145 rtx (*gen_sub) (rtx, rtx, rtx);
20146
20147 /* Only optimize shift by constant. */
20148 if (!CONST_INT_P (op2))
20149 return false;
20150
20151 qimode = GET_MODE (dest);
20152 shift_amount = INTVAL (op2);
20153 /* Do nothing when shift amount greater equal 8. */
20154 if (shift_amount > 7)
20155 return false;
20156
20157 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
20158 /* Record sign bit. */
20159 xor_constant = 1 << (8 - shift_amount - 1);
20160
20161 /* Zero upper/lower bits shift from left/right element. */
20162 and_constant
20163 = (code == ASHIFT ? 256 - (1 << shift_amount)
20164 : (1 << (8 - shift_amount)) - 1);
20165
20166 switch (qimode)
20167 {
20168 case V16QImode:
20169 himode = V8HImode;
20170 gen_shift =
20171 ((code == ASHIFT)
20172 ? gen_ashlv8hi3
20173 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
20174 gen_and = gen_andv16qi3;
20175 gen_xor = gen_xorv16qi3;
20176 gen_sub = gen_subv16qi3;
20177 break;
20178 case V32QImode:
20179 himode = V16HImode;
20180 gen_shift =
20181 ((code == ASHIFT)
20182 ? gen_ashlv16hi3
20183 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
20184 gen_and = gen_andv32qi3;
20185 gen_xor = gen_xorv32qi3;
20186 gen_sub = gen_subv32qi3;
20187 break;
20188 case V64QImode:
20189 himode = V32HImode;
20190 gen_shift =
20191 ((code == ASHIFT)
20192 ? gen_ashlv32hi3
20193 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
20194 gen_and = gen_andv64qi3;
20195 gen_xor = gen_xorv64qi3;
20196 gen_sub = gen_subv64qi3;
20197 break;
20198 default:
20199 gcc_unreachable ();
20200 }
20201
20202 tmp = gen_reg_rtx (himode);
20203 vec_const_and = gen_reg_rtx (qimode);
20204 op1_subreg = lowpart_subreg (himode, op1, qimode);
20205
20206 /* For ASHIFT and LSHIFTRT, perform operation like
20207 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20208 vpand %vec_const_and, %dest. */
20209 emit_insn (gen_shift (tmp, op1_subreg, op2));
20210 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
20211 emit_move_insn (vec_const_and,
20212 ix86_build_const_vector (qimode, true,
20213 gen_int_mode (and_constant, QImode)));
20214 emit_insn (gen_and (dest, dest, vec_const_and));
20215
20216 /* For ASHIFTRT, perform extra operation like
20217 vpxor %vec_const_xor, %dest, %dest
20218 vpsubb %vec_const_xor, %dest, %dest */
20219 if (code == ASHIFTRT)
20220 {
20221 vec_const_xor = gen_reg_rtx (qimode);
20222 emit_move_insn (vec_const_xor,
20223 ix86_build_const_vector (qimode, true,
20224 gen_int_mode (xor_constant, QImode)));
20225 emit_insn (gen_xor (dest, dest, vec_const_xor));
20226 emit_insn (gen_sub (dest, dest, vec_const_xor));
20227 }
20228 return true;
20229 }
20230
20231 /* Expand a vector operation CODE for a V*QImode in terms of the
20232 same operation on V*HImode. */
20233
20234 void
20235 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20236 {
20237 machine_mode qimode = GET_MODE (dest);
20238 machine_mode himode;
20239 rtx (*gen_il) (rtx, rtx, rtx);
20240 rtx (*gen_ih) (rtx, rtx, rtx);
20241 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
20242 struct expand_vec_perm_d d;
20243 bool ok, full_interleave;
20244 bool uns_p = false;
20245 int i;
20246
20247 switch (qimode)
20248 {
20249 case E_V16QImode:
20250 himode = V8HImode;
20251 gen_il = gen_vec_interleave_lowv16qi;
20252 gen_ih = gen_vec_interleave_highv16qi;
20253 break;
20254 case E_V32QImode:
20255 himode = V16HImode;
20256 gen_il = gen_avx2_interleave_lowv32qi;
20257 gen_ih = gen_avx2_interleave_highv32qi;
20258 break;
20259 case E_V64QImode:
20260 himode = V32HImode;
20261 gen_il = gen_avx512bw_interleave_lowv64qi;
20262 gen_ih = gen_avx512bw_interleave_highv64qi;
20263 break;
20264 default:
20265 gcc_unreachable ();
20266 }
20267
20268 op2_l = op2_h = op2;
20269 switch (code)
20270 {
20271 case MULT:
20272 /* Unpack data such that we've got a source byte in each low byte of
20273 each word. We don't care what goes into the high byte of each word.
20274 Rather than trying to get zero in there, most convenient is to let
20275 it be a copy of the low byte. */
20276 op2_l = gen_reg_rtx (qimode);
20277 op2_h = gen_reg_rtx (qimode);
20278 emit_insn (gen_il (op2_l, op2, op2));
20279 emit_insn (gen_ih (op2_h, op2, op2));
20280
20281 op1_l = gen_reg_rtx (qimode);
20282 op1_h = gen_reg_rtx (qimode);
20283 emit_insn (gen_il (op1_l, op1, op1));
20284 emit_insn (gen_ih (op1_h, op1, op1));
20285 full_interleave = qimode == V16QImode;
20286 break;
20287
20288 case ASHIFT:
20289 case LSHIFTRT:
20290 uns_p = true;
20291 /* FALLTHRU */
20292 case ASHIFTRT:
20293 op1_l = gen_reg_rtx (himode);
20294 op1_h = gen_reg_rtx (himode);
20295 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
20296 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
20297 full_interleave = true;
20298 break;
20299 default:
20300 gcc_unreachable ();
20301 }
20302
20303 /* Perform the operation. */
20304 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
20305 1, OPTAB_DIRECT);
20306 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
20307 1, OPTAB_DIRECT);
20308 gcc_assert (res_l && res_h);
20309
20310 /* Merge the data back into the right place. */
20311 d.target = dest;
20312 d.op0 = gen_lowpart (qimode, res_l);
20313 d.op1 = gen_lowpart (qimode, res_h);
20314 d.vmode = qimode;
20315 d.nelt = GET_MODE_NUNITS (qimode);
20316 d.one_operand_p = false;
20317 d.testing_p = false;
20318
20319 if (full_interleave)
20320 {
20321 /* For SSE2, we used an full interleave, so the desired
20322 results are in the even elements. */
20323 for (i = 0; i < d.nelt; ++i)
20324 d.perm[i] = i * 2;
20325 }
20326 else
20327 {
20328 /* For AVX, the interleave used above was not cross-lane. So the
20329 extraction is evens but with the second and third quarter swapped.
20330 Happily, that is even one insn shorter than even extraction.
20331 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20332 always first from the first and then from the second source operand,
20333 the index bits above the low 4 bits remains the same.
20334 Thus, for d.nelt == 32 we want permutation
20335 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20336 and for d.nelt == 64 we want permutation
20337 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20338 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20339 for (i = 0; i < d.nelt; ++i)
20340 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20341 }
20342
20343 ok = ix86_expand_vec_perm_const_1 (&d);
20344 gcc_assert (ok);
20345
20346 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20347 gen_rtx_fmt_ee (code, qimode, op1, op2));
20348 }
20349
20350 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
20351 if op is CONST_VECTOR with all odd elements equal to their
20352 preceding element. */
20353
20354 static bool
20355 const_vector_equal_evenodd_p (rtx op)
20356 {
20357 machine_mode mode = GET_MODE (op);
20358 int i, nunits = GET_MODE_NUNITS (mode);
20359 if (GET_CODE (op) != CONST_VECTOR
20360 || nunits != CONST_VECTOR_NUNITS (op))
20361 return false;
20362 for (i = 0; i < nunits; i += 2)
20363 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
20364 return false;
20365 return true;
20366 }
20367
20368 void
20369 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
20370 bool uns_p, bool odd_p)
20371 {
20372 machine_mode mode = GET_MODE (op1);
20373 machine_mode wmode = GET_MODE (dest);
20374 rtx x;
20375 rtx orig_op1 = op1, orig_op2 = op2;
20376
20377 if (!nonimmediate_operand (op1, mode))
20378 op1 = force_reg (mode, op1);
20379 if (!nonimmediate_operand (op2, mode))
20380 op2 = force_reg (mode, op2);
20381
20382 /* We only play even/odd games with vectors of SImode. */
20383 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
20384
20385 /* If we're looking for the odd results, shift those members down to
20386 the even slots. For some cpus this is faster than a PSHUFD. */
20387 if (odd_p)
20388 {
20389 /* For XOP use vpmacsdqh, but only for smult, as it is only
20390 signed. */
20391 if (TARGET_XOP && mode == V4SImode && !uns_p)
20392 {
20393 x = force_reg (wmode, CONST0_RTX (wmode));
20394 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
20395 return;
20396 }
20397
20398 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
20399 if (!const_vector_equal_evenodd_p (orig_op1))
20400 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
20401 x, NULL, 1, OPTAB_DIRECT);
20402 if (!const_vector_equal_evenodd_p (orig_op2))
20403 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
20404 x, NULL, 1, OPTAB_DIRECT);
20405 op1 = gen_lowpart (mode, op1);
20406 op2 = gen_lowpart (mode, op2);
20407 }
20408
20409 if (mode == V16SImode)
20410 {
20411 if (uns_p)
20412 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
20413 else
20414 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
20415 }
20416 else if (mode == V8SImode)
20417 {
20418 if (uns_p)
20419 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
20420 else
20421 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
20422 }
20423 else if (uns_p)
20424 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
20425 else if (TARGET_SSE4_1)
20426 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
20427 else
20428 {
20429 rtx s1, s2, t0, t1, t2;
20430
20431 /* The easiest way to implement this without PMULDQ is to go through
20432 the motions as if we are performing a full 64-bit multiply. With
20433 the exception that we need to do less shuffling of the elements. */
20434
20435 /* Compute the sign-extension, aka highparts, of the two operands. */
20436 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20437 op1, pc_rtx, pc_rtx);
20438 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20439 op2, pc_rtx, pc_rtx);
20440
20441 /* Multiply LO(A) * HI(B), and vice-versa. */
20442 t1 = gen_reg_rtx (wmode);
20443 t2 = gen_reg_rtx (wmode);
20444 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
20445 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
20446
20447 /* Multiply LO(A) * LO(B). */
20448 t0 = gen_reg_rtx (wmode);
20449 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
20450
20451 /* Combine and shift the highparts into place. */
20452 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
20453 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
20454 1, OPTAB_DIRECT);
20455
20456 /* Combine high and low parts. */
20457 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
20458 return;
20459 }
20460 emit_insn (x);
20461 }
20462
20463 void
20464 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
20465 bool uns_p, bool high_p)
20466 {
20467 machine_mode wmode = GET_MODE (dest);
20468 machine_mode mode = GET_MODE (op1);
20469 rtx t1, t2, t3, t4, mask;
20470
20471 switch (mode)
20472 {
20473 case E_V4SImode:
20474 t1 = gen_reg_rtx (mode);
20475 t2 = gen_reg_rtx (mode);
20476 if (TARGET_XOP && !uns_p)
20477 {
20478 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
20479 shuffle the elements once so that all elements are in the right
20480 place for immediate use: { A C B D }. */
20481 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
20482 const1_rtx, GEN_INT (3)));
20483 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
20484 const1_rtx, GEN_INT (3)));
20485 }
20486 else
20487 {
20488 /* Put the elements into place for the multiply. */
20489 ix86_expand_vec_interleave (t1, op1, op1, high_p);
20490 ix86_expand_vec_interleave (t2, op2, op2, high_p);
20491 high_p = false;
20492 }
20493 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
20494 break;
20495
20496 case E_V8SImode:
20497 /* Shuffle the elements between the lanes. After this we
20498 have { A B E F | C D G H } for each operand. */
20499 t1 = gen_reg_rtx (V4DImode);
20500 t2 = gen_reg_rtx (V4DImode);
20501 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
20502 const0_rtx, const2_rtx,
20503 const1_rtx, GEN_INT (3)));
20504 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
20505 const0_rtx, const2_rtx,
20506 const1_rtx, GEN_INT (3)));
20507
20508 /* Shuffle the elements within the lanes. After this we
20509 have { A A B B | C C D D } or { E E F F | G G H H }. */
20510 t3 = gen_reg_rtx (V8SImode);
20511 t4 = gen_reg_rtx (V8SImode);
20512 mask = GEN_INT (high_p
20513 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20514 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20515 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
20516 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
20517
20518 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
20519 break;
20520
20521 case E_V8HImode:
20522 case E_V16HImode:
20523 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
20524 uns_p, OPTAB_DIRECT);
20525 t2 = expand_binop (mode,
20526 uns_p ? umul_highpart_optab : smul_highpart_optab,
20527 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20528 gcc_assert (t1 && t2);
20529
20530 t3 = gen_reg_rtx (mode);
20531 ix86_expand_vec_interleave (t3, t1, t2, high_p);
20532 emit_move_insn (dest, gen_lowpart (wmode, t3));
20533 break;
20534
20535 case E_V16QImode:
20536 case E_V32QImode:
20537 case E_V32HImode:
20538 case E_V16SImode:
20539 case E_V64QImode:
20540 t1 = gen_reg_rtx (wmode);
20541 t2 = gen_reg_rtx (wmode);
20542 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20543 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20544
20545 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20546 break;
20547
20548 default:
20549 gcc_unreachable ();
20550 }
20551 }
20552
20553 void
20554 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20555 {
20556 rtx res_1, res_2, res_3, res_4;
20557
20558 res_1 = gen_reg_rtx (V4SImode);
20559 res_2 = gen_reg_rtx (V4SImode);
20560 res_3 = gen_reg_rtx (V2DImode);
20561 res_4 = gen_reg_rtx (V2DImode);
20562 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20563 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20564
20565 /* Move the results in element 2 down to element 1; we don't care
20566 what goes in elements 2 and 3. Then we can merge the parts
20567 back together with an interleave.
20568
20569 Note that two other sequences were tried:
20570 (1) Use interleaves at the start instead of psrldq, which allows
20571 us to use a single shufps to merge things back at the end.
20572 (2) Use shufps here to combine the two vectors, then pshufd to
20573 put the elements in the correct order.
20574 In both cases the cost of the reformatting stall was too high
20575 and the overall sequence slower. */
20576
20577 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20578 const0_rtx, const2_rtx,
20579 const0_rtx, const0_rtx));
20580 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20581 const0_rtx, const2_rtx,
20582 const0_rtx, const0_rtx));
20583 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20584
20585 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20586 }
20587
20588 void
20589 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20590 {
20591 machine_mode mode = GET_MODE (op0);
20592 rtx t1, t2, t3, t4, t5, t6;
20593
20594 if (TARGET_AVX512DQ && mode == V8DImode)
20595 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20596 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20597 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20598 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20599 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20600 else if (TARGET_XOP && mode == V2DImode)
20601 {
20602 /* op1: A,B,C,D, op2: E,F,G,H */
20603 op1 = gen_lowpart (V4SImode, op1);
20604 op2 = gen_lowpart (V4SImode, op2);
20605
20606 t1 = gen_reg_rtx (V4SImode);
20607 t2 = gen_reg_rtx (V4SImode);
20608 t3 = gen_reg_rtx (V2DImode);
20609 t4 = gen_reg_rtx (V2DImode);
20610
20611 /* t1: B,A,D,C */
20612 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20613 GEN_INT (1),
20614 GEN_INT (0),
20615 GEN_INT (3),
20616 GEN_INT (2)));
20617
20618 /* t2: (B*E),(A*F),(D*G),(C*H) */
20619 emit_insn (gen_mulv4si3 (t2, t1, op2));
20620
20621 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20622 emit_insn (gen_xop_phadddq (t3, t2));
20623
20624 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20625 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20626
20627 /* Multiply lower parts and add all */
20628 t5 = gen_reg_rtx (V2DImode);
20629 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20630 gen_lowpart (V4SImode, op1),
20631 gen_lowpart (V4SImode, op2)));
20632 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20633 }
20634 else
20635 {
20636 machine_mode nmode;
20637 rtx (*umul) (rtx, rtx, rtx);
20638
20639 if (mode == V2DImode)
20640 {
20641 umul = gen_vec_widen_umult_even_v4si;
20642 nmode = V4SImode;
20643 }
20644 else if (mode == V4DImode)
20645 {
20646 umul = gen_vec_widen_umult_even_v8si;
20647 nmode = V8SImode;
20648 }
20649 else if (mode == V8DImode)
20650 {
20651 umul = gen_vec_widen_umult_even_v16si;
20652 nmode = V16SImode;
20653 }
20654 else
20655 gcc_unreachable ();
20656
20657
20658 /* Multiply low parts. */
20659 t1 = gen_reg_rtx (mode);
20660 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20661
20662 /* Shift input vectors right 32 bits so we can multiply high parts. */
20663 t6 = GEN_INT (32);
20664 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20665 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20666
20667 /* Multiply high parts by low parts. */
20668 t4 = gen_reg_rtx (mode);
20669 t5 = gen_reg_rtx (mode);
20670 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20671 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20672
20673 /* Combine and shift the highparts back. */
20674 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20675 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20676
20677 /* Combine high and low parts. */
20678 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20679 }
20680
20681 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20682 gen_rtx_MULT (mode, op1, op2));
20683 }
20684
20685 /* Return 1 if control tansfer instruction INSN
20686 should be encoded with notrack prefix. */
20687
20688 bool
20689 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20690 {
20691 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20692 return false;
20693
20694 if (CALL_P (insn))
20695 {
20696 rtx call = get_call_rtx_from (insn);
20697 gcc_assert (call != NULL_RTX);
20698 rtx addr = XEXP (call, 0);
20699
20700 /* Do not emit 'notrack' if it's not an indirect call. */
20701 if (MEM_P (addr)
20702 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20703 return false;
20704 else
20705 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20706 }
20707
20708 if (JUMP_P (insn) && !flag_cet_switch)
20709 {
20710 rtx target = JUMP_LABEL (insn);
20711 if (target == NULL_RTX || ANY_RETURN_P (target))
20712 return false;
20713
20714 /* Check the jump is a switch table. */
20715 rtx_insn *label = as_a<rtx_insn *> (target);
20716 rtx_insn *table = next_insn (label);
20717 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20718 return false;
20719 else
20720 return true;
20721 }
20722 return false;
20723 }
20724
20725 /* Calculate integer abs() using only SSE2 instructions. */
20726
20727 void
20728 ix86_expand_sse2_abs (rtx target, rtx input)
20729 {
20730 machine_mode mode = GET_MODE (target);
20731 rtx tmp0, tmp1, x;
20732
20733 switch (mode)
20734 {
20735 case E_V2DImode:
20736 case E_V4DImode:
20737 /* For 64-bit signed integer X, with SSE4.2 use
20738 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20739 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20740 32 and use logical instead of arithmetic right shift (which is
20741 unimplemented) and subtract. */
20742 if (TARGET_SSE4_2)
20743 {
20744 tmp0 = gen_reg_rtx (mode);
20745 tmp1 = gen_reg_rtx (mode);
20746 emit_move_insn (tmp1, CONST0_RTX (mode));
20747 if (mode == E_V2DImode)
20748 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20749 else
20750 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20751 }
20752 else
20753 {
20754 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20755 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20756 - 1), NULL, 0, OPTAB_DIRECT);
20757 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20758 }
20759
20760 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20761 NULL, 0, OPTAB_DIRECT);
20762 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20763 target, 0, OPTAB_DIRECT);
20764 break;
20765
20766 case E_V4SImode:
20767 /* For 32-bit signed integer X, the best way to calculate the absolute
20768 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20769 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20770 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20771 NULL, 0, OPTAB_DIRECT);
20772 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20773 NULL, 0, OPTAB_DIRECT);
20774 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20775 target, 0, OPTAB_DIRECT);
20776 break;
20777
20778 case E_V8HImode:
20779 /* For 16-bit signed integer X, the best way to calculate the absolute
20780 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20781 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20782
20783 x = expand_simple_binop (mode, SMAX, tmp0, input,
20784 target, 0, OPTAB_DIRECT);
20785 break;
20786
20787 case E_V16QImode:
20788 /* For 8-bit signed integer X, the best way to calculate the absolute
20789 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20790 as SSE2 provides the PMINUB insn. */
20791 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20792
20793 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20794 target, 0, OPTAB_DIRECT);
20795 break;
20796
20797 default:
20798 gcc_unreachable ();
20799 }
20800
20801 if (x != target)
20802 emit_move_insn (target, x);
20803 }
20804
20805 /* Expand an extract from a vector register through pextr insn.
20806 Return true if successful. */
20807
20808 bool
20809 ix86_expand_pextr (rtx *operands)
20810 {
20811 rtx dst = operands[0];
20812 rtx src = operands[1];
20813
20814 unsigned int size = INTVAL (operands[2]);
20815 unsigned int pos = INTVAL (operands[3]);
20816
20817 if (SUBREG_P (dst))
20818 {
20819 /* Reject non-lowpart subregs. */
20820 if (SUBREG_BYTE (dst) > 0)
20821 return false;
20822 dst = SUBREG_REG (dst);
20823 }
20824
20825 if (SUBREG_P (src))
20826 {
20827 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20828 src = SUBREG_REG (src);
20829 }
20830
20831 switch (GET_MODE (src))
20832 {
20833 case E_V16QImode:
20834 case E_V8HImode:
20835 case E_V4SImode:
20836 case E_V2DImode:
20837 case E_V1TImode:
20838 {
20839 machine_mode srcmode, dstmode;
20840 rtx d, pat;
20841
20842 if (!int_mode_for_size (size, 0).exists (&dstmode))
20843 return false;
20844
20845 switch (dstmode)
20846 {
20847 case E_QImode:
20848 if (!TARGET_SSE4_1)
20849 return false;
20850 srcmode = V16QImode;
20851 break;
20852
20853 case E_HImode:
20854 if (!TARGET_SSE2)
20855 return false;
20856 srcmode = V8HImode;
20857 break;
20858
20859 case E_SImode:
20860 if (!TARGET_SSE4_1)
20861 return false;
20862 srcmode = V4SImode;
20863 break;
20864
20865 case E_DImode:
20866 gcc_assert (TARGET_64BIT);
20867 if (!TARGET_SSE4_1)
20868 return false;
20869 srcmode = V2DImode;
20870 break;
20871
20872 default:
20873 return false;
20874 }
20875
20876 /* Reject extractions from misaligned positions. */
20877 if (pos & (size-1))
20878 return false;
20879
20880 if (GET_MODE (dst) == dstmode)
20881 d = dst;
20882 else
20883 d = gen_reg_rtx (dstmode);
20884
20885 /* Construct insn pattern. */
20886 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20887 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20888
20889 /* Let the rtl optimizers know about the zero extension performed. */
20890 if (dstmode == QImode || dstmode == HImode)
20891 {
20892 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20893 d = gen_lowpart (SImode, d);
20894 }
20895
20896 emit_insn (gen_rtx_SET (d, pat));
20897
20898 if (d != dst)
20899 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20900 return true;
20901 }
20902
20903 default:
20904 return false;
20905 }
20906 }
20907
20908 /* Expand an insert into a vector register through pinsr insn.
20909 Return true if successful. */
20910
20911 bool
20912 ix86_expand_pinsr (rtx *operands)
20913 {
20914 rtx dst = operands[0];
20915 rtx src = operands[3];
20916
20917 unsigned int size = INTVAL (operands[1]);
20918 unsigned int pos = INTVAL (operands[2]);
20919
20920 if (SUBREG_P (dst))
20921 {
20922 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20923 dst = SUBREG_REG (dst);
20924 }
20925
20926 switch (GET_MODE (dst))
20927 {
20928 case E_V16QImode:
20929 case E_V8HImode:
20930 case E_V4SImode:
20931 case E_V2DImode:
20932 case E_V1TImode:
20933 {
20934 machine_mode srcmode, dstmode;
20935 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20936 rtx d;
20937
20938 if (!int_mode_for_size (size, 0).exists (&srcmode))
20939 return false;
20940
20941 switch (srcmode)
20942 {
20943 case E_QImode:
20944 if (!TARGET_SSE4_1)
20945 return false;
20946 dstmode = V16QImode;
20947 pinsr = gen_sse4_1_pinsrb;
20948 break;
20949
20950 case E_HImode:
20951 if (!TARGET_SSE2)
20952 return false;
20953 dstmode = V8HImode;
20954 pinsr = gen_sse2_pinsrw;
20955 break;
20956
20957 case E_SImode:
20958 if (!TARGET_SSE4_1)
20959 return false;
20960 dstmode = V4SImode;
20961 pinsr = gen_sse4_1_pinsrd;
20962 break;
20963
20964 case E_DImode:
20965 gcc_assert (TARGET_64BIT);
20966 if (!TARGET_SSE4_1)
20967 return false;
20968 dstmode = V2DImode;
20969 pinsr = gen_sse4_1_pinsrq;
20970 break;
20971
20972 default:
20973 return false;
20974 }
20975
20976 /* Reject insertions to misaligned positions. */
20977 if (pos & (size-1))
20978 return false;
20979
20980 if (SUBREG_P (src))
20981 {
20982 unsigned int srcpos = SUBREG_BYTE (src);
20983
20984 if (srcpos > 0)
20985 {
20986 rtx extr_ops[4];
20987
20988 extr_ops[0] = gen_reg_rtx (srcmode);
20989 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20990 extr_ops[2] = GEN_INT (size);
20991 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20992
20993 if (!ix86_expand_pextr (extr_ops))
20994 return false;
20995
20996 src = extr_ops[0];
20997 }
20998 else
20999 src = gen_lowpart (srcmode, SUBREG_REG (src));
21000 }
21001
21002 if (GET_MODE (dst) == dstmode)
21003 d = dst;
21004 else
21005 d = gen_reg_rtx (dstmode);
21006
21007 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
21008 gen_lowpart (srcmode, src),
21009 GEN_INT (1 << (pos / size))));
21010 if (d != dst)
21011 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
21012 return true;
21013 }
21014
21015 default:
21016 return false;
21017 }
21018 }
21019
21020 /* All CPUs prefer to avoid cross-lane operations so perform reductions
21021 upper against lower halves up to SSE reg size. */
21022
21023 machine_mode
21024 ix86_split_reduction (machine_mode mode)
21025 {
21026 /* Reduce lowpart against highpart until we reach SSE reg width to
21027 avoid cross-lane operations. */
21028 switch (mode)
21029 {
21030 case E_V8DImode:
21031 case E_V4DImode:
21032 return V2DImode;
21033 case E_V16SImode:
21034 case E_V8SImode:
21035 return V4SImode;
21036 case E_V32HImode:
21037 case E_V16HImode:
21038 return V8HImode;
21039 case E_V64QImode:
21040 case E_V32QImode:
21041 return V16QImode;
21042 case E_V16SFmode:
21043 case E_V8SFmode:
21044 return V4SFmode;
21045 case E_V8DFmode:
21046 case E_V4DFmode:
21047 return V2DFmode;
21048 default:
21049 return mode;
21050 }
21051 }
21052
21053 /* Generate call to __divmoddi4. */
21054
21055 void
21056 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
21057 rtx op0, rtx op1,
21058 rtx *quot_p, rtx *rem_p)
21059 {
21060 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
21061
21062 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
21063 mode, op0, mode, op1, mode,
21064 XEXP (rem, 0), Pmode);
21065 *quot_p = quot;
21066 *rem_p = rem;
21067 }
21068
21069 #include "gt-i386-expand.h"
This page took 0.962906 seconds and 5 git commands to generate.